# HS2 Tweets
### Importing Packages

In [1]:
import tweepy as tw
import pprint
import numpy as np
import nltk
import bokeh
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from collections import Counter

from wordcloud import WordCloud

from bokeh.io import output_notebook, show, reset_output
from bokeh.plotting import figure
from bokeh.models import HoverTool
from bokeh.models import ColumnDataSource
from bokeh.palettes import Plasma256
from bokeh.models import Range1d

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize

from textblob import Word, TextBlob

### Twitter API Access & Set-Up

In [2]:
file = open(r"C:\Users\yeungf8452\OneDrive - ARCADIS\HS2 (Data Science)\HS2 Twitter\twitter_api.txt", "r").readlines()

api = file[0].strip('\n')
api_secret = file[1].strip('\n')
bearer_token = file[2].strip('\n')
access_token = file[3].strip('\n')
access_token_secret = file[4].strip('\n')

In [3]:
auth = tw.OAuthHandler(api, api_secret)
auth.set_access_token(access_token, access_token_secret)
api_app = tw.API(auth, wait_on_rate_limit = True)

### Query

In [7]:
client = tw.Client(bearer_token)

hashtag = "#HS2 OR #hs2 lang:en"
tweet_fields=['context_annotations', 'created_at', 'entities', 'geo', 'public_metrics']
user_fields = ['location', 'public_metrics']
# user fields: https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/user
expansions=['geo.place_id', 'author_id', 'referenced_tweets.id']
place_fields=['name', 'geo']
max_results = 10
# place fields: https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/place

# For search options: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
# Only can use operators that are "core"
# More examples: https://github.com/twitterdev/getting-started-with-the-twitter-api-v2-for-academic-research/blob/main/modules/5-how-to-write-search-queries.md

# Use pagination if need more than 100 tweets
# query = tw.Paginator(client.search_recent_tweets, "HS2", max_results=10, limit=3)
# Use .flatten when looking through data if using paginator
# https://docs.tweepy.org/en/stable/pagination.html
# https://developer.twitter.com/en/docs/twitter-api/pagination

query = client.search_recent_tweets(query=hashtag, tweet_fields = tweet_fields, user_fields = user_fields, expansions = expansions, place_fields = place_fields, max_results=max_results)

# fields are necessary in queries to get details

In [8]:
tweets = [{'Author ID:' : tweet.author_id, 'Tweet:' : tweet.text, 'Time Created': tweet.created_at, 'Tweet_ID': tweet.id, 'Auto Context Annotations':tweet.context_annotations, 'Entities': tweet.entities, 
'Location':tweet.geo, 'Metrics':tweet.public_metrics} for tweet in query.data]

tweets_df = pd.DataFrame(tweets)

In [10]:
tweets_df['Auto Context Annotations Entity'] = np.empty((len(tweets_df), 0)).tolist()

for k, i in enumerate(tweets_df['Auto Context Annotations']):
    for j in range(len(i)):
        entity = i[j]['entity']['name']
        tweets_df.loc[k, 'Auto Context Annotations Entity'].append(entity)

In [11]:
entity_types = {'annotations':'normalized_text', 'cashtags': 'tag', 'hashtags': 'tag', 'mentions': 'username', 'urls': 'url'}
for i in entity_types.keys():
    tweets_df[i] = np.empty((len(tweets_df), 0)).tolist()
    entity_list = []
    for j, k in enumerate(tweets_df['Entities']):
        try:
            list_of_dicts = k[i]
            entity_list.append([a_dict[entity_types[i]] for a_dict in k[i]])
        except KeyError:
            entity_list.append([])
    tweets_df[i] = entity_list

In [12]:
metric_types = ["retweet_count", "reply_count", "like_count", "quote_count"]

for i in metric_types:
    tweets_df[i] = np.empty((len(tweets_df), 0)).tolist()
    metric_list = []
    for j in (tweets_df['Metrics']):
        metric_list.append(j[i])
    tweets_df[i] = metric_list

In [13]:
user_geo_reftweet = {}
for i in ['users','places','tweets']:
    try:
        parameter = query.includes[i]
        user_geo_reftweet[i] = {u for u in parameter}
    except KeyError:
        continue

# since we cannot extract geographical location yet
users_df = pd.DataFrame(user_geo_reftweet['users'])
ref_tweet_df = pd.DataFrame(user_geo_reftweet['tweets'])

In [31]:
tweets_df

Unnamed: 0,Author ID:,Tweet:,Time Created,Tweet_ID,Auto Context Annotations,Entities,Location,Metrics,Auto Context Annotations Entity,annotations,cashtags,hashtags,mentions,urls,retweet_count,reply_count,like_count,quote_count
0,804162745,RT @NTI26637228: #HS2 Lets be realistic if Bor...,2021-10-22 10:15:54+00:00,1451492502805090304,[],"{'hashtags': [{'start': 17, 'end': 21, 'tag': ...",,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",[],"[Boris, Schapps]",[],[HS2],[NTI26637228],[],1,0,0,0
1,1146327188893032448,#HS2 Lets be realistic if Boris &amp; Schapps ...,2021-10-22 10:14:19+00:00,1451492101800202248,[],"{'hashtags': [{'start': 0, 'end': 4, 'tag': 'H...",,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",[],"[Boris, Schapps]",[],[HS2],[],[],1,0,1,0
2,1003934161646866437,Guardian Goalposts are HS2 ready with our bran...,2021-10-22 10:10:03+00:00,1451491028293259283,"[{'domain': {'id': '66', 'name': 'Interests an...","{'hashtags': [{'start': 201, 'end': 205, 'tag'...",,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[Construction],[],[],"[HSE, RND, GS6, Rail, HS2, Construction, Safet...",[],"[https://t.co/osMAVKntd0, https://t.co/BZZiWSk...",0,0,0,0
3,1168244168344440832,@frances_cutler @NP_Partnership And #HS2 will ...,2021-10-22 10:07:08+00:00,1451490294730461192,[],"{'hashtags': [{'start': 36, 'end': 40, 'tag': ...",,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",[],[],[],[HS2],"[frances_cutler, NP_Partnership]",[],0,1,0,0
4,804162745,@NP_Partnership You should take heed of this. ...,2021-10-22 10:05:30+00:00,1451489885148286980,[],"{'hashtags': [{'start': 105, 'end': 109, 'tag'...",,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",[],[North],[],[HS2],[NP_Partnership],[https://t.co/1xLpBCpxQl],0,0,0,0
5,16855573,@rh52d @BrentPoland1 @FirstCelticFire That is ...,2021-10-22 10:04:32+00:00,1451489640481951753,"[{'domain': {'id': '65', 'name': 'Interests an...","{'hashtags': [{'start': 251, 'end': 255, 'tag'...",,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","[Travel, General Travel, Adventure travel, Tra...",[],[],[HS2],"[rh52d, BrentPoland1, FirstCelticFire]",[],0,0,0,0
6,257392953,RT @mcahs2: Not much love for #HS2 in the nort...,2021-10-22 10:03:26+00:00,1451489366342242323,[],"{'hashtags': [{'start': 30, 'end': 34, 'tag': ...",,"{'retweet_count': 12, 'reply_count': 0, 'like_...",[],"[Anne Cherry, Trevor Bavage]",[],[HS2],[mcahs2],[],12,0,0,0
7,462353864,Find out how you can join the #HS2 family and ...,2021-10-22 10:00:37+00:00,1451488654614990862,"[{'domain': {'id': '45', 'name': 'Brand Vertic...","{'hashtags': [{'start': 30, 'end': 34, 'tag': ...",,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","[Transportation, Railway transport, Careers, H...",[],[],"[HS2, JobSearch, JobsInConstruction]",[HS2ltd],"[https://t.co/bRX2Wdlt71, https://t.co/n2aBg9V...",0,0,0,0
8,284632871,"RT @mcahs2: They either, don't know what they ...",2021-10-22 10:00:09+00:00,1451488537048756224,[],"{'hashtags': [{'start': 93, 'end': 97, 'tag': ...",,"{'retweet_count': 3, 'reply_count': 0, 'like_c...",[],[],[],[HS2],[mcahs2],[https://t.co/xEgRoqbyVv],3,0,0,0
9,284632871,RT @mcahs2: Schrodinger's eastern leg. #HS2,2021-10-22 09:59:45+00:00,1451488436016267265,[],"{'hashtags': [{'start': 39, 'end': 43, 'tag': ...",,"{'retweet_count': 2, 'reply_count': 0, 'like_c...",[],[],[],[HS2],[mcahs2],[],2,0,0,0


In [40]:
ref_tweet_df['referenced_tweets'][0][0].id

1451475272272957467

In [30]:
ref_tweet_df['entities'][0]

{'hashtags': [{'start': 66, 'end': 70, 'tag': 'HS2'},
  {'start': 102, 'end': 116, 'tag': 'ReduceDriving'},
  {'start': 121, 'end': 134, 'tag': 'ReduceFlying'},
  {'start': 151, 'end': 168, 'tag': 'DomesticAviation'},
  {'start': 217, 'end': 228, 'tag': 'UKRailways'},
  {'start': 249, 'end': 260, 'tag': 'ModalShift'},
  {'start': 269, 'end': 282, 'tag': 'RoadBuilding'},
  {'start': 314, 'end': 324, 'tag': 'HSRFuture'}],
 'mentions': [{'start': 0,
   'end': 14,
   'username': 'NigelSarbutts',
   'id': '16855573'},
  {'start': 15,
   'end': 28,
   'username': 'BrentPoland1',
   'id': '1310304903471001604'},
  {'start': 29, 'end': 45, 'username': 'FirstCelticFire', 'id': '1172860423'}]}

In [48]:
users_pub_metrics = ["followers_count","following_count","tweet_count","listed_count"]

for i in users_pub_metrics:
    users_df[i] = np.empty((len(users_df), 0)).tolist()
    metric_list = []
    for j in (users_df['public_metrics']):
        metric_list.append(j[i])
    users_df[i] = metric_list

In [53]:
tweets_users_df = tweets_df.merge(users_df, left_on = 'Author ID:', right_on = 'id')
tweets_users_data = tweets_users_df.drop(columns = ['Auto Context Annotations', 'Entities', 'Metrics', 'public_metrics', 'Author ID:', 'id'])

In [58]:
print(tweets_users_data['Tweet:'][1])

RT @Hs2Rebellion: £2.8bn #HS2 contract 'shrouded in mystery', Siemens lawsuit claims. #HS2 bosses accused by Siemens Mobility of serious fa…


In [231]:
user_geo

{'users': {<User id=540672851 name=Laz, Lazarou Monkiest of Terrors 3.5%🏳️‍🌈💙🚀 username=FrancisMaudeAdv>,
  <User id=1187038420645924866 name=George Nugent Travels username=GeorgeNTravels>,
  <User id=2198370810 name=Jane Braybrook 🦊🌳🌱 username=JaneBraybrook1>,
  <User id=848996189584449537 name=Margaret username=Margaretb2107>,
  <User id=103576640 name=Carol H Scott username=MsCarolHScott>,
  <User id=1283540598427983872 name=Richard Blaber username=RMBlaber56>,
  <User id=1402276468013608963 name=Robert James 7.0👌🇬🇧🇬🇬⚒️👌 username=RobertJ52942435>,
  <User id=88545686 name=Pop Quiz! 👻🕸️📝 username=SeriousBismuth>,
  <User id=1422975858797465603 name=moiibnkknub username=moibnkjaaaa>}}

In [217]:
users = {u for u in query.includes["users"]}
#places = {p["id"]: p for p in query.includes["places"]}
users

{<User id=1131876443539902464 name=ขนิฏฐา username=KiN49NMOQOdEyws>,
 <User id=228385326 name=WORLIFTS                             #thinksafety username=Lifting_Equip>,
 <User id=1300924825905451008 name=หมดศรัทธา username=PCock45555111>,
 <User id=284162333 name=Rob Horgan username=Robbiehorgs>,
 <User id=377957668 name=Sarah Burgess username=SarahABurgess>,
 <User id=1411965418298920964 name=Yet Another Ban 2 username=YetAnotherBan21>,
 <User id=391081429 name=alicia pivaro username=aliciapivaro>,
 <User id=1231173747308138496 name=bryan mclean username=bryanmc75292461>,
 <User id=804162745 name=Frances Cutler username=frances_cutler>,
 <User id=3363298269 name=viv holliday ✉ username=holliday_viv>}

In [207]:
query.includes["users"]

[<User id=284162333 name=Rob Horgan username=Robbiehorgs>,
 <User id=3363298269 name=viv holliday ✉ username=holliday_viv>,
 <User id=391081429 name=alicia pivaro username=aliciapivaro>,
 <User id=1300924825905451008 name=หมดศรัทธา username=PCock45555111>,
 <User id=228385326 name=WORLIFTS                             #thinksafety username=Lifting_Equip>,
 <User id=804162745 name=Frances Cutler username=frances_cutler>,
 <User id=377957668 name=Sarah Burgess username=SarahABurgess>,
 <User id=1231173747308138496 name=bryan mclean username=bryanmc75292461>,
 <User id=1131876443539902464 name=ขนิฏฐา username=KiN49NMOQOdEyws>,
 <User id=1411965418298920964 name=Yet Another Ban 2 username=YetAnotherBan21>]

In [195]:
for tweet in query.data:
    if users[tweet.author_id]:
        print(tweet.author_id)
        user = users[tweet.author_id]
        print(user.location)

284162333
London
3363298269
Can sometimes be found indoors. Yorkshire
391081429
London
1300924825905451008
Thailand
228385326
Wildmoor, Worcs.  B61 0QU
804162745
None
377957668
sotogrande, spain
1231173747308138496
None
1131876443539902464
จ.นนทบุรี, ประเทศไทย
1411965418298920964
None


# Resources
https://dev.to/twitterdev/a-comprehensive-guide-for-using-the-twitter-api-v2-using-tweepy-in-python-15d9