# HS2 Tweets
### Importing Packages

In [57]:
import tweepy as tw
import pprint
import numpy as np
import nltk
import bokeh
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from collections import Counter

from wordcloud import WordCloud

from bokeh.io import output_notebook, show, reset_output
from bokeh.plotting import figure
from bokeh.models import HoverTool
from bokeh.models import ColumnDataSource
from bokeh.palettes import Plasma256
from bokeh.models import Range1d

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize

from textblob import Word, TextBlob

### Twitter API Access & Set-Up

In [58]:
# Load the Twitter API credentials from a local text file.
# The code expects five lines, in this order: API key, API secret, bearer
# token, access token, access token secret.
# NOTE(review): this is an absolute, machine-specific path; consider reading the
# location from an environment variable so the notebook runs elsewhere.
# The `with` block closes the file handle as soon as the lines are read.
with open(r"C:\Users\yeungf8452\OneDrive - ARCADIS\HS2 (Data Science)\HS2 Twitter\twitter_api.txt", "r") as credentials_file:
    file = credentials_file.readlines()

# Strip the trailing newline from each of the first five lines and bind them
# positionally to the credential names.
api, api_secret, bearer_token, access_token, access_token_secret = (
    line.strip('\n') for line in file[:5]
)

In [59]:
# Build an OAuth handler from the API key/secret, attach the user access-token
# pair, and wrap it in a tweepy API object.
auth = tw.OAuthHandler(api, api_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit is switched on; per the tweepy docs this makes the client
# wait for the rate limit to replenish instead of failing.
# NOTE(review): api_app is not referenced by any other cell visible here.
api_app = tw.API(auth, wait_on_rate_limit = True)

### Query

In [60]:
# Client built from the bearer token; used for the recent-search request below.
client = tw.Client(bearer_token)

# Search options: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
# Only operators marked "core" can be used.
# More examples: https://github.com/twitterdev/getting-started-with-the-twitter-api-v2-for-academic-research/blob/main/modules/5-how-to-write-search-queries.md
#
# A space is an implicit AND and binds tighter than OR, so the hashtag
# alternatives are parenthesised; without the parentheses `lang:en` would only
# constrain the second hashtag.
hashtag = "(#HS2 OR #hs2) lang:en"

# Fields must be requested explicitly in the query to get these details back.
tweet_fields = ['context_annotations', 'created_at', 'entities', 'geo', 'public_metrics']
# user fields: https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/user
user_fields = ['location', 'public_metrics']
expansions = ['geo.place_id', 'author_id', 'referenced_tweets.id']
# place fields: https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/place
place_fields = ['name', 'geo']
max_results = 10

# Use pagination if more than 100 tweets are needed:
# query = tw.Paginator(client.search_recent_tweets, "HS2", max_results=10, limit=3)
# Use .flatten when looking through the data if using the paginator.
# https://docs.tweepy.org/en/stable/pagination.html
# https://developer.twitter.com/en/docs/twitter-api/pagination

query = client.search_recent_tweets(
    query=hashtag,
    tweet_fields=tweet_fields,
    user_fields=user_fields,
    expansions=expansions,
    place_fields=place_fields,
    max_results=max_results,
)

### Tweet Information

In [61]:
# Flatten every tweet returned by the search into a plain record (one dict per
# tweet) and load the records into a DataFrame.
# NOTE(review): presumably query.data is None when the search matches nothing,
# in which case this loop would fail; confirm against tweepy's Response.
tweets = []
for tweet in query.data:
    record = {
        'Author ID:': tweet.author_id,
        'Tweet:': tweet.text,
        'Time Created': tweet.created_at,
        'Tweet_ID': tweet.id,
        'Auto Context Annotations': tweet.context_annotations,
        'Entities': tweet.entities,
        'Location': tweet.geo,
        'Metrics': tweet.public_metrics,
    }
    tweets.append(record)

tweets_df = pd.DataFrame(tweets)

In [62]:
# For each tweet, collect the entity name of every context annotation into a
# list; tweets without annotations get an empty list.
tweets_df['Auto Context Annotations Entity'] = [
    [annotation['entity']['name'] for annotation in annotations]
    for annotations in tweets_df['Auto Context Annotations']
]

In [63]:
# Maps each entity type in a tweet's `entities` object to the field that is
# extracted from every item of that type.
entity_types = {'annotations':'normalized_text', 'cashtags': 'tag', 'hashtags': 'tag', 'mentions': 'username', 'urls': 'url'}

# Add one column per entity type, holding the list of extracted values for
# each tweet (an empty list when the tweet has none of that type).
for entity_type, field in entity_types.items():
    entity_list = []
    for entities in tweets_df['Entities']:
        try:
            entity_list.append([item[field] for item in entities[entity_type]])
        except (KeyError, TypeError):
            # KeyError: this entity type (or the field) is absent for the tweet.
            # TypeError: the tweet has no entities object at all (not a dict),
            # which previously aborted the whole cell.
            entity_list.append([])
    tweets_df[entity_type] = entity_list

In [64]:
metric_types = ["retweet_count", "reply_count", "like_count", "quote_count"]

# Promote each count inside the per-tweet 'Metrics' dict to its own column.
# The column is built in a single pass; no placeholder column is needed first.
for metric in metric_types:
    tweets_df[metric] = [metrics[metric] for metrics in tweets_df['Metrics']]

## User, Geographical and Referenced Tweet Information

In [139]:
# query.data is a list of tweets, so indexing it with a string raises TypeError;
# the expanded user objects are under the 'users' key of query.includes.
query.includes['users']

In [65]:
# Collect the expanded objects returned alongside the tweets. A key that is
# missing from the response maps to an empty list, so the DataFrame
# constructions below no longer raise KeyError; lists (rather than sets) keep
# the rows in the order the API returned them.
user_geo_reftweet = {
    key: list(query.includes.get(key, []))
    for key in ['users', 'places', 'tweets']
}

# 'places' is collected but not turned into a DataFrame, since we cannot
# extract geographical location yet.
users_df = pd.DataFrame(user_geo_reftweet['users'])
ref_tweet_df = pd.DataFrame(user_geo_reftweet['tweets'])

In [145]:
# Print every expanded user object, one per line.
included_users = query.includes['users']
for user in included_users:
    print(user)

MonifiethU
gareth0108
paulsandham2
LuciferSide
rh52d
Me_Scuba_Steve
FJEB88
RandomRailways
DavidMartinCSP


### User Information

In [70]:
users_pub_metrics = ["followers_count","following_count","tweet_count","listed_count"]

# Promote each count inside the per-user 'public_metrics' dict to its own
# column. The column is built in a single pass; no placeholder column is
# needed first.
for metric in users_pub_metrics:
    users_df[metric] = [metrics[metric] for metrics in users_df['public_metrics']]

### Referenced Tweet Information

In [73]:
# Maps each entity type in a tweet's `entities` object to the field that is
# extracted from every item of that type.
entity_types = {'annotations':'normalized_text', 'cashtags': 'tag', 'hashtags': 'tag', 'mentions': 'username', 'urls': 'url'}

# Add one column per entity type to the referenced tweets, holding the list of
# extracted values for each row (an empty list when there are none).
for entity_type, field in entity_types.items():
    entity_list = []
    for entities in ref_tweet_df['entities']:
        try:
            entity_list.append([item[field] for item in entities[entity_type]])
        except (KeyError, TypeError):
            # KeyError: this entity type (or the field) is absent for the tweet.
            # TypeError: the row has no entities object (e.g. a missing value
            # rather than a dict), which previously aborted the whole cell.
            entity_list.append([])
    ref_tweet_df[entity_type] = entity_list

In [75]:
metric_types = ["retweet_count", "reply_count", "like_count", "quote_count"]

# Promote each count inside the per-tweet 'public_metrics' dict to its own
# column. The column is built in a single pass; no placeholder column is
# needed first.
for metric in metric_types:
    ref_tweet_df[metric] = [metrics[metric] for metrics in ref_tweet_df['public_metrics']]

In [117]:
# For each referenced tweet, record the id of the first tweet it references
# itself, using 0 as the sentinel when it references nothing.
id_list = []
for referenced in ref_tweet_df['referenced_tweets']:
    try:
        id_list.append(referenced[0].id)
    except (TypeError, IndexError):
        # TypeError: the cell holds a missing value rather than a list.
        # IndexError: the list of references is empty.
        id_list.append(0)
ref_tweet_df['ref_id'] = id_list

### Tweet, User, Geography and Referenced Tweet Information Merged

In [126]:
# Attach each author's user record to their tweets by matching the tweet's
# author id to the user id.
tweets_users_df = pd.merge(tweets_df, users_df, left_on='Author ID:', right_on='id')

# Drop the raw nested columns (already expanded into flat columns) and the
# user id that duplicates 'Author ID:'.
redundant_columns = ['Auto Context Annotations', 'Entities', 'Metrics', 'public_metrics', 'id']
tweets_users_data = tweets_users_df.drop(redundant_columns, axis='columns')

In [136]:
# Display the merged tweet + user table.
tweets_users_data

Unnamed: 0,Author ID:,Tweet:,Time Created,Tweet_ID,Location,Auto Context Annotations Entity,annotations,cashtags,hashtags,mentions,...,reply_count,like_count,quote_count,location,name,username,followers_count,following_count,tweet_count,listed_count
0,1397467234000379904,RT @RichardWellings: Ministers have effectivel...,2021-10-22 17:07:45+00:00,1451596148586786819,,[],[HS2],[],[],[RichardWellings],...,0,0,0,"Monifieth, Scotland UK",Tommy,MonifiethU,775,1047,29771,2
1,445452143,RT @PDeeley: Just heard today a local farmer h...,2021-10-22 17:04:50+00:00,1451595412587130885,,[],[150acres],[],[HS2],[PDeeley],...,0,0,0,exiting the brexiting,Gareth Evans Gammonologist 🕷💙,gareth0108,4540,4643,281489,19
2,4392337120,RT @PDeeley: Just heard today a local farmer h...,2021-10-22 17:04:28+00:00,1451595321776160772,,[],[150acres],[],[HS2],[PDeeley],...,0,0,0,,Paul Sandham,paulsandham2,1809,4672,31100,4
3,1201589544522522624,RT @HS2ltd: Watch a #timelapse video of how HS...,2021-10-22 17:02:40+00:00,1451594865926619149,,[],[],[],"[timelapse, Birmingham]",[HS2ltd],...,0,0,0,,Lucifer’s darker side,LuciferSide,65,155,8135,1
4,723045944644784128,@NigelSarbutts @dasy2k1 @Ceri_turns @Greens4HS...,2021-10-22 16:56:47+00:00,1451593386151006209,,[Travel],[],[],[HS2],"[NigelSarbutts, dasy2k1, Ceri_turns, Greens4HS...",...,1,0,0,"North East, England",Ryan Hogg,rh52d,256,1629,5167,1
5,723045944644784128,RT @Ceri_turns: @Greens4HS2 Excellent explanat...,2021-10-22 16:54:26+00:00,1451592797081976832,,[],[],[],"[HS2, GPConf, GPC21]","[Ceri_turns, Greens4HS2, CBGreenSteve]",...,0,0,0,"North East, England",Ryan Hogg,rh52d,256,1629,5167,1
6,19509724,@HS2ltd being sued by Siemens over “unlawful” ...,2021-10-22 16:54:12+00:00,1451592738789629979,,"[B2B, Services, Siemens]",[Siemens],[],[HS2],[HS2ltd],...,0,0,0,Planet Earth - probably!,Only Me 🇬🇧,Me_Scuba_Steve,418,354,14862,17
7,546746195,RT @RichardWellings: Ministers have effectivel...,2021-10-22 16:41:40+00:00,1451589583284707332,,[],[HS2],[],[],[RichardWellings],...,0,0,0,,FidelmaBack,FJEB88,1314,973,263242,1
8,833381839423299585,RT @Greens4HS2: What a day at #GPConf! 🎉\n\nIt...,2021-10-22 16:35:39+00:00,1451588069036105772,,[],[],[],[GPConf],[Greens4HS2],...,0,0,0,On a train/behind a camera,Random Railways,RandomRailways,1002,1173,13017,7
9,1667387035,RT @PDeeley: Just heard today a local farmer h...,2021-10-22 16:33:02+00:00,1451587410505211911,,[],[150acres],[],[HS2],[PDeeley],...,0,0,0,"Buckinghamshire, UK.",David Martin,DavidMartinCSP,2699,2682,40186,83


In [133]:
# Display the referenced-tweet table, including the derived entity, metric and
# ref_id columns.
ref_tweet_df

Unnamed: 0,author_id,created_at,entities,geo,id,public_metrics,referenced_tweets,text,annotations,cashtags,hashtags,mentions,urls,retweet_count,reply_count,like_count,quote_count,ref_id
0,3996146667,2021-10-22 11:32:11+00:00,"{'hashtags': [{'start': 56, 'end': 60, 'tag': ...",,1451511698901319680,"{'retweet_count': 2, 'reply_count': 2, 'like_c...",,@Greens4HS2 Excellent explanation from @CBGree...,[],[],"[HS2, GPConf, GPC21]","[Greens4HS2, CBGreenSteve]",[],2,2,20,0,0
1,71280219,2021-10-22 11:12:17+00:00,"{'hashtags': [{'start': 215, 'end': 219, 'tag'...",,1451506690663321604,"{'retweet_count': 33, 'reply_count': 7, 'like_...",,Ministers have effectively written a blank che...,[HS2],[],[HS2],[],[https://t.co/0jVzP02yjc],33,7,47,3,0
2,16855573,2021-10-22 15:12:12+00:00,"{'mentions': [{'start': 0, 'end': 8, 'username...",,1451567066826293253,"{'retweet_count': 0, 'reply_count': 1, 'like_c...","[(type, id)]",@dasy2k1 @Ceri_turns @Greens4HS2 @CBGreenSteve...,[],[],[],"[dasy2k1, Ceri_turns, Greens4HS2, CBGreenSteve]",[],0,1,0,0,1451564873675124743
3,1255546610966302721,2021-10-22 15:40:06+00:00,"{'hashtags': [{'start': 14, 'end': 21, 'tag': ...",,1451574090817589254,"{'retweet_count': 7, 'reply_count': 1, 'like_c...","[(type, id)]",What a day at #GPConf! 🎉\n\nIt was great to se...,[],[],"[GPConf, HS2, GPC21]",[],[https://t.co/cqkv4jqba6],7,1,33,0,1451513534358110209
4,270869723,2021-10-22 16:00:43+00:00,"{'hashtags': [{'start': 8, 'end': 18, 'tag': '...",,1451579277791698960,"{'retweet_count': 1, 'reply_count': 1, 'like_c...",,Watch a #timelapse video of how HS2 contractor...,[Curzon Street Station],[],"[timelapse, Birmingham, HS2]",[],"[https://t.co/8kDSiPxlHa, https://t.co/Fyril4d...",1,1,17,0,0
5,263160690,2021-10-22 16:04:34+00:00,"{'hashtags': [{'start': 87, 'end': 91, 'tag': ...",{'place_id': '0580e33c2cf8e35c'},1451580247896465426,"{'retweet_count': 5, 'reply_count': 0, 'like_c...",,Just heard today a local farmer has had 150acr...,[150acres],[],"[HS2, HS2, HS2]",[],[],5,0,3,0,0


In [129]:
# Show the full text of the referenced tweet at index label 0.
ref_tweet_df['text'][0]

'@Greens4HS2 Excellent explanation from @CBGreenSteve on #HS2 at #GPConf #GPC21 comparing it to building motorways, freeing up capacity on local routes. Counter-argument that this induces increased capacity is wierd, as we want to increase rail use instead of road!'

In [127]:
# List the columns of the merged tweet + user table.
# TODO: merge with ref_tweet_df via .merge(ref_tweet_df, left_on=...) once the
# join key has been decided.
tweets_users_data.columns

Index(['Author ID:', 'Tweet:', 'Time Created', 'Tweet_ID', 'Location',
       'Auto Context Annotations Entity', 'annotations', 'cashtags',
       'hashtags', 'mentions', 'urls', 'retweet_count', 'reply_count',
       'like_count', 'quote_count', 'location', 'name', 'username',
       'followers_count', 'following_count', 'tweet_count', 'listed_count'],
      dtype='object')

In [132]:
# Show the full text of the tweet at index label 8 in the merged table.
tweets_users_data['Tweet:'][8]

'RT @Greens4HS2: What a day at #GPConf! 🎉\n\nIt was great to see (and actually speak to!) so many open-minded Greens who listened to the argum…'

# Resources
https://dev.to/twitterdev/a-comprehensive-guide-for-using-the-twitter-api-v2-using-tweepy-in-python-15d9