# HS2 Tweets
### Importing Packages

In [2]:
import tweepy as tw
import pprint
import numpy as np
import nltk
import bokeh
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import re

from collections import Counter

from wordcloud import WordCloud

from bokeh.io import output_notebook, show, reset_output
from bokeh.plotting import figure
from bokeh.models import HoverTool
from bokeh.models import ColumnDataSource
from bokeh.palettes import Plasma256
from bokeh.models import Range1d

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize

from textblob import Word, TextBlob

### Twitter API Access & Set-Up

In [3]:
# The Twitter API credentials live outside the notebook in a plain-text file with one
# value per line, in this order: API key, API secret, bearer token, access token,
# access token secret.
# NOTE(review): this absolute path is machine-specific; consider moving it to a
# configurable location (or environment variables) before sharing the notebook.
with open(r"C:\Users\yeungf8452\OneDrive - ARCADIS\HS2 (Data Science)\HS2 Twitter\twitter_api.txt", "r") as credentials_file:
    # Read everything inside the context manager so the handle is closed right away.
    file = credentials_file.readlines()

# Each line still carries its trailing newline, which must not end up in the token.
api = file[0].strip('\n')
api_secret = file[1].strip('\n')
bearer_token = file[2].strip('\n')
access_token = file[3].strip('\n')
access_token_secret = file[4].strip('\n')

In [4]:
# Build an OAuth handler from the API key/secret and attach the access token pair to it.
auth = tw.OAuthHandler(api, api_secret)
auth.set_access_token(access_token, access_token_secret)
# API object created from the handler above, with wait_on_rate_limit enabled.
# NOTE(review): `api_app` is not referenced anywhere else in the visible notebook;
# the search below goes through `tw.Client` instead.
api_app = tw.API(auth, wait_on_rate_limit = True)

### Query

In [5]:
# Client authenticated with the bearer token only.
client = tw.Client(bearer_token)

# Search query: English tweets carrying either spelling of the hashtag.
# The OR group is parenthesised on purpose: in the search syntax a space (AND) binds
# tighter than OR, so without the parentheses `lang:en` would only restrict `#hs2`.
# NOTE: a function called `hashtag` is defined further down in this notebook and rebinds
# this name; re-running this cell after that definition would pass the function as query.
hashtag = "(#HS2 OR #hs2) lang:en"

# Extra fields requested for each tweet / user / place; they are needed to get details,
# otherwise only the default fields come back.
tweet_fields = ['context_annotations', 'created_at', 'entities', 'geo', 'public_metrics']
# user fields: https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/user
user_fields = ['location', 'public_metrics']
expansions = ['geo.place_id', 'author_id', 'referenced_tweets.id']
# place fields: https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/place
place_fields = ['name', 'geo']
max_results = 10

# For search options: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
# Only can use operators that are "core"
# More examples: https://github.com/twitterdev/getting-started-with-the-twitter-api-v2-for-academic-research/blob/main/modules/5-how-to-write-search-queries.md

# Use pagination if need more than 100 tweets
# query = tw.Paginator(client.search_recent_tweets, "HS2", max_results=10, limit=3)
# Use .flatten when looking through data if using paginator
# https://docs.tweepy.org/en/stable/pagination.html
# https://developer.twitter.com/en/docs/twitter-api/pagination

query = client.search_recent_tweets(
    query=hashtag,
    tweet_fields=tweet_fields,
    user_fields=user_fields,
    expansions=expansions,
    place_fields=place_fields,
    max_results=max_results,
)

# fields are necessary in queries to get details

### Tweet Information

In [28]:
# Column order of the tweet table. Passing it explicitly keeps the schema stable even
# when the search returns nothing, so later cells can still address the columns.
tweet_columns = ['Author ID:', 'Tweet:', 'Time Created', 'Tweet_ID',
                 'Auto Context Annotations', 'Entities', 'Location', 'Metrics']

# One flat record per returned tweet. `query.data` is None (not an empty list) when the
# search matches nothing, hence the `or []` guard.
tweets = [
    {
        'Author ID:': tweet.author_id,
        'Tweet:': tweet.text,
        'Time Created': tweet.created_at,
        'Tweet_ID': tweet.id,
        'Auto Context Annotations': tweet.context_annotations,
        'Entities': tweet.entities,
        'Location': tweet.geo,
        'Metrics': tweet.public_metrics,
    }
    for tweet in (query.data or [])
]

tweets_df = pd.DataFrame(tweets, columns=tweet_columns)

In [62]:
# For every tweet, collect the entity names found in its context annotations.
# The column is built in one pass instead of appending through `.loc[k, ...]`, which
# relied on the frame having a default 0..n-1 index and on mutating cell values in place.
# `annotations or []` covers tweets whose annotation value is None or an empty list.
tweets_df['Auto Context Annotations Entity'] = [
    [annotation['entity']['name'] for annotation in (annotations or [])]
    for annotations in tweets_df['Auto Context Annotations']
]

In [63]:
# Entity kind -> key inside each entity dict that holds the value we want to keep.
entity_types = {'annotations':'normalized_text', 'cashtags': 'tag', 'hashtags': 'tag', 'mentions': 'username', 'urls': 'url'}

# Add one column per entity kind; each cell is the list of extracted values for that tweet.
for entity_kind, value_key in entity_types.items():
    extracted = []
    for entities in tweets_df['Entities']:
        try:
            extracted.append([item[value_key] for item in entities[entity_kind]])
        except (KeyError, TypeError):
            # KeyError: the tweet has entities, but none of this kind.
            # TypeError: the tweet has no entities at all (the cell is None / NaN).
            extracted.append([])
    tweets_df[entity_kind] = extracted

In [64]:
# Public-metric counters that get their own column in the tweet table.
metric_types = ["retweet_count", "reply_count", "like_count", "quote_count"]

# Pull each counter out of the per-tweet 'Metrics' dict into a flat column.
for metric_name in metric_types:
    tweets_df[metric_name] = [metrics[metric_name] for metrics in tweets_df['Metrics']]

## User, Geographical and Referenced Tweet Information

In [54]:
# Collect the expanded objects returned alongside the search results, keyed by kind.
# Lists (not sets) are used so the order is preserved and pandas accepts them; a kind
# that is absent from the response is stored as an empty list, so the two DataFrame
# constructions below never hit a missing key.
user_geo_reftweet = {}
for kind in ['users', 'places', 'tweets']:
    user_geo_reftweet[kind] = list(query.includes.get(kind, []))

users_df = pd.DataFrame(user_geo_reftweet['users'])
ref_tweet_df = pd.DataFrame(user_geo_reftweet['tweets'])

In [56]:
# Display the user table built from the expanded 'users' objects.
users_df

Unnamed: 0,id,location,name,public_metrics,username
0,726138001743581184,Birmingham,Mark Hipwell,"{'followers_count': 3591, 'following_count': 3...",markhipwell1990
1,1201589544522522624,,Lucifer’s darker side,"{'followers_count': 65, 'following_count': 155...",LuciferSide
2,1935426432,,MCheshireAgainstHS2,"{'followers_count': 2566, 'following_count': 3...",mcahs2
3,764758972947652609,,Mark Jonathan,"{'followers_count': 66, 'following_count': 112...",jonathan640000
4,1332991729214033921,,Disheartened But Determined💙😷,"{'followers_count': 665, 'following_count': 81...",DisheartenedD
5,14698260,London,Chris Riggs,"{'followers_count': 172, 'following_count': 12...",cleuch
6,14931578,"London, Stourbridge, Dundee",K-smudged 🖐💙,"{'followers_count': 2338, 'following_count': 5...",smudge1
7,1886333310,,OutofTweet123,"{'followers_count': 1790, 'following_count': 1...",Outoftweet123


In [63]:
# Result layout (unchanged): user_geo_reftweet[kind] = [included_objects, og_author_ids],
# two lists of equal length, where og_author_ids[n] is the author of the search result
# that included_objects[n] belongs to.
#
# The expanded objects are NOT returned in the same order as the search results (and are
# de-duplicated), so they are matched to their originating tweet by id, not by position.
user_geo_reftweet = {}

search_results = query.data or []   # None when the search matched nothing

author_ids = set()          # authors of the search results
author_by_ref_tweet = {}    # referenced tweet id -> author of the tweet referencing it
author_by_place = {}        # place id -> author of the tweet tagged with that place
for tweet in search_results:
    author_ids.add(tweet.author_id)
    for reference in (tweet.referenced_tweets or []):
        # First referencing tweet wins if several results reference the same tweet.
        author_by_ref_tweet.setdefault(reference.id, tweet.author_id)
    # NOTE(review): assumes `tweet.geo` is a dict carrying 'place_id' when present - confirm.
    place_id = (tweet.geo or {}).get('place_id')
    if place_id is not None:
        author_by_place.setdefault(place_id, tweet.author_id)

for j in ['users', 'places', 'tweets']:
    # since we cannot extract geographical location yet, 'places' is unavailable
    if j not in query.includes:
        continue

    included = list(query.includes[j])
    # pd.NA (rather than None) marks "no match" so the 64-bit ids are not coerced to float.
    if j == 'users':
        # With the `author_id` expansion an included user is the author of a result.
        og_author_list = [user.id if user.id in author_ids else pd.NA for user in included]
    elif j == 'places':
        og_author_list = [author_by_place.get(place.id, pd.NA) for place in included]
    else:
        og_author_list = [author_by_ref_tweet.get(ref_tweet.id, pd.NA) for ref_tweet in included]

    user_geo_reftweet[j] = [included, og_author_list]
        

In [68]:
# Each entry of user_geo_reftweet is a pair: [included objects, matching author ids].

# Users: one row per included user plus the author id it was paired with.
user_records, user_og_authors = user_geo_reftweet['users']
users_df = pd.DataFrame(user_records)
users_df['og_author_id'] = user_og_authors

# Referenced tweets: same layout.
ref_tweet_records, ref_tweet_og_authors = user_geo_reftweet['tweets']
ref_tweet_df = pd.DataFrame(ref_tweet_records)
ref_tweet_df['og_author_id'] = ref_tweet_og_authors

In [70]:
# Display the referenced-tweet table, including the og_author_id column.
ref_tweet_df

Unnamed: 0,author_id,context_annotations,created_at,entities,id,public_metrics,referenced_tweets,text,og_author_id
0,723045944644784128,"[{'domain': {'id': '67', 'name': 'Interests an...",2021-10-24 18:25:39+00:00,"{'mentions': [{'start': 0, 'end': 11, 'usernam...",1452340526028673046,"{'retweet_count': 0, 'reply_count': 1, 'like_c...","[(type, id)]",@EmBeck2015 @sarahb711 @BrentPoland1 @RoadsXR ...,1886333310
1,804162745,"[{'domain': {'id': '66', 'name': 'Interests an...",2021-10-24 12:28:45+00:00,"{'hashtags': [{'start': 177, 'end': 181, 'tag'...",1452250710934949907,"{'retweet_count': 5, 'reply_count': 1, 'like_c...",,Usual bodge. Euston descoped. Eastern leg slow...,14931578
2,1255546610966302721,,2021-10-24 16:30:59+00:00,"{'hashtags': [{'start': 93, 'end': 100, 'tag':...",1452311669238026248,"{'retweet_count': 12, 'reply_count': 1, 'like_...",,We're a bit overwhelmed with the number of Gre...,726138001743581184
3,270869723,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2021-10-24 14:01:01+00:00,"{'hashtags': [{'start': 45, 'end': 54, 'tag': ...",1452273928810991624,"{'retweet_count': 4, 'reply_count': 0, 'like_c...",,This is the charging cable for the first all #...,1201589544522522624
4,1151206672527962112,,2021-10-24 13:30:41+00:00,"{'hashtags': [{'start': 0, 'end': 4, 'tag': 'h...",1452266295106416644,"{'retweet_count': 2, 'reply_count': 1, 'like_c...",,#hs2 is little more than a hugely overpriced b...,14698260
5,1255546610966302721,,2021-10-24 14:38:04+00:00,"{'hashtags': [{'start': 68, 'end': 75, 'tag': ...",1452283255307350024,"{'retweet_count': 8, 'reply_count': 6, 'like_c...","[(type, id)]",Both HS2-related motions have fallen off the a...,1935426432
6,122715343,,2021-10-24 18:03:39+00:00,"{'hashtags': [{'start': 50, 'end': 54, 'tag': ...",1452334992370147339,"{'retweet_count': 4, 'reply_count': 2, 'like_c...",,The Independent says the Toton hub is dead - w...,1201589544522522624
7,1310304903471001604,,2021-10-24 17:33:41+00:00,"{'mentions': [{'start': 0, 'end': 11, 'usernam...",1452327447685443591,"{'retweet_count': 3, 'reply_count': 0, 'like_c...","[(type, id)]",@Greens4HS2 @PaulMBigland @bbcmtd @TheGreenPar...,764758972947652609
8,906082234490290176,,2021-10-24 10:05:05+00:00,"{'hashtags': [{'start': 260, 'end': 264, 'tag'...",1452214556550475777,"{'retweet_count': 89, 'reply_count': 12, 'like...",,I don't want high speed rail that will save 15...,1935426432


In [23]:
# Display the referenced tweets as a DataFrame.
# NOTE(review): the captured output below has no og_author_id column and the execution
# count is lower than the surrounding cells, so this was presumably run while
# `ref_tweet_df` was still a plain list; once it is a DataFrame the wrapper is redundant.
pd.DataFrame(ref_tweet_df)

Unnamed: 0,author_id,context_annotations,created_at,entities,id,public_metrics,referenced_tweets,text
0,723045944644784128,"[{'domain': {'id': '67', 'name': 'Interests an...",2021-10-24 18:25:39+00:00,"{'mentions': [{'start': 0, 'end': 11, 'usernam...",1452340526028673046,"{'retweet_count': 0, 'reply_count': 1, 'like_c...","[(type, id)]",@EmBeck2015 @sarahb711 @BrentPoland1 @RoadsXR ...
1,804162745,"[{'domain': {'id': '66', 'name': 'Interests an...",2021-10-24 12:28:45+00:00,"{'hashtags': [{'start': 177, 'end': 181, 'tag'...",1452250710934949907,"{'retweet_count': 5, 'reply_count': 1, 'like_c...",,Usual bodge. Euston descoped. Eastern leg slow...
2,1255546610966302721,,2021-10-24 16:30:59+00:00,"{'hashtags': [{'start': 93, 'end': 100, 'tag':...",1452311669238026248,"{'retweet_count': 12, 'reply_count': 1, 'like_...",,We're a bit overwhelmed with the number of Gre...
3,270869723,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2021-10-24 14:01:01+00:00,"{'hashtags': [{'start': 45, 'end': 54, 'tag': ...",1452273928810991624,"{'retweet_count': 4, 'reply_count': 0, 'like_c...",,This is the charging cable for the first all #...
4,1151206672527962112,,2021-10-24 13:30:41+00:00,"{'hashtags': [{'start': 0, 'end': 4, 'tag': 'h...",1452266295106416644,"{'retweet_count': 2, 'reply_count': 1, 'like_c...",,#hs2 is little more than a hugely overpriced b...
5,1255546610966302721,,2021-10-24 14:38:04+00:00,"{'hashtags': [{'start': 68, 'end': 75, 'tag': ...",1452283255307350024,"{'retweet_count': 8, 'reply_count': 6, 'like_c...","[(type, id)]",Both HS2-related motions have fallen off the a...
6,122715343,,2021-10-24 18:03:39+00:00,"{'hashtags': [{'start': 50, 'end': 54, 'tag': ...",1452334992370147339,"{'retweet_count': 4, 'reply_count': 2, 'like_c...",,The Independent says the Toton hub is dead - w...
7,1310304903471001604,,2021-10-24 17:33:41+00:00,"{'mentions': [{'start': 0, 'end': 11, 'usernam...",1452327447685443591,"{'retweet_count': 3, 'reply_count': 0, 'like_c...","[(type, id)]",@Greens4HS2 @PaulMBigland @bbcmtd @TheGreenPar...
8,906082234490290176,,2021-10-24 10:05:05+00:00,"{'hashtags': [{'start': 260, 'end': 264, 'tag'...",1452214556550475777,"{'retweet_count': 89, 'reply_count': 12, 'like...",,I don't want high speed rail that will save 15...


### User Information

In [70]:
# Per-user counters that get their own column in the user table.
users_pub_metrics = ["followers_count","following_count","tweet_count","listed_count"]

# Pull each counter out of the per-user 'public_metrics' dict into a flat column.
for metric_name in users_pub_metrics:
    users_df[metric_name] = [metrics[metric_name] for metrics in users_df['public_metrics']]

### Referenced Tweet Information

In [73]:
# Entity kind -> key inside each entity dict that holds the value we want to keep.
entity_types = {'annotations':'normalized_text', 'cashtags': 'tag', 'hashtags': 'tag', 'mentions': 'username', 'urls': 'url'}

# Add one column per entity kind; each cell is the list of extracted values for that
# referenced tweet.
for entity_kind, value_key in entity_types.items():
    extracted = []
    for entities in ref_tweet_df['entities']:
        try:
            extracted.append([item[value_key] for item in entities[entity_kind]])
        except (KeyError, TypeError):
            # KeyError: the tweet has entities, but none of this kind.
            # TypeError: the tweet has no entities at all (the cell is None / NaN).
            extracted.append([])
    ref_tweet_df[entity_kind] = extracted

In [75]:
# Public-metric counters that get their own column in the referenced-tweet table.
metric_types = ["retweet_count", "reply_count", "like_count", "quote_count"]

# Pull each counter out of the per-tweet 'public_metrics' dict into a flat column.
for metric_name in metric_types:
    ref_tweet_df[metric_name] = [metrics[metric_name] for metrics in ref_tweet_df['public_metrics']]

In [117]:
# For each referenced tweet, record the id of the first tweet it references itself.
# 0 is the "no reference" sentinel.
id_list = []
for referenced in ref_tweet_df['referenced_tweets']:
    try:
        id_list.append(referenced[0].id)
    except (TypeError, IndexError):
        # TypeError: the cell is missing (None / NaN) and cannot be indexed.
        # IndexError: the cell is an empty list.
        id_list.append(0)
ref_tweet_df['ref_id'] = id_list

### Tweet, User, Geography and Referenced Tweet Information Merged

In [126]:
# Attach the author's user record to every tweet (inner join on the author id).
tweets_users_df = pd.merge(tweets_df, users_df, left_on='Author ID:', right_on='id')

# Nested raw columns already flattened into their own columns, plus the duplicate join key.
redundant_columns = ['Auto Context Annotations', 'Entities', 'Metrics', 'public_metrics', 'id']
tweets_users_data = tweets_users_df.drop(columns=redundant_columns)

### Text Preprocessing

In [152]:
# Extra tokens to drop on top of NLTK's English stop words ('rt' marks a retweet).
other_words = ['rt']
stop_words = stopwords.words('english') + other_words

# Word-character tokenizer.
# NOTE(review): `tokenizer` is not referenced anywhere else in the visible notebook.
tokenizer = RegexpTokenizer(r'\w+')

In [155]:
# Compiled once at definition time instead of on every call.
# The final catch-all range of the original pattern (U+24C2 .. U+1F251) also removed every
# script that happens to sit in between (CJK, Hangul, Arabic presentation forms, ...).
# It is replaced by the symbol-only blocks it was meant to cover.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U00002BFF"  # enclosed alphanumerics .. misc symbols and arrows (symbol blocks only)
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "\U0001F000-\U0001F251"  # mahjong/dominoes/cards, enclosed alphanumeric & ideographic supplements
    "\U00003030\U0000303D\U00003297\U00003299"  # the few emoji that live in CJK symbol blocks
    "]+",
    flags=re.UNICODE,
)


def deEmojify(text):
    """Return `text` with emoji and pictographic symbols removed.

    Letters of any script, digits, whitespace and ordinary punctuation are left
    untouched; each run of emoji characters is simply deleted (nothing is inserted
    in its place, so surrounding spaces are preserved as they were).

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The cleaned string.
    """
    return _EMOJI_PATTERN.sub('', text)

In [165]:
def preprocessing_tweets(tweets):
    """Normalise raw tweet texts for sentiment analysis and word counting.

    For every tweet: emoji are removed, the text is split on whitespace, and each
    token is lower-cased and stripped of every character that is not a word
    character, '@' or an apostrophe. Tokens that are links, stop words or user
    handles are dropped; the remaining tokens are lemmatized and joined back with
    single spaces.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.

    Returns
    -------
    list of str
        One cleaned string per input tweet, in the same order.
    """
    # Raw string: the original non-raw literal relied on invalid escape sequences.
    non_word = re.compile(r"[^\w@']")

    processed_tweets = []
    for tweet in tweets:
        each_tweet = []
        # removing all emojis, then splitting with spaces
        for token in deEmojify(tweet).split():
            # lowercase, then remove all non-word characters
            lowered = non_word.sub('', token.lower())

            # A token made only of punctuation collapses to '', which is not a word.
            if not lowered:
                continue

            # remove all links, stopwords and retweet handles
            # NOTE(review): the 'nauguration' filter looks like a leftover from another
            # analysis; kept so existing results do not change - confirm it is wanted.
            if ('http' in lowered or '@' in lowered or 'nauguration' in lowered
                    or lowered in stop_words):
                continue

            # lemmatize all values
            each_tweet.append(Word(lowered).lemmatize())

        processed_tweets.append(" ".join(each_tweet))
    return processed_tweets

def hashtag(tweets):
    """Collect the hashtags used in `tweets`, excluding the HS2 search hashtags.

    A token counts as a hashtag when it contains '#'. Tokens containing 'hs2' in any
    casing are skipped, since those are the search terms themselves. The '#'
    characters are removed from the returned values; any other punctuation attached
    to the token is kept as-is.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.

    Returns
    -------
    list of str
        Hashtag texts in order of appearance (duplicates are kept).
    """
    return [
        token.replace('#', '')
        for tweet in tweets
        for token in tweet.split()
        if '#' in token and 'hs2' not in token.lower()
    ]

In [168]:
# Sentiment of every cleaned tweet; index 0 of the result is the polarity and index 1
# the subjectivity.
sentiments = [
    TextBlob(cleaned_text).sentiment
    for cleaned_text in preprocessing_tweets(list(tweets_df['Tweet:']))
]
polarity = [sentiment[0] for sentiment in sentiments]
subjectivity = [sentiment[1] for sentiment in sentiments]

In [171]:
# Average sentiment over the (non-HS2) hashtags found in the tweets.
hashtag_sentiments = [TextBlob(tag).sentiment for tag in hashtag(list(tweets_df['Tweet:']))]
hashtag_polarity = np.mean([sentiment[0] for sentiment in hashtag_sentiments])
hashtag_subjectivity = np.mean([sentiment[1] for sentiment in hashtag_sentiments])

print("Polarity of Hashtags: " + str(round(hashtag_polarity, 4)))
print("Subjectivity of Hashtags: " + str(round(hashtag_subjectivity,4)))

Polarity of Hashtags: 0.0
Subjectivity of Hashtags: 0.0


In [None]:
# Polarity of 1 means positive, -1 means negative
# Subjectivity of 1 means very subjective, 0 means non-subjective

# Both helpers expect plain tweet texts. `tweets` is the list of per-tweet dicts, so the
# text column of the DataFrame is used instead.
tweet_texts = list(tweets_df['Tweet:'])

# Per-tweet sentiment (index 0 = polarity, index 1 = subjectivity), computed once per tweet.
tweet_sentiments = [TextBlob(cleaned_text).sentiment for cleaned_text in preprocessing_tweets(tweet_texts)]
polarity = [sentiment[0] for sentiment in tweet_sentiments]
subjectivity = [sentiment[1] for sentiment in tweet_sentiments]

# Average sentiment over the hashtags found in the tweets.
hashtag_sentiments = [TextBlob(tag).sentiment for tag in hashtag(tweet_texts)]
hashtag_polarity = np.mean([sentiment[0] for sentiment in hashtag_sentiments])
hashtag_subjectivity = np.mean([sentiment[1] for sentiment in hashtag_sentiments])

print("Polarity of Hashtags: " + str(round(hashtag_polarity, 4)))
print("Subjectivity of Hashtags: " + str(round(hashtag_subjectivity,4)))

In [177]:
# Render Bokeh output inline in the notebook.
output_notebook()

# One point per tweet: polarity on the x axis, subjectivity on the y axis.
x_scatter = polarity
y_scatter = subjectivity

# Scatter plot of the per-tweet sentiment, with a hover tool attached.
# NOTE(review): `plot_width` / `plot_height` are the names used by older Bokeh releases;
# newer releases expect `width` / `height` - confirm against the installed version.
scatter_plot = figure(plot_width=500, plot_height=300, x_axis_label='Polarity', y_axis_label='Subjectivity')
scatter_plot.circle(x_scatter, y_scatter, size=5, line_color='navy', fill_color='gray', fill_alpha=0.5)
scatter_plot.add_tools(HoverTool())
show(scatter_plot)

In [1]:
# Flatten the cleaned tweets into one list of words.
# `preprocessing_tweets` expects the tweet texts, not the list of per-tweet dicts held in
# `tweets`, so the text column of the DataFrame is passed.
words = []
for cleaned_text in preprocessing_tweets(list(tweets_df['Tweet:'])):
    words.extend(cleaned_text.split())

# Word frequency table, most frequent first. The Counter is built once and reused.
word_counts = Counter(words)
words_df = pd.DataFrame({
    'word': list(word_counts.keys()),
    'count': list(word_counts.values()),
})
words_df = words_df.sort_values(by=['count'], ascending = False)

NameError: name 'preprocessing_tweets' is not defined

In [132]:
# Bar chart of the most frequent words (at most 30).
top_words = words_df.head(30)

p = figure(x_range=list(top_words['word']), plot_height=350)

# One random palette colour per bar. The sample size follows the number of bars, so the
# chart still renders when fewer than 30 distinct words are available.
p.vbar(x=top_words['word'], top=top_words['count'],
       width=0.9, line_color='white',
       fill_color=random.sample(Plasma256, len(top_words)))

p.xaxis.major_label_orientation = "vertical"
p.add_tools(HoverTool())
# Scale the y axis to the data (10% headroom) instead of a fixed upper bound, which either
# flattened small samples or clipped large ones. Falls back to 1 when there are no words.
p.y_range = Range1d(0, top_words['count'].max() * 1.1 if len(top_words) else 1)
p.title.text="Word Counts"
p.title.text_font_size = "25px"
p.title.align = 'center'
p.xaxis.axis_label = 'Word'
p.yaxis.axis_label = 'Count'

show(p)

'RT @Greens4HS2: What a day at #GPConf! 🎉\n\nIt was great to see (and actually speak to!) so many open-minded Greens who listened to the argum…'

# Resources
https://dev.to/twitterdev/a-comprehensive-guide-for-using-the-twitter-api-v2-using-tweepy-in-python-15d9