# HS2 Tweets
### Importing Packages

In [1]:
import tweepy as tw
import pprint
import numpy as np
import nltk
import bokeh
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import re

from collections import Counter

from wordcloud import WordCloud

from bokeh.io import output_notebook, show, reset_output
from bokeh.plotting import figure
from bokeh.models import HoverTool
from bokeh.models import ColumnDataSource
from bokeh.palettes import Plasma256
from bokeh.models import Range1d

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize

from textblob import Word, TextBlob

### Twitter API Access & Set-Up

In [2]:
file = open(r"C:\Users\yeungf8452\OneDrive - ARCADIS\HS2 (Data Science)\HS2 Twitter\twitter_api.txt", "r").readlines()

api = file[0].strip('\n')
api_secret = file[1].strip('\n')
bearer_token = file[2].strip('\n')
access_token = file[3].strip('\n')
access_token_secret = file[4].strip('\n')

In [3]:
auth = tw.OAuthHandler(api, api_secret)
auth.set_access_token(access_token, access_token_secret)
api_app = tw.API(auth, wait_on_rate_limit = True)

### Query

In [4]:
client = tw.Client(bearer_token)

hashtag = "#HS2 OR #hs2 lang:en"
tweet_fields=['context_annotations', 'created_at', 'entities', 'geo', 'public_metrics', 'referenced_tweets']
user_fields = ['location', 'public_metrics']
# user fields: https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/user
expansions=['geo.place_id', 'author_id', 'referenced_tweets.id']
place_fields=['name', 'geo']
max_results = 10
# place fields: https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/place

# For search options: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
# Only can use operators that are "core"
# More examples: https://github.com/twitterdev/getting-started-with-the-twitter-api-v2-for-academic-research/blob/main/modules/5-how-to-write-search-queries.md

# Use pagination if need more than 100 tweets
# query = tw.Paginator(client.search_recent_tweets, "HS2", max_results=10, limit=3)
# Use .flatten when looking through data if using paginator
# https://docs.tweepy.org/en/stable/pagination.html
# https://developer.twitter.com/en/docs/twitter-api/pagination

query = client.search_recent_tweets(query=hashtag, tweet_fields = tweet_fields, user_fields = user_fields, expansions = expansions, place_fields = place_fields, max_results=max_results)

# fields are necessary in queries to get details

### Tweet Information

In [5]:
tweets = [{'Author ID' : tweet.author_id, 'Tweet' : tweet.text, 'Time Created': tweet.created_at, 'Tweet_ID': tweet.id, 'Auto Context Annotations':tweet.context_annotations, 'Entities': tweet.entities, 
'Location':tweet.geo, 'Metrics':tweet.public_metrics, 'Ref Tweets':tweet.referenced_tweets} for tweet in query.data]

tweets_df = pd.DataFrame(tweets)

In [8]:
tweets_df['Auto Context Annotations Entity'] = np.empty((len(tweets_df), 0)).tolist()

for k, i in enumerate(tweets_df['Auto Context Annotations']):
    for j in range(len(i)):
        entity = i[j]['entity']['name']
        tweets_df.loc[k, 'Auto Context Annotations Entity'].append(entity)

In [9]:
tweets_df.head()

Unnamed: 0,Author ID,Tweet,Time Created,Tweet_ID,Auto Context Annotations,Entities,Location,Metrics,Ref Tweets,Auto Context Annotations Entity
0,4105532853,@Dawn4GBYT Could be a deliberate attempt to de...,2021-11-22 15:11:27+00:00,1462800903409446919,"[{'domain': {'id': '10', 'name': 'Person', 'de...","{'urls': [{'start': 283, 'end': 306, 'url': 'h...",,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","[(type, id)]",[Gina Miller]
1,481108679,@helenhalcrow @LouiseAKennedy @EdwardJDavey @S...,2021-11-22 15:10:31+00:00,1462800667366539268,[],"{'mentions': [{'start': 0, 'end': 13, 'usernam...",,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","[(type, id)]",[]
2,21776645,RT @CBItweets: “To solve this country’s transp...,2021-11-22 15:08:58+00:00,1462800280173510664,[],"{'mentions': [{'start': 3, 'end': 13, 'usernam...",,"{'retweet_count': 2, 'reply_count': 0, 'like_c...","[(type, id)]",[]
3,21776645,RT @CBItweets: “Our rail package is fantastic ...,2021-11-22 15:07:23+00:00,1462799880896663557,"[{'domain': {'id': '10', 'name': 'Person', 'de...","{'mentions': [{'start': 3, 'end': 13, 'usernam...",,"{'retweet_count': 2, 'reply_count': 0, 'like_c...","[(type, id)]","[Boris Johnson, Boris Johnson]"
4,21776645,RT @CBItweets: “It’s not a row-back – it’s bet...,2021-11-22 15:07:13+00:00,1462799836214816775,"[{'domain': {'id': '10', 'name': 'Person', 'de...","{'mentions': [{'start': 3, 'end': 13, 'usernam...",,"{'retweet_count': 2, 'reply_count': 0, 'like_c...","[(type, id)]","[Boris Johnson, Boris Johnson]"


In [10]:
entity_types = {'annotations':'normalized_text', 'cashtags': 'tag', 'hashtags': 'tag', 'mentions': 'username', 'urls': 'url'}
for i in entity_types.keys():
    tweets_df[i] = np.empty((len(tweets_df), 0)).tolist()
    entity_list = []
    for j, k in enumerate(tweets_df['Entities']):
        try:
            list_of_dicts = k[i]
            entity_list.append([a_dict[entity_types[i]] for a_dict in k[i]])
        except KeyError:
            entity_list.append([])
    tweets_df[i] = entity_list

In [11]:
metric_types = ["retweet_count", "reply_count", "like_count", "quote_count"]

for i in metric_types:
    tweets_df[i] = np.empty((len(tweets_df), 0)).tolist()
    metric_list = []
    for j in (tweets_df['Metrics']):
        metric_list.append(j[i])
    tweets_df[i] = metric_list

In [12]:
ref_tweet_id = []
ref_tweet_type = []
for i in tweets_df['Ref Tweets']:
    try:
        ref_tweet_id.append(i[0].id)
        ref_tweet_type.append(i[0].type)
    except:
        ref_tweet_id.append(0)
        ref_tweet_type.append(np.nan)
tweets_df['ref_tweet_id'] = (ref_tweet_id)
tweets_df['ref_tweet_type'] = ref_tweet_type


In [13]:
tweets_df.head()

Unnamed: 0,Author ID,Tweet,Time Created,Tweet_ID,Auto Context Annotations,Entities,Location,Metrics,Ref Tweets,Auto Context Annotations Entity,...,cashtags,hashtags,mentions,urls,retweet_count,reply_count,like_count,quote_count,ref_tweet_id,ref_tweet_type
0,4105532853,@Dawn4GBYT Could be a deliberate attempt to de...,2021-11-22 15:11:27+00:00,1462800903409446919,"[{'domain': {'id': '10', 'name': 'Person', 'de...","{'urls': [{'start': 283, 'end': 306, 'url': 'h...",,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","[(type, id)]",[Gina Miller],...,[],"[socialcare, immigration, HS2]",[Dawn4GBYT],[https://t.co/4QSiNPMkOl],0,0,0,0,1462798162905776134,quoted
1,481108679,@helenhalcrow @LouiseAKennedy @EdwardJDavey @S...,2021-11-22 15:10:31+00:00,1462800667366539268,[],"{'mentions': [{'start': 0, 'end': 13, 'usernam...",,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","[(type, id)]",[],...,[],[HS2],"[helenhalcrow, LouiseAKennedy, EdwardJDavey, S...",[],0,0,0,0,1462355628693803020,replied_to
2,21776645,RT @CBItweets: “To solve this country’s transp...,2021-11-22 15:08:58+00:00,1462800280173510664,[],"{'mentions': [{'start': 3, 'end': 13, 'usernam...",,"{'retweet_count': 2, 'reply_count': 0, 'like_c...","[(type, id)]",[],...,[],[],[CBItweets],[],2,0,0,0,1462725466788446210,retweeted
3,21776645,RT @CBItweets: “Our rail package is fantastic ...,2021-11-22 15:07:23+00:00,1462799880896663557,"[{'domain': {'id': '10', 'name': 'Person', 'de...","{'mentions': [{'start': 3, 'end': 13, 'usernam...",,"{'retweet_count': 2, 'reply_count': 0, 'like_c...","[(type, id)]","[Boris Johnson, Boris Johnson]",...,[],[],"[CBItweets, BorisJohnson]",[],2,0,0,0,1462731343566098437,retweeted
4,21776645,RT @CBItweets: “It’s not a row-back – it’s bet...,2021-11-22 15:07:13+00:00,1462799836214816775,"[{'domain': {'id': '10', 'name': 'Person', 'de...","{'mentions': [{'start': 3, 'end': 13, 'usernam...",,"{'retweet_count': 2, 'reply_count': 0, 'like_c...","[(type, id)]","[Boris Johnson, Boris Johnson]",...,[],[HS2],"[CBItweets, BorisJohnson]",[],2,0,0,0,1462731898317381638,retweeted


## User, Geographical and Reference Tweet

In [14]:
user_geo_reftweet = {}
for i in ['users','places','tweets']:
    try:
        parameter = query.includes[i]
        user_geo_reftweet[i] = {u for u in parameter}
    except KeyError:
        continue

users_df = pd.DataFrame(user_geo_reftweet['users'])
ref_tweet_df = pd.DataFrame(user_geo_reftweet['tweets'])

### User Information

In [15]:
users_pub_metrics = ["followers_count","following_count","tweet_count","listed_count"]

for i in users_pub_metrics:
    users_df[i] = np.empty((len(users_df), 0)).tolist()
    metric_list = []
    for j in (users_df['public_metrics']):
        metric_list.append(j[i])
    users_df[i] = metric_list

### Referenced Tweet Information

In [16]:
entity_types = {'annotations':'normalized_text', 'cashtags': 'tag', 'hashtags': 'tag', 'mentions': 'username', 'urls': 'url'}
for i in entity_types.keys():
    ref_tweet_df[i] = np.empty((len(ref_tweet_df), 0)).tolist()
    entity_list = []
    for j, k in enumerate(ref_tweet_df['entities']):
        try:
            list_of_dicts = k[i]
            entity_list.append([a_dict[entity_types[i]] for a_dict in k[i]])
        except KeyError:
            entity_list.append([])
    ref_tweet_df[i] = entity_list

In [17]:
metric_types = ["retweet_count", "reply_count", "like_count", "quote_count"]

for i in metric_types:
    ref_tweet_df[i] = np.empty((len(ref_tweet_df), 0)).tolist()
    metric_list = []
    for j in (ref_tweet_df['public_metrics']):
        metric_list.append(j[i])
    ref_tweet_df[i] = metric_list

In [156]:
# id_list = []
# for i, j in enumerate(ref_tweet_df['referenced_tweets']):
#     ref_tweet_df['ref_id'] = np.empty((len(ref_tweet_df), 0)).tolist()
#     try:
#         id_list.append(j[0].id)
#     except TypeError:
#         id_list.append(0)
# ref_tweet_df['ref_id'] = id_list

### Tweet, User, Geography and Referenced Tweet Information Merged

In [18]:
tweets_clean_df = tweets_df.drop(columns = ['Entities', 'Metrics', 'Ref Tweets', 'Auto Context Annotations'])
users_clean_df = users_df.drop(columns = ['public_metrics'])
ref_tweet_clean_df = ref_tweet_df.drop(columns = ['entities', 'public_metrics', 'context_annotations'])

In [21]:
tweets_users_df = tweets_clean_df.merge(users_clean_df, left_on = 'Author ID', right_on = 'id', how = 'outer')
tweets_all = tweets_users_df.merge(ref_tweet_clean_df, left_on = 'ref_tweet_id', right_on = 'id', how = 'outer', suffixes=("_og", "_ref"))

In [66]:
just_tweets = [j if j is not np.nan else tweets_all['text'][i] for i, j in enumerate(tweets_all['Tweet'])]

In [25]:
print(tweets_all.columns)

Index(['Author ID', 'Tweet', 'Time Created', 'Tweet_ID', 'Location',
       'Auto Context Annotations Entity', 'annotations_og', 'cashtags_og',
       'hashtags_og', 'mentions_og', 'urls_og', 'retweet_count_og',
       'reply_count_og', 'like_count_og', 'quote_count_og', 'ref_tweet_id',
       'ref_tweet_type', 'id_og', 'location', 'name', 'username',
       'followers_count', 'following_count', 'tweet_count', 'listed_count',
       'author_id', 'created_at', 'id_ref', 'referenced_tweets', 'text',
       'annotations_ref', 'cashtags_ref', 'hashtags_ref', 'mentions_ref',
       'urls_ref', 'retweet_count_ref', 'reply_count_ref', 'like_count_ref',
       'quote_count_ref'],
      dtype='object')


### Text Preprocessing

In [26]:
stop_words = stopwords.words('english')
other_words = ['rt'] #retweet
stop_words.extend(other_words)
tokenizer = RegexpTokenizer(r'\w+')

In [27]:
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F600-\U0001F64F"  # emoticons
                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                u"\U0001F700-\U0001F77F"  # alchemical symbols
                                u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                                u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                                u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                                u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                                u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                                u"\U00002702-\U000027B0"  # Dingbats
                                u"\U000024C2-\U0001F251" 
                                "]+" , flags = re.UNICODE)
    return regrex_pattern.sub(r'', text)

In [28]:
def preprocessing_tweets(tweets):
    processed_tweets = []
    for i in tweets:

        #removing all emojis
        deemojified = deEmojify(i)
        
        # splitting with spaces
        tokenized = deemojified.split()
        each_tweet = []
        for i in tokenized:
            #lowercase all words
            lowered = i.lower()
            
            #removing all non-word characters
            lowered = re.sub('[^\w\@\']', '', lowered)
            
            #remove all links, stopwords and retweet handles
            # lemmatize all values
            if not('http' in lowered) and not(lowered in stop_words) and not('@' in lowered) and not ('nauguration' in lowered):
                each_tweet.append(Word(lowered).lemmatize())
                
        processed_tweets.append(" ".join(each_tweet))
    return(processed_tweets)

def hashtag(tweets):
    hashtags = [re.sub('#', '', j) for i in tweets for j in i.split() if (('#' in j) and ('HS2' not in j) and ('hs2' not in j))]
    return(hashtags)

In [67]:
# Preprocessed tweets vs Original Tweets
pd.DataFrame(list(zip(just_tweets, preprocessing_tweets(just_tweets))), columns = ['Original Tweet', 'Preprocessed Tweet'])

Unnamed: 0,Original Tweet,Preprocessed Tweet
0,@Dawn4GBYT Could be a deliberate attempt to de...,could deliberate attempt deflect socialcare bi...
1,@helenhalcrow @LouiseAKennedy @EdwardJDavey @S...,ed 'i support hs2' davey whose candidate's can...
2,RT @CBItweets: “To solve this country’s transp...,solve country transport problem cant endlessly...
3,RT @CBItweets: “Our rail package is fantastic ...,rail package fantastic 96bn improving rail se...
4,RT @CBItweets: “It’s not a row-back – it’s bet...,rowback better say prime minister defence rev...
5,RT @_PaulMonaghan: I recall meeting with #HS2 ...,recall meeting hs2 asked support lobbying effo...
6,RT @RichardWellings: The cancellation of the e...,cancellation eastern leg hs2 vindicates view d...
7,RT @mcahs2: They've spent the last two years d...,they've spent last two year ground investigati...
8,RT @ElRaynerista: LIVE SCENES as The Fat Conma...,live scene fat conman seal thomas hs2 tank eng...
9,#HS2 boosts job opportunities for young people...,hs2 boost job opportunity young people west mi...


In [68]:
# Using the number of followers as a proxy for the importance of the tweet

followers_count_normalised = []
for i in tweets_all['followers_count']:
    followers_count_normalised.append((i - min(tweets_all['followers_count']))/(max(tweets_all['followers_count'] - min(tweets_all['followers_count']))))

In [69]:
# Polarity of 1 means positive, -1 means negative
# Subjectivity of 1 means very subjective, 0 means non-subjective

polarity = []
subjectivity = []
for i in preprocessing_tweets(just_tweets):
    polarity.append(TextBlob(i).sentiment[0])
    subjectivity.append(TextBlob(i).sentiment[1])

tweet_polarity = np.mean(polarity)
tweet_subjectivity = np.mean(subjectivity)

print("Polarity of Tweet: " + str(round(tweet_polarity, 4)))
print("Subjectivity of Tweet: " + str(round(tweet_subjectivity,4)))

# Overall, the polarity and subjectivity appears to be on the positive side, although judging by the raw data, this does not seem to be the case

Polarity of Tweet: 0.0606
Subjectivity of Tweet: 0.3817


In [73]:
scaled_polarity = [i * j for i, j in zip(followers_count_normalised, polarity)]
scaled_subjectivity = [i * j for i, j in zip(followers_count_normalised, subjectivity)]

print("Polarity of Tweet (with scaling): " + str(round(np.mean(scaled_polarity), 4)))
print("Subjectivity of Tweet (with scaling): " + str(round(np.mean(scaled_subjectivity),4)))

Polarity of Tweet (with scaling): -0.033
Subjectivity of Tweet (with scaling): 0.0668


In [30]:
# Because hashtags are single words and do not reflect a lot of meaning, the polarity and subjectivity of each hashtag couldn't be identified

# Polarity of 1 means positive, -1 means negative
# Subjectivity of 1 means very subjective, 0 means non-subjective

hashtag_polarity = np.mean([TextBlob(i).sentiment[0] for i in hashtag(just_tweets)])
hashtag_subjectivity = np.mean([TextBlob(i).sentiment[1] for i in hashtag(just_tweets)])

print("Polarity of Hashtags: " + str(round(hashtag_polarity, 4)))
print("Subjectivity of Hashtags: " + str(round(hashtag_subjectivity,4)))

Polarity of Hashtags: 0.0
Subjectivity of Hashtags: 0.0


In [74]:
output_notebook()

x_scatter = polarity
y_scatter = subjectivity

# plot 
scatter_plot = figure(plot_width=500, plot_height=300, x_axis_label='Polarity', y_axis_label='Subjectivity')
scatter_plot.circle(x_scatter, y_scatter, size=5, line_color='navy', fill_color='gray', fill_alpha=0.5)
scatter_plot.add_tools(HoverTool())
show(scatter_plot)

In [41]:
words = []
for i in preprocessing_tweets(just_tweets):
    words.extend(i.split())
    
words_df = pd.DataFrame()
words_df['word'] = list(dict(Counter(words)).keys())
words_df['count'] = list(dict(Counter(words)).values())
words_df = words_df.sort_values(by=['count'], ascending = False)

In [75]:
p = figure(x_range=list(words_df.head(30)['word']), plot_height=350)

p.vbar(x =words_df['word'].head(30), top=words_df['count'].head(30), \
       width=0.9, line_color='white', \
       fill_color = random.sample(Plasma256,30))

p.xaxis.major_label_orientation = "vertical"
p.add_tools(HoverTool())
p.y_range=Range1d(0, 15)
p.title.text="Word Counts"
p.title.text_font_size = "25px"
p.title.align = 'center'
p.xaxis.axis_label = 'Word'
p.yaxis.axis_label = 'Count'

show(p)

## DistilBERT

In [253]:
#pip install git+https://github.com/huggingface/transformers.git

In [43]:
from transformers import pipeline

# Defaults to distilBERT
# Light version of BERT
classifier = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [76]:
db_label = []
db_score = []

for i, j in enumerate(preprocessing_tweets(just_tweets)):
    db_label.append(classifier(j)[0]['label'])
    db_score.append(classifier(j)[0]['score'])

distilbert_sentiment = pd.DataFrame(just_tweets)
distilbert_sentiment['Label'] = db_label
distilbert_sentiment['Score'] = db_score

In [77]:
# Using the Distilbert Sentiment analysis to identify the sentiment of each comment
distilbert_sentiment

Unnamed: 0,0,Label,Score
0,@Dawn4GBYT Could be a deliberate attempt to de...,NEGATIVE,0.998839
1,@helenhalcrow @LouiseAKennedy @EdwardJDavey @S...,NEGATIVE,0.990207
2,RT @CBItweets: “To solve this country’s transp...,NEGATIVE,0.969417
3,RT @CBItweets: “Our rail package is fantastic ...,POSITIVE,0.999426
4,RT @CBItweets: “It’s not a row-back – it’s bet...,NEGATIVE,0.993742
5,RT @_PaulMonaghan: I recall meeting with #HS2 ...,NEGATIVE,0.983938
6,RT @RichardWellings: The cancellation of the e...,NEGATIVE,0.845511
7,RT @mcahs2: They've spent the last two years d...,NEGATIVE,0.994045
8,RT @ElRaynerista: LIVE SCENES as The Fat Conma...,NEGATIVE,0.993588
9,#HS2 boosts job opportunities for young people...,POSITIVE,0.998989


# Resources
https://dev.to/twitterdev/a-comprehensive-guide-for-using-the-twitter-api-v2-using-tweepy-in-python-15d9

In [None]:
# user_geo_reftweet = {}
# for j in ['users','places','tweets']:
#     # since we cannot extract geographical location yet, 'places' is unavailable 

#     try:
#         parameter = query.includes[j]
#     except:
#         continue
#     else:
#         ref_tweet_df = []
#         og_author_list = []
#         for i in range(max_results):
#             try:
#                 parameter = query.includes[j][i]
#                 ref_tweet_df.append(parameter)
#             except IndexError:
#                 continue
#             else:
#                 og_author_list.append(query.data[i]['author_id'])
#     user_geo_reftweet[j] = [ref_tweet_df, og_author_list]
        

In [None]:
# users_df = pd.DataFrame(user_geo_reftweet['users'][0])
# users_df['og_author_id'] = user_geo_reftweet['users'][1]

# ref_tweet_df = pd.DataFrame(user_geo_reftweet['tweets'][0])
# ref_tweet_df['og_author_id'] = user_geo_reftweet['tweets'][1]