# HS2 Tweets
### Importing Packages

In [2]:
import tweepy as tw
import pprint
import numpy as np
import nltk
import bokeh
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import re

from collections import Counter

from wordcloud import WordCloud

from bokeh.io import output_notebook, show, reset_output
from bokeh.plotting import figure
from bokeh.models import HoverTool
from bokeh.models import ColumnDataSource
from bokeh.palettes import Plasma256
from bokeh.models import Range1d

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize

from textblob import Word, TextBlob

### Twitter API Access & Set-Up

In [3]:
# Twitter API credentials are read from a plain-text file with one value per line,
# in this order: API key, API secret, bearer token, access token, access token secret.
# NOTE(review): this is an absolute, machine-specific path — consider a relative
# path or an environment variable so the notebook can run elsewhere.
credentials_path = r"C:\Users\yeungf8452\OneDrive - ARCADIS\HS2 (Data Science)\HS2 Twitter\twitter_api.txt"

# The context manager guarantees the file handle is closed once the lines are read.
with open(credentials_path, "r") as credentials_file:
    file = credentials_file.readlines()

# Fail with a clear message instead of an IndexError further down.
if len(file) < 5:
    raise ValueError(
        "Expected 5 lines in the credentials file "
        "(API key, API secret, bearer token, access token, access token secret); "
        "found " + str(len(file))
    )

api = file[0].strip('\n')
api_secret = file[1].strip('\n')
bearer_token = file[2].strip('\n')
access_token = file[3].strip('\n')
access_token_secret = file[4].strip('\n')

In [4]:
# Build an OAuth handler from the API key/secret, attach the account's access
# token pair, and create the tweepy API handle from it.
auth = tw.OAuthHandler(api, api_secret)
auth.set_access_token(access_token, access_token_secret)
api_app = tw.API(
    auth,
    wait_on_rate_limit=True,
)

### Query

In [1]:
# NOTE(review): the output saved under this cell is "NameError: name 'tw' is not
# defined" and its execution count is 1, so it was last run before the imports
# cell. Re-run the notebook top to bottom before relying on the results.
client = tw.Client(bearer_token)

# Search query. The parentheses are required: a space (implicit AND) binds tighter
# than OR, so without them `lang:en` would only constrain the second hashtag.
# Named `search_query` rather than `hashtag` so it cannot collide with the
# `hashtag()` helper function defined later in this notebook.
search_query = "(#HS2 OR #hs2) lang:en"

# Extra fields to request for each object type (see the note above the search call).
tweet_fields = ['context_annotations', 'created_at', 'entities', 'geo', 'public_metrics', 'referenced_tweets']
# user fields: https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/user
user_fields = ['location', 'public_metrics']
expansions = ['geo.place_id', 'author_id', 'referenced_tweets.id']
# place fields: https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/place
place_fields = ['name', 'geo']
max_results = 10

# For search options: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
# Only operators marked "core" can be used
# More examples: https://github.com/twitterdev/getting-started-with-the-twitter-api-v2-for-academic-research/blob/main/modules/5-how-to-write-search-queries.md

# Use pagination if more than 100 tweets are needed
# query = tw.Paginator(client.search_recent_tweets, "HS2", max_results=10, limit=3)
# Use .flatten when looking through data if using the paginator
# https://docs.tweepy.org/en/stable/pagination.html
# https://developer.twitter.com/en/docs/twitter-api/pagination

# The *_fields / expansions arguments are necessary in queries to get details
query = client.search_recent_tweets(
    query=search_query,
    tweet_fields=tweet_fields,
    user_fields=user_fields,
    expansions=expansions,
    place_fields=place_fields,
    max_results=max_results,
)

NameError: name 'tw' is not defined

### Tweet Information

In [188]:
# Flatten every returned tweet into a plain record, one row per tweet.
# `query.data or []`: guards against a search that matched nothing.
# NOTE(review): presumably tweepy sets `data` to None in that case — confirm.
tweet_columns = ['Author ID', 'Tweet', 'Time Created', 'Tweet_ID', 'Auto Context Annotations',
                 'Entities', 'Location', 'Metrics', 'Ref Tweets']

tweets = [
    {
        'Author ID': tweet.author_id,
        'Tweet': tweet.text,
        'Time Created': tweet.created_at,
        'Tweet_ID': tweet.id,
        'Auto Context Annotations': tweet.context_annotations,
        'Entities': tweet.entities,
        'Location': tweet.geo,
        'Metrics': tweet.public_metrics,
        'Ref Tweets': tweet.referenced_tweets,
    }
    for tweet in (query.data or [])
]

# Passing the column list keeps the expected columns even when `tweets` is empty.
tweets_df = pd.DataFrame(tweets, columns=tweet_columns)

In [189]:
# For each tweet, collect the `name` of the entity attached to every one of its
# automatic context annotations (an empty list when the tweet has none).
tweets_df['Auto Context Annotations Entity'] = [
    [annotation['entity']['name'] for annotation in annotations]
    for annotations in tweets_df['Auto Context Annotations']
]

In [190]:
# Maps each entity type to the key that holds its human-readable value.
entity_types = {'annotations':'normalized_text', 'cashtags': 'tag', 'hashtags': 'tag', 'mentions': 'username', 'urls': 'url'}

# Add one column per entity type, holding the list of values found in each tweet.
for entity_type, value_key in entity_types.items():
    entity_list = []
    for entities in tweets_df['Entities']:
        try:
            entity_list.append([a_dict[value_key] for a_dict in entities[entity_type]])
        except (KeyError, TypeError):
            # KeyError: the tweet has no entities of this type.
            # TypeError: the tweet has no entities at all, so the value cannot be
            # indexed. NOTE(review): presumably None in that case — confirm.
            entity_list.append([])
    tweets_df[entity_type] = entity_list

In [191]:
metric_types = ["retweet_count", "reply_count", "like_count", "quote_count"]

# Promote each public metric out of the per-tweet metrics dict into its own column.
for metric_name in metric_types:
    tweets_df[metric_name] = [metrics[metric_name] for metrics in tweets_df['Metrics']]

In [192]:
# Record the id and type of the first referenced tweet for every tweet.
# Tweets that reference nothing get id 0 and type NaN.
ref_tweet_id = []
ref_tweet_type = []
for referenced in tweets_df['Ref Tweets']:
    try:
        first_reference = referenced[0]
        reference_id, reference_type = first_reference.id, first_reference.type
    except (TypeError, IndexError, AttributeError):
        # Only the failures expected from a missing/empty/malformed reference are
        # handled; anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        reference_id, reference_type = 0, np.nan
    ref_tweet_id.append(reference_id)
    ref_tweet_type.append(reference_type)

tweets_df['ref_tweet_id'] = ref_tweet_id
tweets_df['ref_tweet_type'] = ref_tweet_type


## User, Geographical and Referenced Tweet Data

In [193]:
# Collect the expanded objects returned alongside the tweets.
user_geo_reftweet = {}
for include_key in ['users', 'places', 'tweets']:
    # A key is skipped when it is absent from `includes`.
    if include_key in query.includes:
        # dict.fromkeys de-duplicates like a set would, but keeps the order in
        # which the API returned the objects, so row order is stable.
        user_geo_reftweet[include_key] = list(dict.fromkeys(query.includes[include_key]))

# .get(..., []) yields an empty frame instead of a KeyError when a key was skipped.
users_df = pd.DataFrame(user_geo_reftweet.get('users', []))
ref_tweet_df = pd.DataFrame(user_geo_reftweet.get('tweets', []))

### User Information

In [194]:
users_pub_metrics = ["followers_count","following_count","tweet_count","listed_count"]

# Promote each user-level public metric out of the metrics dict into its own column.
for metric_name in users_pub_metrics:
    users_df[metric_name] = [metrics[metric_name] for metrics in users_df['public_metrics']]

### Referenced Tweet Information

In [195]:
# Maps each entity type to the key that holds its human-readable value.
entity_types = {'annotations':'normalized_text', 'cashtags': 'tag', 'hashtags': 'tag', 'mentions': 'username', 'urls': 'url'}

# Add one column per entity type, holding the list of values found in each
# referenced tweet.
for entity_type, value_key in entity_types.items():
    entity_list = []
    for entities in ref_tweet_df['entities']:
        try:
            entity_list.append([a_dict[value_key] for a_dict in entities[entity_type]])
        except (KeyError, TypeError):
            # KeyError: the referenced tweet has no entities of this type.
            # TypeError: the referenced tweet has no entities at all, so the cell
            # cannot be indexed. NOTE(review): presumably NaN in that case — confirm.
            entity_list.append([])
    ref_tweet_df[entity_type] = entity_list

In [196]:
metric_types = ["retweet_count", "reply_count", "like_count", "quote_count"]

# Promote each public metric of the referenced tweets into its own column.
for metric_name in metric_types:
    ref_tweet_df[metric_name] = [metrics[metric_name] for metrics in ref_tweet_df['public_metrics']]

In [156]:
# id_list = []
# for i, j in enumerate(ref_tweet_df['referenced_tweets']):
#     ref_tweet_df['ref_id'] = np.empty((len(ref_tweet_df), 0)).tolist()
#     try:
#         id_list.append(j[0].id)
#     except TypeError:
#         id_list.append(0)
# ref_tweet_df['ref_id'] = id_list

### Tweet, User, Geography and Referenced Tweet Information Merged

In [218]:
# Drop the raw nested columns now that their contents have been flattened into
# dedicated columns.
tweets_clean_df = tweets_df.drop(columns = ['Entities', 'Metrics', 'Ref Tweets', 'Auto Context Annotations'])
users_clean_df = users_df.drop(columns = ['public_metrics'])
# errors='ignore': tolerate a missing column instead of raising KeyError.
# NOTE(review): presumably `context_annotations` is absent when no referenced
# tweet carried that field — confirm.
ref_tweet_clean_df = ref_tweet_df.drop(
    columns = ['entities', 'public_metrics', 'context_annotations'],
    errors = 'ignore',
)

In [217]:
# Preview the cleaned referenced-tweet frame.
# NOTE(review): the saved output still lists a context_annotations column and this
# cell's execution count (217) precedes the cleaning cell's (218), so the output
# looks stale — re-run this cell after the cleaning cell.
ref_tweet_clean_df

Unnamed: 0,author_id,context_annotations,created_at,id,text,annotations,cashtags,hashtags,mentions,urls,retweet_count,reply_count,like_count,quote_count
0,706747004,"[{'domain': {'id': '10', 'name': 'Person', 'de...",2021-10-25 13:27:32+00:00,1452627891494207490,"Air quality is a matter of social justice, all...",[London],[],[],"[SadiqKhan, BBCPolitics]",[https://t.co/wnxhNfh8cL],3,11,25,1
1,1153265544877019136,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2021-10-18 22:09:10+00:00,1450222447911972869,Legacy Centre of Excellence and @LegacyImpactU...,[],[],"[business, businessowner, diversity, construct...","[LegacyImpactUK, HS2ltd]","[https://t.co/wGzTLGU7uF, https://t.co/hzx9PYJ...",5,0,10,1
2,1046748134683148289,,2021-10-25 14:30:05+00:00,1452643632024981510,#ROADWORKS\n\n🚧⛔ Overnight full closures for #...,[],[],"[ROADWORKS, HS2, M42, M6SouthLink, M6, Coleshill]",[HighwaysWMIDS],"[https://t.co/9NwExj5RO2, https://t.co/KM09OrV...",1,0,0,0
3,1166333571574960128,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",2021-10-22 18:14:06+00:00,1451612843690889223,Speculation on why the govts Integrated Rail P...,[Midlands],[],"[HS2, NPR]","[theipaper, BorisJohnson]",[https://t.co/iJTeSIqztD],3,1,8,1
4,1166333571574960128,"[{'domain': {'id': '10', 'name': 'Person', 'de...",2021-10-24 13:55:29+00:00,1452272539149942793,Highlights from @MayorOfWY @TracyBrabin on @Ri...,[],[],"[HS2, NPR]","[MayorOfWY, TracyBrabin, RidgeOnSunday]",[https://t.co/by0rFWQi3W],9,5,36,1
5,1151206672527962112,,2021-10-25 14:22:21+00:00,1452641684890730506,The reduction in domestic aviation achievable ...,[UK],[],[hs2],[],[https://t.co/dFtgFpbpth],0,1,0,0
6,270869723,,2021-10-25 14:00:22+00:00,1452636154248011794,At Meet The Contractor 2021 find out how to gr...,[],[],"[procurement, SupplyChain, HS2]",[],"[https://t.co/P1zC2H2GSs, https://t.co/a4NTeej...",1,0,3,0


In [219]:
# Step 1: attach author details to each tweet (outer join keeps unmatched rows
# from both sides).
tweets_users_df = pd.merge(
    tweets_clean_df,
    users_clean_df,
    how = 'outer',
    left_on = 'Author ID',
    right_on = 'id',
)

# Step 2: attach the referenced tweet's details; columns present on both sides
# get the "_og" (original) / "_ref" (referenced) suffixes.
tweets_all = pd.merge(
    tweets_users_df,
    ref_tweet_clean_df,
    how = 'outer',
    left_on = 'ref_tweet_id',
    right_on = 'id',
    suffixes = ("_og", "_ref"),
)

In [266]:
# One text per row: prefer the referenced tweet's text and fall back to the
# tweet's own text where there is no referenced tweet.
# fillna treats every missing-value representation (NaN, None) as missing, unlike
# an identity comparison against np.nan, and aligns the two columns by index.
just_tweets = tweets_all['text'].fillna(tweets_all['Tweet']).tolist()

### Text Preprocessing

In [268]:
# English stop words from NLTK plus project-specific noise tokens
# ('rt' marks retweets).
other_words = ['rt']
stop_words = stopwords.words('english') + other_words

# Tokenizer that keeps runs of word characters only.
tokenizer = RegexpTokenizer(r'\w+')

In [269]:
# Compiled once when the cell runs instead of on every call.
# Each range is listed explicitly. A single catch-all range from U+24C2 to U+1F251
# would also cover CJK, Hangul and many other scripts, deleting ordinary text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F000-\U0001F2FF"  # mahjong/domino/card tiles, enclosed alphanumerics & ideographs, flags
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U000024C2"             # circled M
    "\U000025A0-\U000025FF"  # Geometric Shapes
    "\U00002600-\U000027BF"  # Miscellaneous Symbols and Dingbats
    "\U00002934-\U00002935"  # curved arrows
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part alternation mark, circled ideographs
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def deEmojify(text):
    """Return `text` with emoji and pictographic symbols removed.

    Every maximal run of characters matched by `_EMOJI_PATTERN` is replaced by
    the empty string; surrounding whitespace is left as-is. Letters of any
    script (Latin, CJK, Cyrillic, ...) are preserved.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    return _EMOJI_PATTERN.sub('', text)

In [270]:
def preprocessing_tweets(tweets):
    """Clean and normalise raw tweet texts for word-level analysis.

    For each tweet: emoji are removed with `deEmojify`, the text is split on
    whitespace, and every token is lower-cased and stripped of all characters
    other than word characters, '@' and apostrophes. Tokens are then discarded
    when they are empty, contain 'http' (links), contain '@' (handles), contain
    'nauguration', or are in `stop_words`. Remaining tokens are lemmatized with
    TextBlob's `Word.lemmatize()` and re-joined with single spaces.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.

    Returns
    -------
    list of str
        One cleaned string per input tweet, in the same order.
    """
    # Set membership is O(1); `stop_words` itself is a list.
    stop_set = set(stop_words)

    processed_tweets = []
    for tweet in tweets:
        # removing all emojis, then splitting on whitespace
        tokens = deEmojify(tweet).split()

        each_tweet = []
        for token in tokens:
            # lowercase, then remove every character that is not a word
            # character, '@' or an apostrophe (raw string: no invalid escapes)
            lowered = re.sub(r"[^\w@']", '', token.lower())

            # tokens made only of punctuation collapse to '' — skip them so they
            # do not add stray spaces to the joined result
            if not lowered:
                continue

            # remove all links, stopwords and retweet handles
            if 'http' in lowered or '@' in lowered or 'nauguration' in lowered or lowered in stop_set:
                continue

            # lemmatize all remaining values
            each_tweet.append(Word(lowered).lemmatize())

        processed_tweets.append(" ".join(each_tweet))
    return processed_tweets

def hashtag(tweets):
    """Extract hashtags (without the leading '#') from tweet texts.

    A hashtag is a '#' that does not directly follow a word character, followed
    by one or more word characters; trailing punctuation is therefore not part
    of the tag. Tags containing 'hs2' in any letter case are excluded.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.

    Returns
    -------
    list of str
        The remaining hashtags in order of appearance (duplicates kept).
    """
    found_tags = (
        tag
        for tweet in tweets
        for tag in re.findall(r'(?<!\w)#(\w+)', tweet)
    )
    return [tag for tag in found_tags if 'hs2' not in tag.lower()]

In [241]:
# Score every pre-processed tweet with TextBlob.
# sentiment[0] is polarity, sentiment[1] is subjectivity.
polarity = []
subjectivity = []
for processed_tweet in preprocessing_tweets(just_tweets):
    # Analyse each tweet once and reuse the result for both lists.
    sentiment = TextBlob(processed_tweet).sentiment
    polarity.append(sentiment[0])
    subjectivity.append(sentiment[1])

In [271]:
# Average TextBlob sentiment over all (non-HS2) hashtags.
# Extract and score the hashtags once, then reuse the results for both means.
hashtag_sentiments = [TextBlob(tag).sentiment for tag in hashtag(just_tweets)]

if hashtag_sentiments:
    hashtag_polarity = np.mean([sentiment[0] for sentiment in hashtag_sentiments])
    hashtag_subjectivity = np.mean([sentiment[1] for sentiment in hashtag_sentiments])
else:
    # No hashtags: report NaN explicitly instead of triggering numpy's
    # "mean of empty slice" warning.
    hashtag_polarity = hashtag_subjectivity = np.nan

print("Polarity of Hashtags: " + str(round(hashtag_polarity, 4)))
print("Subjectivity of Hashtags: " + str(round(hashtag_subjectivity,4)))

Polarity of Hashtags: 0.0
Subjectivity of Hashtags: 0.0


In [272]:
# Polarity of 1 means positive, -1 means negative
# Subjectivity of 1 means very subjective, 0 means non-subjective
# NOTE(review): this cell repeats the tweet- and hashtag-sentiment computations of
# the two preceding cells; consider keeping only one copy.

polarity = []
subjectivity = []
for processed_tweet in preprocessing_tweets(just_tweets):
    # Analyse each tweet once and reuse the result for both lists.
    sentiment = TextBlob(processed_tweet).sentiment
    polarity.append(sentiment[0])
    subjectivity.append(sentiment[1])

# Extract and score the hashtags once, then reuse the results for both means.
hashtag_sentiments = [TextBlob(tag).sentiment for tag in hashtag(just_tweets)]

if hashtag_sentiments:
    hashtag_polarity = np.mean([sentiment[0] for sentiment in hashtag_sentiments])
    hashtag_subjectivity = np.mean([sentiment[1] for sentiment in hashtag_sentiments])
else:
    # No hashtags: report NaN explicitly instead of triggering numpy's
    # "mean of empty slice" warning.
    hashtag_polarity = hashtag_subjectivity = np.nan

print("Polarity of Hashtags: " + str(round(hashtag_polarity, 4)))
print("Subjectivity of Hashtags: " + str(round(hashtag_subjectivity,4)))

Polarity of Hashtags: 0.0
Subjectivity of Hashtags: 0.0


In [273]:
# Render Bokeh figures inline in the notebook.
output_notebook()

x_scatter, y_scatter = polarity, subjectivity

# One point per tweet: polarity on the x-axis, subjectivity on the y-axis.
scatter_plot = figure(
    plot_width=500,
    plot_height=300,
    x_axis_label='Polarity',
    y_axis_label='Subjectivity',
)
scatter_plot.circle(
    x_scatter,
    y_scatter,
    size=5,
    line_color='navy',
    fill_color='gray',
    fill_alpha=0.5,
)
scatter_plot.add_tools(HoverTool())
show(scatter_plot)

In [274]:
# Flatten all pre-processed tweets into a single list of words.
words = [
    word
    for processed_tweet in preprocessing_tweets(just_tweets)
    for word in processed_tweet.split()
]

# Tally the words once, then tabulate with the most frequent first.
word_counts = Counter(words)
words_df = pd.DataFrame({
    'word': list(word_counts.keys()),
    'count': list(word_counts.values()),
})
words_df = words_df.sort_values(by=['count'], ascending = False)

In [275]:
# Bar chart of the (up to) 30 most frequent words.
top_words = words_df.head(30)
top_word_list = top_words['word'].tolist()
top_count_list = top_words['count'].tolist()

p = figure(x_range=top_word_list, plot_height=350)

# Sample exactly one colour per bar, so the colour list always matches the number
# of bars even when fewer than 30 distinct words exist.
# NOTE(review): the sample is unseeded, so bar colours change between runs.
p.vbar(x=top_word_list, top=top_count_list,
       width=0.9, line_color='white',
       fill_color=random.sample(Plasma256, len(top_word_list)))

p.xaxis.major_label_orientation = "vertical"
p.add_tools(HoverTool())
# Keep the 0-15 range as a minimum, but grow it so taller bars are not clipped.
p.y_range = Range1d(0, max(15, max(top_count_list, default=0) + 1))
p.title.text = "Word Counts"
p.title.text_font_size = "25px"
p.title.align = 'center'
p.xaxis.axis_label = 'Word'
p.yaxis.axis_label = 'Count'

show(p)

## DistilBERT

In [253]:
#pip install git+https://github.com/huggingface/transformers.git

In [276]:
from transformers import pipeline

# DistilBERT: a light version of BERT.
# The checkpoint is named explicitly. It is the one the pipeline reports
# defaulting to when no model is supplied, so results stay the same if the
# library's default changes.
sentiment_model = 'distilbert-base-uncased-finetuned-sst-2-english'
classifier = pipeline('sentiment-analysis', model=sentiment_model)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are 

In [284]:
# Classify every pre-processed tweet and keep the predicted label and its score.
db_label = []
db_score = []

for processed_tweet in preprocessing_tweets(just_tweets):
    # Run the model once per tweet and read both fields from the same prediction.
    prediction = classifier(processed_tweet)[0]
    db_label.append(prediction['label'])
    db_score.append(prediction['score'])

# Column 0 holds the raw tweet text; Label/Score hold the model output.
distilbert_sentiment = pd.DataFrame(just_tweets)
distilbert_sentiment['Label'] = db_label
distilbert_sentiment['Score'] = db_score

In [287]:
# Spot-check a single row: position 7, column 0 (the raw tweet text).
# NOTE(review): the hard-coded position assumes the frame has at least 8 rows.
distilbert_sentiment.iloc[7,0]   

'At Meet The Contractor 2021 find out how to grow your business with HS2. Get the latest on our #procurement pipelines and how you can become a vital part of the HS2 #SupplyChain. \nRegister now for the virtual event: https://t.co/P1zC2H2GSs #HS2 https://t.co/a4NTeejQJ4'

# Resources
- [A comprehensive guide for using the Twitter API v2 using Tweepy in Python](https://dev.to/twitterdev/a-comprehensive-guide-for-using-the-twitter-api-v2-using-tweepy-in-python-15d9)

In [None]:
# user_geo_reftweet = {}
# for j in ['users','places','tweets']:
#     # since we cannot extract geographical location yet, 'places' is unavailable 

#     try:
#         parameter = query.includes[j]
#     except:
#         continue
#     else:
#         ref_tweet_df = []
#         og_author_list = []
#         for i in range(max_results):
#             try:
#                 parameter = query.includes[j][i]
#                 ref_tweet_df.append(parameter)
#             except IndexError:
#                 continue
#             else:
#                 og_author_list.append(query.data[i]['author_id'])
#     user_geo_reftweet[j] = [ref_tweet_df, og_author_list]
        

In [None]:
# users_df = pd.DataFrame(user_geo_reftweet['users'][0])
# users_df['og_author_id'] = user_geo_reftweet['users'][1]

# ref_tweet_df = pd.DataFrame(user_geo_reftweet['tweets'][0])
# ref_tweet_df['og_author_id'] = user_geo_reftweet['tweets'][1]