In [2]:
from mongodbcredentials import CONNECTION_STRING
from pymongo import MongoClient
import certifi
import pandas as pd
from twitter_modules import database_as_tweet, database_as_bert, pos_neg_count, plot_topics

In [3]:
client = MongoClient(CONNECTION_STRING, tlsCAFile=certifi.where())

In [4]:
twitter_facemasks = client.TwitterFacemasks
twitter_lockdown = client.TwitterLockdown
twitter_pcr = client.TwitterPCR
twitter_pfizer = client.TwitterPfizer
twitter_quarantine = client.TwitterQuarantine
twitter_restrictions = client.TwitterRestrictions
twitter_vaccine = client.TwitterVaccination

In [5]:
facemasks_dict = database_as_tweet(twitter_facemasks)
lockdown_dict = database_as_tweet(twitter_lockdown)
pcr_dict = database_as_tweet(twitter_pcr)
pfizer_dict = database_as_tweet(twitter_pfizer)
quarantine_dict = database_as_tweet(twitter_quarantine)
restrictions_dict = database_as_tweet(twitter_restrictions)
vaccine_dict = database_as_tweet(twitter_vaccine)


In [6]:
facemasks_df = pd.DataFrame(facemasks_dict, columns=['tweet'])
lockdown_df = pd.DataFrame(lockdown_dict, columns=['tweet'])
pcr_df = pd.DataFrame(pcr_dict, columns=['tweet'])
pfizer_df = pd.DataFrame(pfizer_dict, columns=['tweet'])
quarantine_df = pd.DataFrame(quarantine_dict, columns=['tweet'])
restrictions_df = pd.DataFrame(restrictions_dict, columns=['tweet'])
vaccine_df = pd.DataFrame(vaccine_dict, columns=['tweet'])

In [12]:
facemask_sentiment = facemasks_df.apply(database_as_bert, axis=1)

In [13]:
lockdown_sentiment = lockdown_df.apply(database_as_bert, axis=1)

In [14]:
pcr_sentiment = pcr_df.apply(database_as_bert, axis=1)

In [15]:
pfizer_sentiment = pfizer_df.apply(database_as_bert, axis=1)

In [16]:
quarantine_sentiment = quarantine_df.apply(database_as_bert, axis=1)

In [17]:
restrictions_sentiment = restrictions_df.apply(database_as_bert, axis=1)

In [7]:
for t in vaccine_df["tweet"]:
    if len(t)>500:
        print("THIS IS THE START OF THE TWEET: ", t + "\n")

THIS IS THE START OF THE TWEET:  @TWTThisIsNow @chimera414 @jimdtweet @Gordon_DHG @BubbasRanch @JohnDublin10 @4Clearsky @GlennCarr6 @Kenneth72712993 @BigRakaDoc @EricWil06256732 @GuyGadboisGuyG1 @wildweatherdan @robhon_ @try2golf @ejwwest @HalBrow68884227 @Tragiicomedy @covid_parent @Canadianworker2 @IanDJbrown2 @IngersolRobert @Narvuntien @dan613 @Boeing74 @organicdot @oakden_wolf @keithamccluskey @RobMeekel @25_cycle @sunsnowflowers @OscarsWild1 @BridgetHolmstro @rockandroll432 @MassiMassian @DaveJohn0175 @TheDisproof @rvrs_man @Shinedown1911 @priscian @FillmoreWhite @robinlarder @RealBeeOlogist @DawnTJ90 @SniemN @BarbaraGirouard @rln_nelson @Jaisans @Extinct55021670 @GeraldKutney Again why do you think they're not?
Public awareness has risen because of unprecedented vaccine injury/death. 
Reporting criteria (underreporting) as shown in the Harvard study is just as applicable now as before. 
I wasn't aware of VAERS until COVID (because of huge injury toll)

THIS IS THE START OF THE T

In [29]:
from scipy.special import softmax
import urllib.request
import csv
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [30]:
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]


model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", model_max_length=512, padding_side="right")
    
model.save_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
tokenizer.save_pretrained("cardiffnlp/twitter-roberta-base-sentiment", model_max_length=512, padding_side="right")

('cardiffnlp/twitter-roberta-base-sentiment\\tokenizer_config.json',
 'cardiffnlp/twitter-roberta-base-sentiment\\special_tokens_map.json',
 'cardiffnlp/twitter-roberta-base-sentiment\\vocab.json',
 'cardiffnlp/twitter-roberta-base-sentiment\\merges.txt',
 'cardiffnlp/twitter-roberta-base-sentiment\\added_tokens.json',
 'cardiffnlp/twitter-roberta-base-sentiment\\tokenizer.json')

In [71]:
def bert_preprocess(tweet):
    new_tweet = []
 
    for t in tweet.split(" "):
        t = '' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_tweet.append(t)
    print(len(new_tweet))
    return " ".join(new_tweet)

In [63]:
import emoji
import re

In [61]:
def strip_emoji(text):
    new_text = re.sub(emoji.get_emoji_regexp(), r"", text)
    return new_text

In [64]:
strip_emoji(tweets_problem[11315])

'dont forget people if you are having problems with your boss about the vaccine, join the WORKERS OF ENGLAND UNION this independent union is the ONLY union out there that is helping people, so give them a call, ENGLANDENGLANDENGLAND'

In [80]:
def bert_trial(df):
    i = 0
    list_of_dicts = []
    for t in df:
        print(i)
        empty = {}
        empty["tweet"] = t
        try:
            t = bert_preprocess(t)
            encoded_input = tokenizer(t, return_tensors='pt')
            output = model(**encoded_input)
        except:
            t = strip_emoji(t)
            encoded_input = tokenizer(t, return_tensors='pt')
            output = model(**encoded_input)
        finally:
            scores = output[0][0].detach().numpy()
            scores = softmax(scores)
            ranking = np.argsort(scores)
            ranking = ranking[::-1]
            empty["sentiment"] = labels[ranking[0]]
            i +=1 
            list_of_dicts += [empty]

        
    return list_of_dicts

In [12]:
vaccine_df.head()

Unnamed: 0,tweet
0,@NelsonAlmUK @HartmanDave @samstein Rare doesn...
1,@jkenney @CMOH_Alberta \nVaccine passports are...
2,@AlexSharrard @Kelsig3D @coopercooperco COVID ...
3,@AP TRADITIONAL VACCINE vs MRNA\n\nIf you want...
4,The FDA Produced First Batch of ‘Confidential’...


In [18]:
tweets_vaccine = []
for t in vaccine_df["tweet"]:
    tweets_vaccine += [t]
    #print(t)
    #if ("JOIN US SHORTLY AT 6.30pm." in t):
        #print(t)

In [23]:
problem_idx = 0
for i in range(0, len(tweets_vaccine)):
    if ("JOIN US SHORTLY AT 6.30pm." in tweets_vaccine[i]):
        print(tweets_vaccine[i])
        print(i)
        problem_idx = i

JOIN US SHORTLY AT 6.30pm. Parents and carers of 12-to-15-year-olds are invited to a webinar about the COVID-19 vaccination. Hosted by the NHS and Wandsworth Council, ask healthcare professionals questions about the vaccine. Join using the Teams link.
https://t.co/jFc0pVoolp https://t.co/XopAj1LPiN
65077


In [24]:
tweets_problem = tweets_vaccine[problem_idx-1:]

In [26]:
len(tweets_problem)

168992

In [1]:
text_over_500 = []
for t in vaccine_df["tweet"]:
    if len(t)>500:
        text_over_500 += [t]

NameError: name 'vaccine_df' is not defined

In [19]:
vaccine_over500 = pd.DataFrame(text_over_500, columns=["tweet"])

In [21]:
vaccine_over500

Unnamed: 0,tweet
0,@TWTThisIsNow @chimera414 @jimdtweet @Gordon_D...
1,@ChiefGreyCloud @FillmoreWhite @J_EmpyreanGoal...
2,@Krispi_Largo @annaeck73 @stopthepewpew @Badam...
3,@tenebra99 @doritmi @Kolyin @JimeeLiberty @Lia...
4,@AGoldsmithEsq @tenebra99 @doritmi @Kolyin @Ji...
...,...
1612,@Justice_Wins7 @A_Damned_smith @Wolfpak561 @wi...
1613,@sandcastle1975 @ejwwest @jimdtweet @4Clearsky...
1614,@jimdtweet @BigRakaDoc @Kenneth72712993 @sandc...
1615,@jimdtweet @BigRakaDoc @Kenneth72712993 @sandc...


In [27]:
problem_df = pd.DataFrame(tweets_problem, columns=["tweet"])

In [58]:
tweets_problem[11315]

'dont forget people if you are having problems with your boss about the vaccine, join the WORKERS OF ENGLAND UNION this independent union is the ONLY union out there that is helping people, so give them a call, 🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007fENGLAND🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007fENGLAND🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007fENGLAND🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f🏴\U000e0067\U000e0062\U000e0065\U000

In [81]:
vaccine_sentiment = bert_trial(tweets_problem[11315:11320])#problem_df.apply(bert_trial, axis=1)

0
39
1
30
2
43
3
43
4
38


In [None]:
def positive_neg_count(df):
    total_sentiment = {}
    total_sentiment['positive'] = 0
    total_sentiment['negative'] = 0
    total_sentiment['neutral'] = 0
    num_of_posts = 0
    
    for index, row in df.iterrows():
        num_of_posts += 1
        if row['sentiment'] == 'positive':
            total_sentiment['positive'] += 1

        elif row['sentiment'] == 'negative':
            total_sentiment['negative'] += 1

        else:
            total_sentiment['neutral'] += 1

    total_sentiment['pos_perc'] = (total_sentiment['positive']/num_of_posts) * 100
    total_sentiment['neg_perc'] = (total_sentiment['negative']/num_of_posts) * 100
    total_sentiment['neu_perc'] = (total_sentiment['neutral']/num_of_posts) * 100
    
    return total_sentiment

In [None]:
facemasks_sentiment = positive_neg_count(facemask_sentiment)
lockdown_sentiment = positive_neg_count(lockdown_sentiment)
pcr_sentiment = positive_neg_count(pcr_sentiment)
pfizer_sentiment = positive_neg_count(pfizer_sentiment)
quarantine_sentiment = positive_neg_count(quarantine_sentiment)
restrictions_sentiment = positive_neg_count(restrictions_sentiment)
vaccine_sentiment = positive_neg_count(vaccine_sentiment)

In [None]:
list_of_topics = ['Facemasks', 'Lockdown', 'PCR', 'Pfizer', 'Quarantine', 'Restrictions', 'Vaccine']

In [None]:
query_dpts = []
positive = []
negative = []
neutral = []

for sentiment in facemasks_sentiment, lockdown_sentiment, pcr_sentiment, \
    pfizer_sentiment, quarantine_sentiment, restrictions_sentiment, vaccine_sentiment:
    positive += [sentiment['pos_perc']]
    negative += [sentiment['neg_perc']]
    neutral += [sentiment['neu_perc']]

query_dpts += [positive, negative, neutral]

In [None]:
plot_topics(list_of_topics, query_dpts)