In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(15,8)})
from datetime import datetime, timedelta
import re
import json
import spacy
import string

from scipy.stats import ttest_ind
import statsmodels.formula.api as smf

# Local Modules
import matplotlib as mpl
import matplotlib.font_manager as font_manager
from matplotlib.lines import Line2D


# Granger test between GTrends views and tweets
import statsmodels.api as sm

## Data parsing

In [2]:
# Build a custom date parser: keep only the part of the timestamp before the
# first whitespace and parse it as YYYY-MM-DD.
custom_date = lambda x: datetime.strptime(x.split()[0], '%Y-%m-%d')
# `read_csv(date_parser=...)` is deprecated since pandas 2.0, so the column is
# read as plain text and converted explicitly with the same parser.
df = pd.read_csv('extra_material/tweets.csv', dtype={'date': str})
df['date'] = pd.to_datetime(df['date'].map(custom_date))

FileNotFoundError: [Errno 2] No such file or directory: 'extra_material/tweets.csv'

In [None]:
# Keep only the tweets dated 2019-12-01 or later.
covid_era_mask = df['date'] >= '2019-12-01'
df_covid = df[covid_era_mask]

In [None]:
# Daily calendar running from the earliest to the latest date in df_covid.
first_day, last_day = min(df_covid['date']), max(df_covid['date'])
time_span = pd.date_range(start=first_day, end=last_day, freq='D')

# Right-join the tweets onto the calendar, then drop every row containing a
# missing value (which includes calendar days that matched no tweet).
calendar = pd.DataFrame(time_span, columns=['date'])
data = df_covid.copy().merge(calendar, on='date', how='right')
data.dropna(inplace=True)

In [None]:
# Load the 'en_core_web_sm' spaCy pipeline and grab spaCy's English stop-word set.
nlp = spacy.load('en_core_web_sm')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
stopword_count = len(spacy_stopwords)
print('Number of stop words: %d' % stopword_count)

In [None]:
# Concatenate every tweet into a single space-separated string.
tweet_texts = data['text']
result_text = ' '.join(tweet_texts)

def remove_stopwords(chunk):
    """Return the text of the non-stop-word tokens of ``chunk``, joined by single spaces.

    ``chunk`` is iterated token by token; each token must expose a ``text``
    string and an ``is_stop`` flag.
    """
    kept = []
    for token in chunk:
        if token.is_stop:
            continue
        kept.append(token.text)
    return ' '.join(kept)

# The text is too big to process at once, so it is split into fixed-size chunks.
# NOTE: chunks are cut by character count, so a word that straddles a chunk
# boundary is split in two.
chunk_size = 100000
chunks = [result_text[i:i+chunk_size] for i in range(0, len(result_text), chunk_size)]
processed_chunks = [nlp(chunk) for chunk in chunks]
processed_chunks_no_stopwords = [remove_stopwords(chunk) for chunk in processed_chunks]

# Manually clean the text from links, punctuation and Twitter artefacts.
clean = ' '.join(processed_chunks_no_stopwords)
clean = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', clean)  # remove links
clean = clean.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
# Remove the artefacts only when they stand alone as whole words: a plain
# str.replace would also delete the letters inside other words
# (e.g. "campaign" -> "caign", "SMART" -> "SMA").
clean = re.sub(r'\b(?:amp|RT|realDonaldTrump)\b', '', clean)

## Sentiment analysis 

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the 'vader_lexicon' resource before building the analyzer.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()


In [None]:
# Full VADER score dictionary for every tweet.
data['scores'] = data['text'].apply(sid.polarity_scores)

# The 'compound' entry is the sentiment score: a combination of the positive,
# negative and neutral scores (not their mean).
data['compound'] = [score_dict['compound'] for score_dict in data['scores']]

# Label each tweet from the sign of its compound score ('' if none applies).
compound = data['compound']
data['sentiment_type'] = np.select(
    [compound > 0, compound == 0, compound < 0],
    ['POSITIVE', 'NEUTRAL', 'NEGATIVE'],
    default='',
)

## Covid related

In [None]:
# Keywords marking a tweet as Covid-related. Matching is a case-sensitive
# substring test, hence the explicit case and spelling variants. The original
# list repeated the quarantine variants several times; each keyword now
# appears once, which leaves the matches unchanged.
list_of_words = [
    'Covid19', 'virus', 'vaccine', 'Flu', 'Covid', 'lockdown', 'pandemic',
    'coronavirus', 'Coronavirus', 'Corona', 'corona', 'COVID19', 'COVID', 'covid',
    'quarantine', 'Quarantine', 'quarentine', 'Quarentine',
    'quarantined', 'Quarantined', 'quarentined', 'Quarentined',
    'quarantining', 'Quarantining', 'quarentining', 'Quarentining',
    'quarantines', 'Quarantines', 'quarentines', 'Quarentines',
]
# True when the tweet text contains at least one keyword.
data['covid_related'] = data['text'].apply(lambda x: any(word in x for word in list_of_words))

data['covid_related'].value_counts()

In [None]:
# Split the tweets into the Covid-related subset and its complement.
covid_flag = data['covid_related']
data_covid_rel = data.loc[covid_flag == 1]
data_not_covid_rel = data.loc[covid_flag == 0]

t-test comparing the mean number of retweets of Covid-related and non-Covid-related tweets:

In [None]:
# Split tweets on the Covid flag.
covid_flag = data['covid_related']
data_covid_rel = data.loc[covid_flag == 1]
data_not_covid_rel = data.loc[covid_flag == 0]

# Wide frame with one retweet column per group (used by the point plots below).
comparison_covid = pd.DataFrame({
    'Covid related': data_covid_rel['retweets'],
    'Non covid related': data_not_covid_rel['retweets'],
})

# Welch's t-test (unequal variances) on the retweet counts of the two groups.
t_stat, p_value_covid = ttest_ind(
    data_covid_rel['retweets'],
    data_not_covid_rel['retweets'],
    equal_var=False,
)
print('The p-value of the t-test is', p_value_covid.round(5))

## LeakTheory related

In [None]:
fake_news_leaktheory = ['China virus', 'china virus', 'chinavirus', 'Chinavirus', 'China Virus']


def _mentions_leaktheory(tweet):
    """Return True when the tweet contains at least one of the keywords above."""
    return any(term in tweet for term in fake_news_leaktheory)


analyze_leaktheory = data['text'].apply(_mentions_leaktheory)

# Store the flags as a new column, aligned on data's index.
new_col = analyze_leaktheory.reindex(data.index, fill_value=False)
data['Leaktheory'] = new_col
leaktheory_total = data['Leaktheory'].sum()
print('Number of tweets talking about COVID-19 lab leak theory is: ', leaktheory_total)


In [None]:
# Tweets with and without the Leaktheory flag.
leak_flag = data['Leaktheory']
data_Leaktheory_rel = data.loc[leak_flag == 1]
data_not_Leaktheory_rel = data.loc[leak_flag == 0]

# One retweet column per group (used by the point plots below).
comparison_Leaktheory = pd.DataFrame({
    'Leaktheory related': data_Leaktheory_rel['retweets'],
    'Non Leaktheory related': data_not_Leaktheory_rel['retweets'],
})

# Welch's t-test (unequal variances) between the two groups' retweet counts.
t_stat, p_value_Leaktheory = ttest_ind(
    data_Leaktheory_rel['retweets'],
    data_not_Leaktheory_rel['retweets'],
    equal_var=False,
)
print('The p-value is', p_value_Leaktheory.round(5))

## Swine flu related

In [None]:
fake_news_swineflu = ['Swine', 'swine', 'H1N1', 'h1n1']

# Flag every tweet whose text contains at least one swine-flu keyword.
analyze_swineflu = data['text'].apply(
    lambda tweet: any(keyword in tweet for keyword in fake_news_swineflu)
)

# Store the flags as a new column, aligned on data's index.
new_col = analyze_swineflu.reindex(data.index, fill_value=False)
data['Swineflu'] = new_col

In [None]:
# Tweets with and without the Swineflu flag.
swineflu_flag = data['Swineflu']
data_Swineflu_rel = data.loc[swineflu_flag == 1]
data_not_Swineflu_rel = data.loc[swineflu_flag == 0]

# One retweet column per group (used by the point plots below).
comparison_Swineflu = pd.DataFrame({
    'Swineflu related': data_Swineflu_rel['retweets'],
    'Non Swineflu related': data_not_Swineflu_rel['retweets'],
})

# Welch's t-test (unequal variances) between the two groups' retweet counts.
t_stat, p_value_Swineflu = ttest_ind(
    data_Swineflu_rel['retweets'],
    data_not_Swineflu_rel['retweets'],
    equal_var=False,
)
print('The p-value of the t-test is', p_value_Swineflu.round(5))

## Hydroxychloroquine related

In [None]:
fake_news = ['Hydroxychloroquine', 'hydroxychloroquine']


def _mentions_hydroxychloroquine(tweet):
    """Return True when the tweet contains either spelling listed in fake_news."""
    for keyword in fake_news:
        if keyword in tweet:
            return True
    return False


analyze = data['text'].apply(_mentions_hydroxychloroquine)

# Store the flags as a new column, aligned on data's index.
new_col = analyze.reindex(data.index, fill_value=False)
data['Hydroxychloroquine'] = new_col

hydroxychloroquine_total = data['Hydroxychloroquine'].sum()
print('Number of tweets talking about Hydroxychloroquine is: ', hydroxychloroquine_total)

In [None]:
# Tweets with and without the Hydroxychloroquine flag.
hcq_flag = data['Hydroxychloroquine']
data_Hydroxychloroquine_rel = data.loc[hcq_flag == 1]
data_not_Hydroxychloroquine_rel = data.loc[hcq_flag == 0]

# One retweet column per group (used by the point plots below).
comparison_Hydroxychloroquine = pd.DataFrame({
    'Hydroxychloroquine related': data_Hydroxychloroquine_rel['retweets'],
    'Non Hydroxychloroquine related': data_not_Hydroxychloroquine_rel['retweets'],
})

# Welch's t-test (unequal variances) between the two groups' retweet counts.
t_stat, p_value_Hydroxychloroquine = ttest_ind(
    data_Hydroxychloroquine_rel['retweets'],
    data_not_Hydroxychloroquine_rel['retweets'],
    equal_var=False,
)
print('The p-value of the t-test is', p_value_Hydroxychloroquine.round(5))

## Biden related

In [None]:
biden = ['Joe Biden','biden','joe','Joe','Biden','sleepy']

# Flag every tweet whose text contains at least one keyword.
# NOTE(review): this is a substring test, so short keywords such as 'joe' or
# 'Joe' also match inside longer words.
analyze_biden = data['text'].apply(
    lambda tweet: any(keyword in tweet for keyword in biden)
)

# Store the flags as a new column, aligned on data's index.
new_col = analyze_biden.reindex(data.index, fill_value=False)
data['biden'] = new_col
biden_total = data['biden'].sum()
print('Number of tweets talking about biden is: ', biden_total)


In [None]:
# Tweets with and without the biden flag.
biden_flag = data['biden']
data_biden_rel = data.loc[biden_flag == 1]
data_not_biden_rel = data.loc[biden_flag == 0]

# One retweet column per group (used by the point plots below).
comparison_biden = pd.DataFrame({
    'biden related': data_biden_rel['retweets'],
    'Non biden related': data_not_biden_rel['retweets'],
})

# Welch's t-test (unequal variances) between the two groups' retweet counts.
t_stat, p_value_biden = ttest_ind(
    data_biden_rel['retweets'],
    data_not_biden_rel['retweets'],
    equal_var=False,
)
print('The p-value of the t-test is', p_value_biden.round(5))

## Democrats related

In [None]:
democrats = ['Democrats','democrats','Dems','dems']


def _mentions_democrats(tweet):
    """Return True when the tweet contains at least one keyword from `democrats`."""
    return any(keyword in tweet for keyword in democrats)


analyze_democrats = data['text'].apply(_mentions_democrats)

# Store the flags as a new column, aligned on data's index.
new_col = analyze_democrats.reindex(data.index, fill_value=False)
data['democrats'] = new_col
democrats_total = data['democrats'].sum()
print('Number of tweets talking about democrats is: ', democrats_total)

In [None]:
# Tweets with and without the democrats flag.
democrats_flag = data['democrats']
data_democrats_rel = data.loc[democrats_flag == 1]
data_not_democrats_rel = data.loc[democrats_flag == 0]

# One retweet column per group (used by the point plots below).
comparison_democrats = pd.DataFrame({
    'democrats related': data_democrats_rel['retweets'],
    'Non democrats related': data_not_democrats_rel['retweets'],
})

# Welch's t-test (unequal variances) between the two groups' retweet counts.
t_stat, p_value_democrats = ttest_ind(
    data_democrats_rel['retweets'],
    data_not_democrats_rel['retweets'],
    equal_var=False,
)
print('The p-value of the t-test is', p_value_democrats.round(5))

## Vaccines related

In [None]:
vaccines = ['vaccines','Vaccines','vax','Vax','vaccin']
# Flag every tweet whose text contains at least one vaccine keyword.
# (The original iterated over the `democrats` keyword list here, so the
# 'vaccines' column was a copy of the 'democrats' flag.)
analyze_vaccines = data['text'].apply(lambda x: any(word in x for word in vaccines))

# create a new column with the result of the analysis
new_col = analyze_vaccines.reindex(data.index, fill_value=False)
data['vaccines'] = new_col
print('Number of tweets talking about vaccines is: ', data['vaccines'].sum())

In [None]:
# Tweets with and without the vaccines flag.
vaccines_flag = data['vaccines']
data_vaccines_rel = data.loc[vaccines_flag == 1]
data_not_vaccines_rel = data.loc[vaccines_flag == 0]

# One retweet column per group (used by the point plots below).
comparison_vaccines = pd.DataFrame({
    'vaccines related': data_vaccines_rel['retweets'],
    'Non vaccines related': data_not_vaccines_rel['retweets'],
})

# Welch's t-test (unequal variances) between the two groups' retweet counts.
t_stat, p_value_vaccines = ttest_ind(
    data_vaccines_rel['retweets'],
    data_not_vaccines_rel['retweets'],
    equal_var=False,
)
print('The p-value of the t-test is', p_value_vaccines.round(5))

## Aggregate results into more general categories:

In [None]:
# A tweet is "fake news" when any of the three topic flags is set.
hydroxychloroquine_or_swineflu = data['Hydroxychloroquine'] | data['Swineflu']
data['is_fake_news'] = hydroxychloroquine_or_swineflu | data['Leaktheory']

# A tweet is Democrats-related when it carries the biden or the democrats flag.
biden_flag = data['biden']
democrats_flag = data['democrats']
data['is_democrats_related'] = biden_flag | democrats_flag

In [None]:
# Number of tweets carrying the combined biden/democrats flag.
data['is_democrats_related'].sum()

In [None]:
# Tweets with and without the aggregated fake-news flag.
fake_news_flag = data['is_fake_news']
data_fake_news_rel = data.loc[fake_news_flag == 1]
data_not_fake_news_rel = data.loc[fake_news_flag == 0]

# One retweet column per group (used by the point plots below).
comparison_fake_news = pd.DataFrame({
    'fake_news related': data_fake_news_rel['retweets'],
    'Non fake_news related': data_not_fake_news_rel['retweets'],
})

# Welch's t-test (unequal variances) between the two groups' retweet counts.
t_stat, p_value_fake_news = ttest_ind(
    data_fake_news_rel['retweets'],
    data_not_fake_news_rel['retweets'],
    equal_var=False,
)
print('The p-value of the t-test is', p_value_fake_news.round(5))

## Plots of the number of retweets by topic:

In [None]:
# 4x2 grid of point plots: mean retweets (with a 95% CI) of the "related" vs
# "non related" group for each topic.
fig, axs = plt.subplots(4, 2, figsize=(15, 20))
# Figure-level title. plt.title() would only title the last subplot, and that
# title is then replaced by the per-panel title set in the loop below.
fig.suptitle('Number of retweets per categories', fontsize=25)

# Panels are filled row by row, in this order.
comparisons = [
    comparison_covid, comparison_Leaktheory,
    comparison_Swineflu, comparison_Hydroxychloroquine,
    comparison_biden, comparison_democrats,
    comparison_vaccines, comparison_fake_news,
]
for panel, comparison in zip(axs.ravel(), comparisons):
    sns.pointplot(data=comparison, ax=panel, estimator='mean', errorbar=('ci', 95), color='r')
    panel.set(title='Comparison of retweets (errorbar = CI 95%)', ylabel='Number of retweets')

# Keep the per-panel handles available under their original names.
ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7 = axs.ravel()

plt.savefig('retweets_per_categories')

## Regression to disentangle categories into predicting the number of retweets:

In [None]:
# OLS of retweets on the Covid flag, the fake-news flag and the sentiment
# class, each entered as a categorical term.
retweets_formula = 'retweets ~ C(covid_related)+C(is_fake_news)+C(sentiment_type)'
mod_retweets = smf.ols(formula=retweets_formula, data=data)

In [None]:
# Fit the model and print its summary table.
res_retweets = mod_retweets.fit()
regression_summary = res_retweets.summary()
print(regression_summary)

## Compare the number of retweets per category:

In [None]:
# One column of retweet counts per topic; rows are aligned on the tweet index,
# so a tweet contributes only to the topics it was flagged for (NaN elsewhere).
compare_retweets = pd.DataFrame({
    'Democrats': data_democrats_rel.retweets,
    'Biden': data_biden_rel.retweets,
    'Covid': data_covid_rel.retweets,
    'Swine flu': data_Swineflu_rel.retweets,
    'Leak Theory': data_Leaktheory_rel.retweets,
    'Hydroxychloroquine': data_Hydroxychloroquine_rel.retweets,
    'Vaccines': data_vaccines_rel.retweets,
})
plt.figure(figsize=(15,5))
# Pass the palette to the plot: a bare sns.color_palette("colorblind", 8) call
# only returns the colours and never applies them.
ax = sns.barplot(data=compare_retweets, palette="colorblind")
ax.set(xlabel='Categories', ylabel='Number of retweets')
ax.set_title("Number of average retweets for some identified categories",fontsize=25)
plt.savefig('barplot_per_categories')

# Studying sentiment

## Sentiment distributions

In [None]:
# Pie chart of the share of positive / negative / neutral tweets.
fig, axes = plt.subplots(nrows=1, ncols=1, sharex=True, sharey=False, figsize=(4, 4))

var = 'sentiment_type'
red_color = '#8C1515'  # Dark Red or Maroon
green_color = '#556B2F'  # Dark Green or Olive Green
yellow_color = '#B8860B'  # Dark Yellow or Gold
colors = [green_color,red_color,yellow_color]

print(data[var].value_counts())
# value_counts() orders the classes by frequency, so pin the slice order
# explicitly to keep each count paired with its label and colour below.
sentiment_order = ['POSITIVE', 'NEGATIVE', 'NEUTRAL']
sentiment_counts = data[var].value_counts().reindex(sentiment_order, fill_value=0)
plt.pie(sentiment_counts, labels=['Positive', 'Negative','Neutral'], autopct='%1.1f%%', startangle=90, colors=colors)
plt.title('Distribution of sentiments',fontsize=25)

plt.tight_layout()
plt.savefig('sentiment_pie_chart', bbox_inches='tight')

**DT is actually a positive guy!**

Show a few tweets:

In [None]:
# Text of the tweets labelled POSITIVE.
data.loc[data['sentiment_type'] == 'POSITIVE', 'text']

In [None]:
# Text of the tweets labelled NEUTRAL.
data.loc[data['sentiment_type'] == 'NEUTRAL', 'text']

In [None]:
# Text of the tweets labelled NEGATIVE.
data.loc[data['sentiment_type'] == 'NEGATIVE', 'text']

## Retweets per sentiment :

In [None]:
# Point plot of retweets for each sentiment class.
plt.figure(figsize=(15, 5))
sentiment_ax = sns.pointplot(data=data, x='sentiment_type', y='retweets')
sentiment_ax.set_title('Number of retweets per sentiment category', fontsize=25)
sentiment_ax.set_xlabel('Sentiment type', fontsize=20)
sentiment_ax.set_ylabel('Retweets', fontsize=20)
plt.savefig('retweet_per_sentiment')

## Study relation between categories and sentiments:

In [None]:
def _sentiment_counts_for(flag_column):
    """Cross-tabulate sentiment_type over the tweets whose `flag_column` is True."""
    flagged = data.loc[data[flag_column] == True, flag_column]
    return pd.crosstab(flagged, data['sentiment_type'])


pivot_dem = _sentiment_counts_for('democrats')
pivot_biden = _sentiment_counts_for('biden')
pivot_covid = _sentiment_counts_for('covid_related')
pivot_fake_news = _sentiment_counts_for('is_fake_news')
pivot_vaccines = _sentiment_counts_for('vaccines')

# Stack the per-topic tables, then convert each row to percentages of its total.
topic_pivots = [pivot_dem, pivot_biden, pivot_covid, pivot_fake_news, pivot_vaccines]
pivot_df = pd.concat(topic_pivots)
row_totals = pivot_df.sum(axis=1)
proportion_df = 100 * pivot_df.div(row_totals, axis=0)

In [None]:
# Stacked bars: share of each sentiment class within every topic.
# DataFrame.plot opens its own figure (sized by `figsize`), so no separate
# plt.figure() call is made; one would only leave an empty extra figure behind.
ax = proportion_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(10, 6))
plt.title("Stacked Bar plot of sentiment's proportion for each topic",fontsize=25)
# The labels follow the order in which the topic rows were concatenated.
ax.set_xticklabels(['Democrats related','Biden related','Covid related','Fake news related','Vaccines related'])
plt.xlabel('Topics',fontsize=20)
plt.ylabel('Percentage',fontsize=20)
plt.xticks(rotation=45)
plt.legend(title='Sentiment Type', bbox_to_anchor=(1.05, 1), loc='upper left',fontsize=15)
# Adjust the layout before saving so the saved file benefits from it.
plt.tight_layout()
plt.savefig('stacked_plot_sentiment_categories', bbox_inches='tight')

**Caution: some 'positive' are actually sarcastic:**

In [None]:
# Example tweet: the row labelled 1720 among the POSITIVE Biden-related tweets.
# NOTE(review): the label 1720 is hard-coded and depends on the loaded data.
positive_biden_mask = data_biden_rel['sentiment_type'] == 'POSITIVE'
data_biden_rel[positive_biden_mask]['text'][1720]

## Regression with one-hot encoded categories:

In [None]:
# Boolean indicator columns for the POSITIVE and NEGATIVE sentiment classes.
sentiment_labels = data['sentiment_type']
data['POSITIVE'] = sentiment_labels.eq('POSITIVE')
data['NEGATIVE'] = sentiment_labels.eq('NEGATIVE')

In [None]:
# OLS of retweets on the Covid flag, the fake-news flag and the two sentiment
# indicator columns, each entered as a categorical term.
onehot_formula = 'retweets ~ +C(covid_related)+C(is_fake_news)+C(NEGATIVE)+C(POSITIVE)'
mod_retweets = smf.ols(formula=onehot_formula, data=data)
res_retweets = mod_retweets.fit()
onehot_summary = res_retweets.summary()
print(onehot_summary)

Covid does not have a big impact but democrats does; negative sentiment has a significant positive impact.
How can we make a meaningful regression?