## Imports :

In [1]:
# modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(15,8)})
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(15,8)})
from datetime import datetime, timedelta
from causalimpact import CausalImpact
import statsmodels.formula.api as smf

# helpers
from helpers.wiki_gtrend_visualization import load_wikipedia_df
from helpers.gtrend_visualization import load_gtrend_df, load_gtrend_hourly_df
from helpers.load_topic_hourly import request_gtrends

# Part 0: Data Pre-processing

In [2]:
tweets_df = pd.read_csv('data/tweets.csv', parse_dates=['date'], date_format='%Y-%m-%d %H:%M:%S')

This dataset contains all his tweets and retweets from 2009 to January, 8 2021 (one day before he was banned from Twitter) :

In [3]:
tweets_df.loc[1, 'date']

Timestamp('2020-03-03 01:34:50')

Let's focus on the Covid period:

In [4]:
tweets_covid_df = tweets_df[tweets_df['date'] >= '2019-12-01'].copy()
tweets_covid_df.sort_values(by='date')
tweets_covid_df.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f
5,1217962723234983937,RT @WhiteHouse: President @realDonaldTrump ann...,t,f,Twitter for iPhone,0,25048,2020-01-17 00:11:56,f


# Part 1: Granger Causality

### A) Extract Relevant Tweets

Let's see the number of tweets where Hydroxychloroquine is mentioned.

In [5]:
# Pinpoint all tweets containing the key words
fake_news = ['Hydroxychloroquine', 'hydroxychloroquine']
analyze = tweets_covid_df['text'].apply(lambda x: True if any(word in x for word in fake_news) else False)

# Create a column with a boolean indicating whether tweet contains a key word
new_col = analyze.reindex(tweets_covid_df.index, fill_value=False)
tweets_covid_df['Hydroxychloroquine']=new_col

print('Number of tweets talking about Hydroxychloroquine is: ', tweets_covid_df['Hydroxychloroquine'].sum())

Number of tweets talking about Hydroxychloroquine is:  22


In [13]:
# Get the datetimes where trump spoke about Hydroxychloroquine
hydro_tweets_times = tweets_covid_df[tweets_covid_df['Hydroxychloroquine'] == True]['date'].copy()
hydro_tweets_times = hydro_tweets_times.sort_values()
hydro_tweets_times = hydro_tweets_times.reset_index(drop=True)
hydro_tweets_times = hydro_tweets_times.dt.normalize() # set times to 00:00:00
hydro_tweets_times.head()

0   2020-03-21
1   2020-03-24
2   2020-04-04
3   2020-04-05
4   2020-04-10
Name: date, dtype: datetime64[ns]

### B) Plot Wiki Time Series

In [None]:
pagename = 'Hydroxychloroquine'
folder = 'data'
filename = 'hydroxychloroquine'

_, daily_wikipedia_hydro_df = load_wikipedia_df(folder,filename)

daily_wikipedia_hydro_df = daily_wikipedia_hydro_df.drop('Week', axis=1)

fig, axis = plt.subplots(figsize=(10,5))
fig.tight_layout(pad=3.0)

fig.suptitle('Daily searches related to {pagename} - Wikipedia'.format(pagename=pagename), size=20)
fig.subplots_adjust(top=0.85)

axis.set_title('Wikipedia', size=15)
axis.set_ylabel('Relative search interest')
axis.plot(daily_wikipedia_hydro_df['Views'])
[axis.axvline(x=critical_date, color='red', linestyle='--') for critical_date in hydro_tweets_times]

axis.set_xlabel('Date')

plt.show()

### C) Granger

In [None]:
# Put the tweets in a format compatible with Granger causality analysis (i.e. time series)
tweets_hydro_timeseries_df = pd.DataFrame(index=daily_wikipedia_hydro_df['Views'].index, columns=['Date'])

for date in hydro_tweets_times:
    tweets_hydro_timeseries_df[date] = (tweets_hydro_timeseries_df.index == date).astype(int)

tweets_hydro_timeseries_df['Date'].fillna(0, inplace=True)
tweets_hydro_timeseries_df['Tweet'] = tweets_hydro_timeseries_df.iloc[:, 1:].sum(axis=1)

# Combine wiki and tweets
wiki_tweets_hydro_df = daily_wikipedia_hydro_df.copy()
wiki_tweets_hydro_df['Tweets'] = tweets_hydro_timeseries_df['Tweet']
wiki_tweets_hydro_df['Tweets'].fillna(0, inplace=True)
display(wiki_tweets_hydro_df.sample(5))

In [None]:
# Granger test between GTrends views and tweets
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests

print("DO VIEWS CAUSE TWEETS?")
grangercausalitytests(wiki_tweets_hydro_df, 2)

print('\n --------------------------------')
print("\n DO TWEETS CAUSE VIEWS?")
tweets_cause_views_gtrends = grangercausalitytests(wiki_tweets_hydro_df[['Tweets', 'Views']], 2)

# Part 2: Causal Impact

### A) Plot GTrends Time Series

In [None]:
pagename = 'Hydroxychloroquine'
folder = 'data'
filename = 'hydroxychloroquine'

hydro_gtrends_df = load_gtrend_df(folder, filename)

fig, axis = plt.subplots(figsize=(10,5))
fig.tight_layout(pad=3.0)

fig.suptitle('Weekly searches related to {pagename} - Google Trends'.format(pagename=pagename), size=20)
fig.subplots_adjust(top=0.85)

axis.set_title('Google Trends', size=15)
axis.set_ylabel('Relative search interest')
axis.plot(hydro_gtrends_df['Views'])
[axis.axvline(x=critical_date, color='red', linestyle='--') for critical_date in hydro_tweets_times]

axis.set_xlabel('Date')

### B) Fetch GTrends Time Series at Hourly Granularity

In [15]:
# QUESTION (SABRI): SHOULD WE KEEP THIS IN THE NOTEBOOK?

# Select tweet number
tweet_nb = 0
alias = f"tweet_{tweet_nb}"

# Request data from GTrends API (both hydro and Google) and save it
tweet_time = hydro_tweets_times[tweet_nb].replace(minute=0, second=0)
print(f"Tweet time: {tweet_time}")
start_time = tweet_time - timedelta(hours=48)
end_time = tweet_time + timedelta(hours=48)

print(f"start time: {start_time}")
print(f"end time: {end_time}")

# Hydro data 
folder = 'data'
filename = 'hydroxychloroquine'
pagename = 'Hydroxychloroquine'
request_gtrends(folder, filename, alias, pagename, start_time, end_time)

# Control data
folder = 'data'
filename = 'climate'
pagename = 'Climate'
request_gtrends(folder, filename, alias, pagename, start_time, end_time)

folder = 'data'
filename = 'coffee'
pagename = 'Coffee'
request_gtrends(folder, filename, alias, pagename, start_time, end_time)

folder = 'data'
filename = 'news'
pagename = 'News'
request_gtrends(folder, filename, alias, pagename, start_time, end_time)

folder = 'data'
filename = 'shop'
pagename = 'Shop'
request_gtrends(folder, filename, alias, pagename, start_time, end_time)

folder = 'data'
filename = 'time'
pagename = 'Time'
request_gtrends(folder, filename, alias, pagename, start_time, end_time)

Tweet time: 2020-03-21 00:00:00
start time: 2020-03-19 00:00:00
end time: 2020-03-23 00:00:00


FileNotFoundError: [Errno 2] No such file or directory: '../data/hydroxychloroquine/hydroxychloroquine_googletrends_tweet_0.json'

In [None]:
# Load hydro json
folder = 'data'
filename = 'hydroxychloroquine'
gtrends_hydro_df = load_gtrend_hourly_df(folder, filename, alias)

# Load control json
folder = 'data'
filename = 'climate'
gtrends_climate_df = load_gtrend_hourly_df(folder, filename, alias)

folder = 'data'
filename = 'coffee'
gtrends_coffee_df = load_gtrend_hourly_df(folder, filename, alias)

folder = 'data'
filename = 'news'
gtrends_news_df = load_gtrend_hourly_df(folder, filename, alias)
folder = 'data'
filename = 'shop'
gtrends_shop_df = load_gtrend_hourly_df(folder, filename, alias)

folder = 'data'
filename = 'time'
gtrends_time_df = load_gtrend_hourly_df(folder, filename, alias)

# Combine the dataframes
gtrends_df = pd.concat([gtrends_hydro_df, gtrends_climate_df, gtrends_coffee_df, 
                        gtrends_news_df, gtrends_shop_df, gtrends_time_df],
                         axis=1, keys=['hydro', 'climate', 'coffee', 'news',
                                        'shop', 'time'])
gtrends_df.columns = gtrends_df.columns.droplevel(1)

gtrends_df.head()

### C) Causal Impact

In [None]:
# choose pre and post-periods
pre_period = [gtrends_df.index[0], tweet_time]
post_period = [tweet_time+timedelta(hours=1), gtrends_df.index[-1]]
print(pre_period)
print(post_period)

In [None]:
# Verify assumptions on pre-period: linear regression
mod = smf.ols(formula='hydro ~ coffee + climate + news + shop + time', data=gtrends_df[:pre_period[1]])
res = mod.fit()
res.summary()

In [None]:
# conduct causal
impact = CausalImpact(data = gtrends_df, pre_period=pre_period, post_period=post_period, prior_level_sd=None, model_args={'dynamic_regression': True})
impact.plot()

### D) Alternative Hypothesis

In [None]:
# SABRI QUESTION: KEEP?
# Request data from GTrends API (both hydro and Google) and save it
treatment_time = pd.Timestamp(2020,3,16,18,0,0)
print(f"Tweet time: {treatment_time}")
start_time = pd.Timestamp(2020,3,15,12,0,0)
end_time = pd.Timestamp(2020,3,19,12,0,0)

print(f"start time: {start_time}")
print(f"end time: {end_time}")

alias = "publi"

# Hydro data 
# folder = 'data'
# filename = 'hydroxychloroquine'
# pagename = 'Hydroxychloroquine'
# request_gtrends(folder, filename, alias, pagename, start_time, end_time)

# # Control data
# folder = 'data'
# filename = 'climate'
# pagename = 'Climate'
# request_gtrends(folder, filename, alias, pagename, start_time, end_time)

# folder = 'data'
# filename = 'coffee'
# pagename = 'Coffee'
# request_gtrends(folder, filename, alias, pagename, start_time, end_time)

# folder = 'data'
# filename = 'news'
# pagename = 'News'
# request_gtrends(folder, filename, alias, pagename, start_time, end_time)

# folder = 'data'
# filename = 'shop'
# pagename = 'Shop'
# request_gtrends(folder, filename, alias, pagename, start_time, end_time)

# folder = 'data'
# filename = 'time'
# pagename = 'Time'
# request_gtrends(folder, filename, alias, pagename, start_time, end_time)

In [None]:
# Load hydro json
folder = 'data'
filename = 'hydroxychloroquine'
gtrends_hydro_df = load_gtrend_hourly_df(folder, filename, alias)

# Load control json
folder = 'data'
filename = 'climate'
gtrends_climate_df = load_gtrend_hourly_df(folder, filename, alias)

folder = 'data'
filename = 'coffee'
gtrends_coffee_df = load_gtrend_hourly_df(folder, filename, alias)

folder = 'data'
filename = 'news'
gtrends_news_df = load_gtrend_hourly_df(folder, filename, alias)

folder = 'data'
filename = 'shop'
gtrends_shop_df = load_gtrend_hourly_df(folder, filename, alias)

folder = 'data'
filename = 'time'
gtrends_time_df = load_gtrend_hourly_df(folder, filename, alias)

display(gtrends_climate_df)
display(gtrends_coffee_df)

# Combine the dataframes
gtrends_df = pd.concat([gtrends_hydro_df, gtrends_climate_df, gtrends_coffee_df, 
                        gtrends_news_df, gtrends_shop_df, gtrends_time_df],
                         axis=1, keys=['hydro', 'climate', 'coffee', 'news',
                                        'shop', 'time'])
gtrends_df.columns = gtrends_df.columns.droplevel(1)

gtrends_df.head()

In [None]:
# choose pre and post-periods
pre_period = [gtrends_df.index[0], treatment_time]
post_period = [treatment_time+timedelta(hours=1), gtrends_df.index[-1]]
print(pre_period)
print(post_period)

In [None]:
# Verify assumptions on pre-period: linear regression
mod = smf.ols(formula='hydro ~ coffee + climate + news + shop + time', data=gtrends_df[:pre_period[1]])
res = mod.fit()
res.summary()

In [None]:
# conduct causal
impact = CausalImpact(data = gtrends_df, pre_period=pre_period, post_period=post_period, prior_level_sd=None, model_args={'dynamic_regression': True})
impact.plot()