In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(15,8)})
from datetime import datetime
import re
import json
import spacy
import string
import wordcloud 
from wordcloud import WordCloud
from causalimpact import CausalImpact

In [2]:
# Set up the VADER sentiment analyzer used to score tweet text below.
import nltk
# Fetch the lexicon that SentimentIntensityAnalyzer needs (no-op when it is already present).
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Single shared analyzer instance; later cells call sid.polarity_scores(text).
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Etienne/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
# Load the tweet archive and keep tweets dated 2019-12-01 or later.
def custom_date(x):
    """Parse the calendar-date part of a timestamp string.

    Only the text before the first whitespace is used, so the returned
    datetime is always at midnight of that day.
    """
    return datetime.strptime(x.split()[0], '%Y-%m-%d')


# read_csv's `date_parser` argument is deprecated in pandas 2, so the column
# is read as text and converted explicitly with the same parser.
df = pd.read_csv('tweets.csv')
df['date'] = pd.to_datetime(df['date'].map(custom_date))

# .copy() detaches the filtered rows so later cells can add columns without
# triggering SettingWithCopyWarning.
df = df[df['date'] >= '2019-12-01'].copy()

# sort_values returns a new frame; assign it back so the ordering is kept.
df = df.sort_values(by='date')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'tweets.csv'

In [None]:
# Score every tweet with VADER; each entry of 'scores' is the dict returned by
# sid.polarity_scores for that tweet's text.
tweet_text = df['text']
df['scores'] = tweet_text.apply(sid.polarity_scores)
df.head()

In [None]:
# Pull the overall 'compound' score out of each VADER result dict.
df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])

# Label each tweet by the sign of its compound score. Rows matching none of
# the conditions (e.g. a missing score) keep the empty-string label.
df['sentiment_type'] = np.select(
    [df['compound'] > 0, df['compound'] == 0, df['compound'] < 0],
    ['POSITIVE', 'NEUTRAL', 'NEGATIVE'],
    default='',
)

# Calendar keys used by the daily / weekly / monthly aggregations.
# The vectorized .dt accessors replace the per-row lambdas.
df['month_date'] = df['date'].dt.month
df['year_date'] = df['date'].dt.year
df['daily'] = df['date'].dt.day
# ISO-8601 week number. Around New Year the ISO week can belong to a different
# ISO year than the calendar year stored in 'year_date'.
df['week'] = df['date'].dt.isocalendar().week

In [None]:
# Mean compound score per day, per ISO week and per month.
# Each result is a flat frame whose value column is named 'compound'.

# --- daily ---
day_keys = ['year_date', 'month_date', 'daily']
freq_d = df.groupby(day_keys)['compound'].mean().reset_index()
# Number of scored tweets per day.
num_d = df.groupby(day_keys)['compound'].count()

# --- weekly ---
# 'week' is an ISO week number, so it must be paired with the ISO year rather
# than the calendar year. Otherwise e.g. 2019-12-30 (ISO week 1 of 2020) is
# keyed as (2019, week 1) and is later turned into a date in December 2018 by
# the '%G%V-%u' parse. Grouping on month as well would split any week that
# straddles two months into two rows, so month is not a grouping key here.
# 'month_date' is kept only because later cells expect the column; it holds
# the month of the first tweet found in each week.
iso_year = df['date'].dt.isocalendar().year
freq_w = (
    df.assign(year_date=iso_year)
      .groupby(['year_date', 'week'])
      .agg(month_date=('month_date', 'first'), compound=('compound', 'mean'))
      .reset_index()
)
freq_w = freq_w[['year_date', 'month_date', 'week', 'compound']]

# --- monthly ---
freq_m = df.groupby(['year_date', 'month_date'])['compound'].mean().reset_index()

In [None]:
# Drop the first and last weekly rows (presumably because the boundary weeks
# are only partially covered by the data and give unreliable means).
# NOTE: not idempotent: re-running this cell trims another pair of rows.
# .copy() detaches the slice so later column assignments on freq_w do not
# raise SettingWithCopyWarning.
freq_w = freq_w.iloc[1:-1].copy()

In [None]:
# Give each aggregate a real datetime 'date' column and turn its key columns
# into strings.

# Monthly: first day of the month.
freq_m['date'] = pd.to_datetime(
    {'year': freq_m['year_date'], 'month': freq_m['month_date'], 'day': 1}
)
for key in ('year_date', 'month_date'):
    freq_m[key] = freq_m[key].astype(str)

# Daily: the calendar day itself.
freq_d['date'] = pd.to_datetime(
    {'year': freq_d['year_date'], 'month': freq_d['month_date'], 'day': freq_d['daily']}
)
for key in ('year_date', 'month_date', 'daily'):
    freq_d[key] = freq_d[key].astype(str)

# Weekly: '%G%V-%u' reads year_date as an ISO year and week as an ISO week;
# the trailing '-1' selects the Monday of that week.
# NOTE(review): this is only correct if year_date holds the ISO year of the week.
for key in ('year_date', 'month_date', 'week'):
    freq_w[key] = freq_w[key].astype(str)
iso_week_label = freq_w['year_date'] + freq_w['week'] + '-1'
freq_w['date'] = pd.to_datetime(iso_week_label, format='%G%V-%u')

In [None]:
# Overlay the monthly, daily and weekly mean compound scores on one axis.
fig, ax = plt.subplots(figsize=(10, 6))

series_to_draw = (
    (freq_m, 'Monthly'),
    (freq_d, 'Daily'),
    (freq_w, 'Weekly'),
)
for frame, label in series_to_draw:
    ax.plot(frame['date'], frame['compound'], label=label, linestyle='-')

ax.set_xlabel('Date')
ax.set_ylabel('Compound Value')
ax.set_title('Compound Values Over Time')
ax.legend()
# Tilt the date labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import os

# Read every CSV file in the 'data' directory and stack them into one frame.
file_path = 'data'

# os.listdir returns entries in arbitrary order and includes non-CSV files
# (hidden files, sub-directories, ...). Keep only CSVs and sort them so the
# row order of combined_df is reproducible.
files = sorted(
    file for file in os.listdir(file_path)
    if file.lower().endswith('.csv')
)
if not files:
    raise ValueError(f"no CSV files found in {file_path!r}")

# One DataFrame per file.
dataframes = [pd.read_csv(os.path.join(file_path, file)) for file in files]

# Concatenate into a single frame with a fresh 0..n-1 index.
combined_df = pd.concat(dataframes, ignore_index=True)
      


In [None]:
# Inspect which columns the concatenated frame exposes.
combined_df.columns

In [None]:
# Show the most frequent user_location values. The bare expression was not the
# last line of the cell, so its result was never displayed.
print(combined_df['user_location'].value_counts().head(10))

# Keep only rows whose location is exactly 'United States'. .copy() makes
# US_df an independent frame so that adding columns below (and in later cells)
# does not raise SettingWithCopyWarning.
US_df = combined_df[combined_df['user_location'] == 'United States'].copy()

# Parse the timestamps; unparseable values become NaT.
created = pd.to_datetime(US_df['created_at'], errors='coerce')

# Rebuild a midnight-stamped 'date' from the year / month / day components.
# Rows whose timestamp could not be parsed end up with NaT.
US_df['date'] = pd.to_datetime(
    {'year': created.dt.year, 'month': created.dt.month, 'day': created.dt.day},
    errors='coerce',
)

# The raw timestamp is no longer needed once 'date' exists.
US_df = US_df.drop(columns=['created_at'])


In [None]:
! pip install nltk

In [None]:
# Same VADER setup as the cell near the top of the notebook, repeated here;
# it rebinds `sid` to a fresh analyzer.
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [None]:
# Score each US tweet with VADER; 'scores' holds the dict returned per tweet.
US_df['scores'] = US_df['text'].apply(sid.polarity_scores)

In [None]:
# Keep only the overall 'compound' value from each VADER result dict.
US_df['compound'] = [tweet_scores['compound'] for tweet_scores in US_df['scores']]

In [None]:
# Discard tweets whose compound score is exactly 0, then average the rest per day.
# NOTE(review): the `df` aggregates (freq_d etc.) keep zero-score tweets, so the
# two series are not filtered the same way. Confirm this is intended.
has_polarity = US_df['compound'] != 0
US_df = US_df[has_polarity]
US_df_agg = US_df.groupby('date')['compound'].mean().reset_index()

In [None]:
# Restrict the daily means computed from `df` to the comparison window.
# Both bounds are inclusive; start_date / end_date are reused by later cells.
start_date, end_date = '2020-03-19', '2020-04-18'
inside_window = freq_d['date'].between(start_date, end_date)
freq_d_sub = freq_d[inside_window]


In [None]:
# Compare the two daily mean-compound series over the selected window.
fig, ax = plt.subplots()
for frame, label in ((US_df_agg, 'people'), (freq_d_sub, 'trump')):
    ax.plot(frame['date'], frame['compound'], label=label, linestyle='-')
ax.set_xlabel('Date')
ax.set_ylabel('Compound Value')
ax.set_title('Compound Values Over Time')
ax.legend()
plt.show()

In [None]:
# Keep only the days present in both series. Both frames have a 'compound'
# column, so the merge yields compound_x (from US_df_agg) and compound_y
# (from freq_d_sub).
merged_df = US_df_agg.merge(freq_d_sub, how='inner', on='date')

In [None]:
# Inspect the merged columns; the overlapping 'compound' columns appear as
# compound_x (left frame, US_df_agg) and compound_y (right frame, freq_d_sub).
merged_df.columns

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Seed NumPy's global RNG before fitting.
np.random.seed(2)

# Ordinary least squares of compound_y on compound_x over the days present in
# both series.
mod = smf.ols(formula='compound_y ~ compound_x', data=merged_df)
res = mod.fit()
print(res.summary())



In [None]:
# Flag tweets that mention COVID-related vocabulary.
# Matching is a case-sensitive substring test, so each keyword appears in the
# capitalisations that should match. The list holds each spelling once; the
# earlier version repeated the quarantine variants many times, which only
# added redundant checks.
list_of_words = [
    'Covid19', 'virus', 'vaccine', 'Flu', 'Covid', 'lockdown', 'pandemic',
    'coronavirus', 'Coronavirus', 'Corona', 'corona',
    'COVID19', 'COVID', 'covid',
    'quarantine', 'Quarantine', 'quarentine', 'Quarentine',
    'quarantined', 'Quarantined', 'quarentined', 'Quarentined',
    'quarantining', 'Quarantining', 'quarentining', 'Quarentining',
    'quarantines', 'Quarantines', 'quarentines', 'Quarentines',
]

# True when at least one keyword occurs anywhere in the tweet text.
df['covid_related'] = df['text'].apply(
    lambda text: any(word in text for word in list_of_words)
)

In [None]:
# Tweets flagged as COVID-related ('covid_related' is a boolean column).
is_covid = df['covid_related']
data_covid_rel = df[is_covid]

In [None]:
# Daily mean compound score, restricted to the COVID-related tweets.
# The result is a flat frame with the three date keys plus 'compound'.
daily_keys = ['year_date', 'month_date', 'daily']
freq_dc = data_covid_rel.groupby(daily_keys)['compound'].mean().reset_index()

In [None]:
# Same preparation as for the other daily aggregate: build a datetime 'date'
# column from the year / month / day keys, and store the keys as strings.
freq_dc['date'] = pd.to_datetime(
    {'year': freq_dc['year_date'], 'month': freq_dc['month_date'], 'day': freq_dc['daily']}
)
for key in ('year_date', 'month_date', 'daily'):
    freq_dc[key] = freq_dc[key].astype(str)

In [None]:
# COVID-related daily means inside the comparison window (bounds inclusive;
# start_date / end_date come from the earlier windowing cell).
freq_d_subc = freq_dc[freq_dc['date'].between(start_date, end_date)]

In [None]:
# Same comparison plot, this time using only the COVID-related tweets from `df`.
fig, ax = plt.subplots()
for frame, label in ((US_df_agg, 'people'), (freq_d_subc, 'trump')):
    ax.plot(frame['date'], frame['compound'], label=label, linestyle='-')
ax.set_xlabel('Date')
ax.set_ylabel('Compound Value')
ax.set_title('Compound Values Over Time')
ax.legend()
plt.show()

In [None]:
# Days present in both series; compound_x comes from US_df_agg and compound_y
# from the COVID-related daily means.
merged_dfc = US_df_agg.merge(freq_d_subc, how='inner', on='date')

In [None]:
# Inspect the merged columns (compound_x from US_df_agg, compound_y from freq_d_subc).
merged_dfc.columns

In [None]:
# Seed NumPy's global RNG before fitting.
np.random.seed(2)

# Same regression as before, fitted on the COVID-related merge.
modc = smf.ols(formula='compound_y ~ compound_x', data=merged_dfc)
res2 = modc.fit()
print(res2.summary())

In [None]:
# Load the COVID case time series and keep the US rows inside the comparison
# window (start_date / end_date come from the earlier windowing cell).
# Parsing 'Date' up front makes the window filter a real date comparison
# instead of a lexicographic comparison of raw strings.
covid = pd.read_csv('timesco/time-series-19-covid-combined.csv', parse_dates=['Date'])

cov_us = covid[covid['Country/Region'] == 'US']
in_window = (cov_us['Date'] >= start_date) & (cov_us['Date'] <= end_date)
# .copy() because later cells add columns to sub_cov.
sub_cov = cov_us[in_window].copy()

# Preview as the cell's last expression so it is actually displayed.
sub_cov.head()

In [None]:
# List the columns available in the US case-count subset.
sub_cov.columns

In [None]:
# Plain Python lists of the two daily sentiment series from the COVID-related merge.
peopl_sent = merged_dfc['compound_x'].tolist()
trump_sent = merged_dfc['compound_y'].tolist()


In [None]:
# Attach the daily sentiment values to the case counts by matching on the
# date. Assigning the plain lists positionally only works when both tables
# have exactly the same days in the same order; otherwise it either raises a
# length error or silently pairs values from different days.
sentiment_by_day = merged_dfc[['date', 'compound_y', 'compound_x']].rename(
    columns={'compound_y': 'trump_sent', 'compound_x': 'peopl_sent'}
)
sub_cov = (
    # Drop any previous copy of the columns so re-running the cell is safe.
    sub_cov.drop(columns=['trump_sent', 'peopl_sent'], errors='ignore')
    # Make sure 'Date' is a datetime so it can be matched against 'date'.
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    # Inner merge: keep only days that have both case counts and sentiment.
    .merge(sentiment_by_day, left_on='Date', right_on='date', how='inner')
    .drop(columns='date')
)

In [None]:
# Seed NumPy's global RNG before fitting.
np.random.seed(2)

# Regress the trump_sent column on the three case-count columns of sub_cov.
modm = smf.ols(formula='trump_sent ~ Confirmed + Recovered + Deaths ', data=sub_cov)
res3 = modm.fit()
print(res3.summary())