In [None]:
import re
import pickle
from glob import glob
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression,Lasso
from sklearn.metrics import r2_score
import bertopic
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

In [None]:
# Print the pandas and scipy versions of the running kernel so the environment
# that produced the outputs below is recorded alongside them.
print(pd.__version__)
import scipy
print(scipy.__version__)

# data from Ashwin + plots

In [None]:
# Concern names; each one keys a pair of loaded frames ('lib <concern>' / 'cons <concern>').
concern_cols = ['lockdowns','masking','vaccines','therapeutics','education']
# Moral-foundation columns analysed for every concern ('purity' and 'degradation' are left out).
mf_cols = ['care', 'harm', 'fairness', 'cheating', 'loyalty', 'betrayal', 'authority','subversion'] # , 'purity', 'degradation'

In [None]:
# Load every CSV of the time-series directory into `dfs`. Each frame is keyed by the
# first two underscore-separated tokens of its file name, joined by a space.
data_dir = '../covid_data/ts_from_ashwin/'
files = glob(data_dir + '*.csv')

dfs = {}
filenames = []
for csv_path in files:
    stem = csv_path.split('/')[-1].split('.')[0]
    label = " ".join(stem.split('_')[:2])
    filenames.append(label)
    print(label)

    frame = pd.read_csv(csv_path)
    frame['date'] = pd.to_datetime(frame['date'])
    # keep rows dated 2020-02-01 or later and index them by date
    recent = frame[frame['date'] >= pd.Timestamp('2020-02-01')]
    dfs[label] = recent.set_index('date')

In [None]:
# Per-concern divergence: the 'lib <concern>' frame minus the 'cons <concern>' frame.
dfs_div = {
    concern: dfs['lib '+concern] - dfs['cons '+concern]
    for concern in concern_cols
}

In [None]:
# Display the divergence frame for the 'lockdowns' concern as the cell output.
dfs_div['lockdowns']

## plot

In [None]:
def plot_separate_ts(df,columns,events=()):
    """Draw one line chart per column, each in its own figure, and show it.

    Parameters
    ----------
    df : object supporting ``df[c].plot(...)`` for every ``c`` in ``columns``.
    columns : iterable of column names; each becomes a figure titled with the name.
    events : iterable of indexable records. ``e[0]`` is passed to ``pd.Timestamp``
        for the position of a vertical line, ``e[1]`` is its legend label and
        ``e[2]`` its matplotlib linestyle.
    """
    colors = ['coral','navy','turquoise','forestgreen','gold','darkviolet','sienna','grey']
    for i,c in enumerate(columns):
        # Cycle through the palette: zipping columns with the 8 colours used to
        # drop every column after the eighth without any warning.
        color = colors[i % len(colors)]
        plt.figure()
        df[c].plot(figsize=[8,3],kind='line',color=color)
        plt.axhline(y=0,color='black')
        for e in events:
            plt.axvline(pd.Timestamp(e[0]),label=e[1],color='black',linestyle=e[2])
        plt.ylabel('Conservative <=> Liberal')
        plt.title(c)
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15),
          fancybox=True, shadow=True, ncol=3)
        plt.show()

In [None]:
def plot_ts(df,columns,events=(),title=''):
    """Plot all requested columns as lines on one shared figure.

    Parameters
    ----------
    df : object supporting ``df[c].plot(...)`` for every ``c`` in ``columns``.
    columns : iterable of column names; each is drawn with its name as legend label.
    events : iterable of indexable records. ``e[0]`` is passed to ``pd.Timestamp``
        for the position of a black vertical line and ``e[2]`` is its linestyle
        (``e[1]`` is not used here).
    title : figure title.

    The figure is left open (no ``plt.show()``), so the caller can add to it.
    """
    plt.figure()
    # Colours come from matplotlib's default cycle. The previous palette list was
    # never applied and only served to truncate `columns` to 8 entries via zip.
    for c in columns:
        df[c].plot(figsize=[20,6],kind='line',label=c)
    for e in events:
        plt.axvline(pd.Timestamp(e[0]),color='black',linestyle=e[2])
    plt.title(title)
    plt.ylabel('Fraction of tweets')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15),
      fancybox=True, shadow=True, ncol=1)

In [None]:
def plot_compare_ts(df_control,df_target,columns,events=[]):
    """For each column, overlay the two frames' series in one 20x6 figure and show it.

    ``df_control`` is drawn in blue and labelled 'Liberals'; ``df_target`` is drawn
    in red and labelled 'Conservatives'. Every record in ``events`` adds a black
    vertical line at ``pd.Timestamp(e[0])`` with label ``e[1]`` and linestyle ``e[2]``.
    """
    groups = (
        (df_control, 'blue', 'Liberals'),
        (df_target, 'red', 'Conservatives'),
    )
    for column in columns:
        plt.figure(figsize=[20,6])
        for frame, line_color, group_name in groups:
            frame[column].plot(kind='line',color=line_color,label=group_name)
        for marker in events:
            plt.axvline(
                pd.Timestamp(marker[0]),
                label=marker[1],
                color='black',
                linestyle=marker[2],
            )
        plt.title(column)
        plt.legend()
        plt.show()

# change points

In [None]:
import kats
from kats.utils.decomposition import TimeSeriesDecomposition
from kats.consts import TimeSeriesData
from kats.detectors.cusum_detection import CUSUMDetector
from kats.detectors.bocpd import BOCPDetector, BOCPDModelType, TrendChangeParameters

# use pyenv environment disentangle_emotions
# scipy version = 1.7.3
# pandas version = 1.3.5

In [None]:
def changepoint_detection(df,time_colname,var_colnames,title=''):
    """Detect change points in each column with BOCPD plus a sliding-window CUSUM.

    For every column in ``var_colnames`` a Kats ``TimeSeriesData`` is built from
    ``time_colname`` and that column, then:

    * ``BOCPDetector`` is run once with ``BOCPDModelType.NORMAL_KNOWN_MODEL``;
    * ``CUSUMDetector`` is run on windows of 21 points (14 historical + 7 scanned)
      that advance 5 points at a time, to pick up multiple change points.

    Both result sets are plotted, merged, printed, sorted by time and thinned.

    Parameters
    ----------
    df : DataFrame containing ``time_colname`` and every column of ``var_colnames``
        (``reset_index()`` is applied first, so the time column may be the index).
    time_colname : name of the time column.
    var_colnames : iterable of value column names.
    title : prefix for the BOCPD figure title.

    Returns
    -------
    dict mapping each column name to a list of ``(pd.Timestamp, confidence)`` tuples.
    """
    # CUSUM sliding-window settings (in number of observations)
    historical_window = 14
    scan_window = 7
    step = 5
    # change points at most this many whole days after their predecessor are dropped
    min_gap_days = 5

    changepoints = {}
    for c in var_colnames:
        # construct ts; copy so the column assignment below does not write into a slice
        tmp_ts = df.reset_index()
        tmp_ts = tmp_ts[[time_colname,c]].copy()
        tmp_ts[time_colname] = tmp_ts[time_colname].astype(str)
        tmp_ts.columns = ['time','value']
        ts = TimeSeriesData(tmp_ts)

        cp_list = []

        detector = BOCPDetector(ts)
        # BOCPD - assume normal distri
        cp_list.extend(detector.detector(
            model=BOCPDModelType.NORMAL_KNOWN_MODEL,
            changepoint_prior = 0.03,
            threshold=0.6
        ))
        # (A BOCPD trend-change variant, BOCPDModelType.TREND_CHANGE_MODEL with
        # TrendChangeParameters, was tried here previously and is disabled.)

        # CUSUM - multiple change points
        cpts = []
        n = len(ts)
        for end_idx in range(historical_window + scan_window, n, step):
            tsd = ts[end_idx - (historical_window + scan_window) : end_idx]
            cpts += CUSUMDetector(tsd).detector(interest_window=[historical_window, historical_window + scan_window])

        # Plot the data, add results
        plt.figure(figsize=[20,3])
        plt.title(title+" - "+c)
        detector.plot(cp_list)
        plt.figure(figsize=[20,3])
        detector1 = CUSUMDetector(ts) # we are not really using this detector
        detector1.detector()
        detector1.plot(cpts)
        cp_list.extend(cpts)

        cleaned_list = []
        for j in cp_list:
            # Naive start times are taken as UTC; tz-aware ones keep their zone.
            # (This replaces a bare `except:` around pd.Timestamp(..., tz='utc').)
            time = pd.Timestamp(j[0].start_time)
            if time.tzinfo is None:
                time = time.tz_localize('utc')
            cleaned_list.append((time,j[0].confidence))
            print(time, j[0].confidence)

        # cp_list is BOCPD results followed by CUSUM results, so it is not in time
        # order; sort first so the neighbour comparison below really compares
        # change points that are adjacent in time.
        cleaned_list.sort(key=lambda item: item[0])

        # Drop a point when its immediate predecessor (kept or not) is at most
        # `min_gap_days` whole days earlier.
        idx_to_remove = [
            j for j in range(1,len(cleaned_list))
            if abs((cleaned_list[j][0] - cleaned_list[j-1][0]).days) <= min_gap_days
        ]
        print(idx_to_remove)
        cleaned_list = [item for idx,item in enumerate(cleaned_list) if idx not in idx_to_remove]
        print(cleaned_list)

        changepoints[c] = cleaned_list
    return changepoints

In [None]:
# BOCPD-normal + CUSUM change points for every concern's divergence frame,
# stored as changepoints[concern][moral-foundation column].
changepoints = {}

for concern in concern_cols:
    print(concern)
    divergence = dfs_div[concern].reset_index()
    changepoints[concern] = changepoint_detection(
        divergence,
        'date',
        mf_cols,
        title=concern,
    )

In [None]:
# Display the nested change-point dict (concern -> column -> list of change points).
changepoints

In [None]:
# Cache the detected change points on disk so detection does not have to be re-run.
with open('../covid_data/changepoints_covid.pkl','wb') as f:
    pickle.dump(changepoints,f)

In [None]:
# Reload the cached change points (overwrites the in-memory `changepoints`).
# pickle.load executes arbitrary code from the file: only load files produced by this notebook.
with open('../covid_data/changepoints_covid.pkl','rb') as f:
    changepoints = pickle.load(f)

## measuring changes

In [None]:
def detect_event(time,events):
    """Find the first event dated within one day (inclusive) of ``time``.

    Parameters
    ----------
    time : timestamp comparable with ``pd.Timestamp``.
    events : iterable of indexable records whose first element is the event date
        (anything ``pd.Timestamp`` accepts).

    Returns
    -------
    ``(True, event)`` for the first matching record, otherwise ``(False, None)``.
    """
    tolerance = pd.Timedelta(1,unit='D')
    for e in events:
        event_time = pd.Timestamp(e[0])
        if time >= event_time - tolerance and time <= event_time + tolerance:
            return True,e
    return False,None

In [None]:
def measure_mean_change(df_ts,time_colname,var_colname,event_time,before_window=7,after_window=7):
    """Percent change of a column's mean from the window before an event to the window after.

    Parameters
    ----------
    df_ts : DataFrame with a time column and the value column.
    time_colname : name of the time column, compared against tz-aware timestamps.
    var_colname : name of the value column.
    event_time : anything ``pd.Timestamp`` accepts; a naive value is taken as UTC,
        a tz-aware value keeps its zone.
    before_window, after_window : window lengths in days.

    Returns
    -------
    ``(after_mean - before_mean) / before_mean * 100`` where the "before" window is
    ``[event - before_window, event)`` and the "after" window is
    ``[event, event + after_window)``.

    NOTE(review): the change is relative to ``before_mean``, so its sign is inverted
    when ``before_mean`` is negative, and the result is typically inf/NaN when
    ``before_mean`` is 0 or a window contains no rows.
    """
    # Explicit tz handling instead of try/bare-except around pd.Timestamp(..., tz='utc').
    event_time = pd.Timestamp(event_time)
    if event_time.tzinfo is None:
        event_time = event_time.tz_localize('utc')
    start_time = event_time - pd.Timedelta(before_window,unit='D')
    end_time = event_time + pd.Timedelta(after_window,unit='D')

    times = df_ts[time_colname]
    before_mean = df_ts.loc[(times>=start_time) & (times<event_time),var_colname].mean()
    after_mean = df_ts.loc[(times>=event_time) & (times<end_time),var_colname].mean()

    return (after_mean-before_mean)/before_mean*100

In [None]:
import statsmodels.formula.api as smf

def measure_slope_change(df_ts,time_colname,var_colname,event_time,before_window=7,after_window=7,mode='kink',plot=True):
    """Fit a regression-discontinuity style model around an event and report the effect.

    Rows within ``[event - before_window, event + after_window]`` days (both ends
    inclusive) are numbered by position, re-centred so the event row is 0, and
    ``value ~ date_to_int * threshold`` is fitted with ``smf.wls`` (no weights
    are passed), where ``threshold`` is 1 for rows after the event.

    Parameters
    ----------
    df_ts : DataFrame with a time column and the value column.
    time_colname, var_colname : column names.
    event_time : anything ``pd.Timestamp`` accepts; a naive value is taken as UTC.
        It must match exactly one row of the window.
    before_window, after_window : window lengths in days.
    mode : ``'jump'`` reports the ``threshold`` coefficient, ``'kink'`` the
        ``date_to_int:threshold`` interaction.
    plot : accepted for compatibility; currently ignored.

    Returns
    -------
    ``(ate_pct, p_value)``: the chosen coefficient as a percentage of the fitted
    intercept (rounded to 2 decimals) and that coefficient's p-value.

    Raises
    ------
    ValueError : if ``mode`` is unknown, or the event time does not match exactly
        one row of the window.
    """
    effect_coef_by_mode = {'jump': 'threshold', 'kink': 'date_to_int:threshold'}
    if mode not in effect_coef_by_mode:
        # previously an unknown mode left `effect_coef` unbound and failed much later
        raise ValueError("mode must be 'jump' or 'kink', got %r" % (mode,))
    effect_coef = effect_coef_by_mode[mode]

    # Explicit tz handling instead of try/bare-except around pd.Timestamp(..., tz='UTC').
    event_time = pd.Timestamp(event_time)
    if event_time.tzinfo is None:
        event_time = event_time.tz_localize('UTC')
    start_time = event_time - pd.Timedelta(before_window,unit='D')
    end_time = event_time + pd.Timedelta(after_window,unit='D')

    in_window = (df_ts[time_colname]>=start_time) & (df_ts[time_colname]<=end_time)
    # copy so the column assignments below do not write into a slice of df_ts
    df = df_ts[in_window].sort_index().copy()
    # NOTE(review): rows are numbered by position, which presumably assumes one row
    # per day in index order; confirm against the data.
    df['date_to_int'] = list(range(len(df)))
    event_rows = df.loc[df[time_colname]==event_time,'date_to_int']
    if len(event_rows) != 1:
        raise ValueError(
            "expected exactly one row at event_time %s, found %d" % (event_time, len(event_rows))
        )
    event_idx = event_rows.item()
    df['date_to_int'] = df['date_to_int'] - event_idx # make date_to_int of the event zero
    df = df.assign(threshold=(df['date_to_int'] > 0).astype(int))

    model = smf.wls("Q('"+var_colname+"')~date_to_int*threshold", df).fit()
    intercept = model.params["Intercept"]
    ate_pct = round(100*((model.params[effect_coef] + intercept)/intercept - 1),2)

    return ate_pct, model.pvalues[effect_coef]

In [None]:
# Measure the mean and slope change around every detected change point and
# collect one record per (concern, variable, change point).
change_records = []
for concern,v in changepoints.items():
    tmp_div = dfs_div[concern].reset_index()
    for var,changes in v.items():
        for c in changes:
            # c is a (timestamp, confidence) tuple. Matching against a list of known
            # events via detect_event is currently disabled, so 'event' stays None.
            mean_change = measure_mean_change(tmp_div,'date', var, c[0])
            slope_change,pval = measure_slope_change(tmp_div,'date', var, c[0])
            change_records.append({
                'concern':concern,
                'variable': var,
                'event': None,
                'event_date': c[0],
                'change_point_confidence':c[1],
                '%change_in_mean':mean_change,
                '%change_in_slope':slope_change,
                'slope_change_pval':pval
            })

# Build the frame once from the records: DataFrame.append copied the whole frame
# on every iteration and no longer exists in pandas 2.0. Naming the columns keeps
# them present even when no change point was found.
relevant_changes = pd.DataFrame(change_records, columns=[
    'concern',
    'variable',
    'event',
    'event_date',
    'change_point_confidence',
    '%change_in_mean',
    '%change_in_slope',
    'slope_change_pval',
])

with open('../covid_data/changes_measured_covid.pkl','wb') as f:
    pickle.dump(relevant_changes,f)

In [None]:
# Reload the cached change measurements and display them.
# pickle.load executes arbitrary code from the file: only load files produced by this notebook.
with open('../covid_data/changes_measured_covid.pkl','rb') as f:
    relevant_changes = pickle.load(f)
relevant_changes

In [None]:
# Show the measured changes for concern 'all' and variable 'fairness'.
# NOTE(review): 'all' is not one of `concern_cols`, so this is presumably left over
# from a run that included an aggregate series; confirm it still returns rows.
relevant_changes[(relevant_changes.concern=='all')& (relevant_changes.variable=='fairness')]

# topic modeling

scipy version = 1.10.1
pandas version = 1.5.3

In [None]:
# Load the tweet sample that already carries topic assignments and list its columns.
# lineterminator='\n' makes only '\n' end a record.
df = pd.read_csv('../covid_data/covid_5perc_sample_200121_200531_processed_w_topics.csv',lineterminator='\n')
df.columns

In [None]:
# Parse the tweet dates, then keep only tweets whose author appears in `users`.
df['date'] = pd.to_datetime(df['date'])
# NOTE(review): `users` is not assigned in any cell visible in this notebook;
# confirm where it is loaded before running from a fresh kernel.
df = df.merge(users,how='inner',left_on='screen_name',right_on='username')

## bertopic

In [None]:
# Wrap the sentence-transformers model as a BERTopic embedding backend and load the saved topic model with it.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# NOTE(review): `bertopic.backend._utils` is an underscore-prefixed module, presumably
# private API; confirm it exists in the installed bertopic version.
model = bertopic.backend._utils.select_backend(sentence_model)
topic_model = BERTopic.load('../topic_model_all_fairness_2021-05-29', embedding_model=model)

# Length of the probabilities stored on the loaded model (shown as the cell output).
len(topic_model.probabilities_)

In [None]:
def get_topic_label(idx):
    """Look up the label stored on the loaded topic model for topic index ``idx``."""
    labels = topic_model.topic_labels_
    return labels[idx]

# Attach the model's per-document topic index and its label to the tweets.
# NOTE(review): this requires `df` to have exactly one row per entry of
# `topic_model.topics_`, in the same order; confirm.
df['topic_idx'] = topic_model.topics_
df['topic'] = df['topic_idx'].apply(get_topic_label)

# df.to_csv('../covid_data/covid_5perc_sample_200121_200531_processed_w_topics.csv',index=False)

In [None]:
# topic_model_all_fairness_2020-06-18
# NOTE(review): the only model loaded in this notebook is
# '../topic_model_all_fairness_2021-05-29'; presumably this cell was run after
# loading the model named above, so load that model before re-running.
topic_model.get_topics()

In [None]:
# topic_model_all_fairness_2020-08-26
# NOTE(review): the only model loaded in this notebook is
# '../topic_model_all_fairness_2021-05-29'; presumably this cell was run after
# loading the model named above, so load that model before re-running.
topic_model.get_topics()

In [None]:
# topic_model_all_fairness_2021-01-31
# NOTE(review): the only model loaded in this notebook is
# '../topic_model_all_fairness_2021-05-29'; presumably this cell was run after
# loading the model named above, so load that model before re-running.
topic_model.get_topics()

In [None]:
# topic_model_all_fairness_2021-04-19
# NOTE(review): the only model loaded in this notebook is
# '../topic_model_all_fairness_2021-05-29'; presumably this cell was run after
# loading the model named above, so load that model before re-running.
topic_model.get_topics()

In [None]:
# topic_model_all_fairness_2021-05-29 (the model loaded by the BERTopic.load cell)
# Display the topics of the loaded model.
topic_model.get_topics()

In [None]:
# Load the precomputed topics-over-time object used by the visualisation below.
# pickle.load executes arbitrary code from the file: only load trusted files.
with open('../covid_5perc_200121_200531_topics_over_time.pkl','rb') as f:
    topics_over_time = pickle.load(f)

In [None]:
# Display the pandas version of the running kernel.
pd.__version__


In [None]:
# Render BERTopic's topics-over-time figure, passing top_n_topics=100.
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=100)

In [None]:
# Tweets per topic per day.
# NOTE(review): dt.round('D') rounds to the NEAREST day, so tweets in the second half
# of a day are counted on the following day; confirm that dt.floor('D') was not intended.
# `df['date']` is overwritten in place, so later cells see the rounded dates.
df['date'] = pd.to_datetime(df['date']).dt.round('D')
df_topic_freq = (
    df.groupby(['topic_idx','date'])['tweetid']
      .count()
      .reset_index()
)
df_topic_freq.columns = ['topic_idx','date','Frequency']

In [None]:
import matplotlib.pyplot as plt

# Daily frequency of the selected topics, one line per topic on a shared axis.
topics_to_plot = [24]# list(range(10))

colors = plt.cm.tab10.colors
ax = plt.subplot()
for topic_id, line_color in zip(topics_to_plot, colors):
    topic_rows = df_topic_freq.loc[df_topic_freq['topic_idx'] == topic_id]
    topic_rows = topic_rows.sort_values('date')
    # legend name: underscore-separated tokens 2-4 of the model's topic label
    label_tokens = topic_model.topic_labels_[topic_id].split('_')
    short_label = "_".join(label_tokens[1:4])
    print(short_label)
    topic_rows.plot(
        'date', 'Frequency',
        kind='line', label=short_label, ax=ax, color=line_color, figsize=[8,3],
    )
plt.xlabel('')
plt.ylabel('Frequency')
plt.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, -0.15),
    fancybox=True,
    shadow=True,
    ncol=3,
)
plt.show()

## LDA

In [None]:
# Approach adapted from:
# https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

import gensim.corpora as corpora
import gensim
from gensim.utils import simple_preprocess
import nltk
# Fetch the NLTK stop-word corpus (network access; side effect at cell run time).
nltk.download('stopwords')
from nltk.corpus import stopwords
# Module-level stop-word list read by remove_stopwords: NLTK English stop words
# plus tokens that occur throughout this corpus.
stop_words = stopwords.words('english')
stop_words.extend(['rt','covid','coronavirus','people','pandemic'])

def sent_to_words(sentences):
    """Lazily yield, for each sentence, its token list from gensim's simple_preprocess.

    Each item is converted with ``str`` first; ``deacc=True`` is passed through to
    ``simple_preprocess``.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_sentence in sentences:
        yield tokenize(str(raw_sentence), deacc=True)

def remove_stopwords(texts):
    """Tokenise each document and drop stop words.

    Parameters
    ----------
    texts : iterable of documents; each is converted with ``str`` and tokenised
        with gensim's ``simple_preprocess``.

    Returns
    -------
    list of token lists (one per document, possibly empty), without any token found
    in the module-level ``stop_words``.
    """
    # Build the lookup set once per call instead of scanning the list for every token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in blocked] for doc in texts]

def _lda_topics_for_tweets(tweets, num_topics, num_words=5):
    """Fit LDA with one document per tweet and return ``print_topics`` output ([] if no tokens survive)."""
    docs = [doc for doc in remove_stopwords(tweets['text'].tolist()) if len(doc)>0]
    if not docs:
        # gensim cannot fit LDA on an empty corpus
        return []
    id2word = corpora.Dictionary(docs)
    corpus = [id2word.doc2bow(doc) for doc in docs]
    lda = gensim.models.LdaMulticore(corpus=corpus,
                                     id2word=id2word,
                                     num_topics=num_topics,
                                     passes=2,
                                     iterations=100)
    return lda.print_topics(num_words=num_words)


def gen_LDA_comparison(df,event_date,title=''):
    """Print LDA topics for the 5 days before and the 5 days after an event.

    Parameters
    ----------
    df : DataFrame with a ``date`` column comparable with ``event_date`` and a
        ``text`` column holding one tweet per row.
    event_date : timestamp of the event. The "before" window is
        ``[event_date - 5 days, event_date)`` and the "after" window is
        ``[event_date, event_date + 5 days)``.
    title : heading printed above the topics.

    Each tweet is one LDA document. Previously all tweets were joined into one
    string and split on whitespace, so every "document" was a single word and the
    model could not learn which words occur together. No random seed is set, so
    topics differ between runs.
    """
    num_topics = 20
    window = pd.Timedelta(5,unit='D')

    before = df[(df.date>=(event_date-window)) & (df.date<event_date)]
    before_topics = _lda_topics_for_tweets(before, num_topics)

    after = df[(df.date>=event_date) & (df.date<(event_date+window))]
    after_topics = _lda_topics_for_tweets(after, num_topics)

    print(title)
    print('BEFORE:')
    for i in before_topics:
        print(i)
    print('AFTER:')
    for i in after_topics:
        print(i)
    print()

In [None]:
# Before/after LDA comparison for every measured change point covered by the loaded tweets.
for _, change in relevant_changes.iterrows():
    event_date = change['event_date']
    if event_date > df['date'].max():
        # the loaded tweets end before this change point
        continue

    title = ''.join([
        change['concern'], ' - ', change['variable'], ', ',
        str(event_date.date()), ' ',
        str(round(change['%change_in_mean'],2)), '%',
    ])

    gen_LDA_comparison(df,event_date,title)

## wordcloud

In [None]:
# Replace `df` with the processed tweet sample from a different file
# (file name suggests a longer date range; presumably 2020-01-21 to 2021-06-30).
df = pd.read_csv('../covid_data/covid_5perc_sample_200121_210630_processed.csv',lineterminator='\n')
print(df.columns)
df['date'] = pd.to_datetime(df['date'])
# NOTE(review): `users` is not assigned in any cell visible in this notebook;
# confirm where it is loaded before running from a fresh kernel.
df = df.merge(users,how='inner',left_on='screen_name',right_on='username')

In [None]:
from wordcloud import WordCloud

from collections import Counter

import nltk
from nltk.corpus import stopwords
# Rebuild the module-level stop-word list for the word clouds: NLTK English stop
# words plus corpus-specific tokens. This rebinds the `stop_words` that
# remove_stopwords (LDA section) also reads.
stop_words = stopwords.words('english')
stop_words.extend(['rt','covid','coronavirus','people','pandemic','el','p'])
 
def count_freq(s):
    """Count whitespace-separated tokens of ``s``, ignoring stop words.

    Tokens are compared to the module-level ``stop_words`` exactly as written
    (no lower-casing or punctuation stripping). Returns a ``Counter``.
    """
    kept_tokens = (token for token in s.split() if token not in stop_words)
    return Counter(kept_tokens)

def count_freq_diff(d1,d2):
    """Return how much each key's count in ``d2`` exceeds its count in ``d1``.

    Keys only in ``d2`` keep their full ``d2`` count; keys whose ``d2`` count is not
    strictly greater than the ``d1`` count are left out. Keys only in ``d1`` never appear.
    """
    gains = {}
    for word, count in d2.items():
        if word not in d1.keys():
            gains[word] = count
            continue
        baseline = d1[word]
        if count > baseline:
            gains[word] = count - baseline
    return gains

def gen_wordcloud_comparison(df,event_date,title=''):
    """Show a word cloud of the words that became more frequent after an event.

    Parameters
    ----------
    df : DataFrame with a ``date`` column comparable with ``event_date`` and a
        ``text`` column of strings.
    event_date : timestamp of the event. Word counts over
        ``[event_date - 5 days, event_date)`` are compared with counts over
        ``[event_date, event_date + 5 days)``.
    title : figure title.

    Words are sized by how much their count increased (see ``count_freq_diff``).
    When no word increased, a message is printed and nothing is drawn.
    """
    window = pd.Timedelta(5,unit='D')

    before = df[(df.date>=(event_date-window)) & (df.date<event_date)]
    before_freq = count_freq(' '.join(before['text'].tolist()))

    after = df[(df.date>=event_date) & (df.date<(event_date+window))]
    after_freq = count_freq(' '.join(after['text'].tolist()))

    diff_freq = count_freq_diff(before_freq,after_freq)
    if not diff_freq:
        # WordCloud.generate_from_frequencies raises on an empty mapping, which
        # would abort a loop over many events.
        print('No word became more frequent after the event, skipping word cloud: '+title)
        return
    diff_wordcloud = WordCloud().generate_from_frequencies(diff_freq)

    plt.imshow(diff_wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()
    


def _political_word_freq(tweets, side):
    """Word counts (via count_freq) over the tweets whose ``political`` column equals ``side``."""
    side_text = tweets.loc[tweets.political==side, 'text'].tolist()
    return count_freq(' '.join(side_text))


def _show_wordcloud_or_skip(freq, title):
    """Draw and show a word cloud for ``freq``; print a message instead when ``freq`` is empty."""
    if not freq:
        # WordCloud.generate_from_frequencies raises on an empty mapping
        print('No words to plot, skipping word cloud: '+title)
        return
    cloud = WordCloud().generate_from_frequencies(freq)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()


def gen_wordcloud_LRcomparison(df,event_date,title=''):
    """Show which words each political side gained, relative to the other, after an event.

    Parameters
    ----------
    df : DataFrame with ``date``, ``text`` and ``political`` columns; rows with
        ``political`` equal to 'left' or 'right' are used.
    event_date : timestamp of the event. The windows are
        ``[event_date - 5 days, event_date)`` and ``[event_date, event_date + 5 days)``.
    title : prefix of both figure titles.

    Two word clouds are produced. "Left says more" sizes words by how much the
    left-over-right surplus in the after window exceeds that surplus in the before
    window; "Right says more" does the same with the sides swapped. The four
    vocabulary sizes (before left, before right, after left, after right) are
    printed first. An empty result prints a message instead of a figure.
    """
    window = pd.Timedelta(5,unit='D')
    before = df[(df.date>=(event_date-window)) & (df.date<event_date)]
    after = df[(df.date>=event_date) & (df.date<(event_date+window))]

    beforeL_freq = _political_word_freq(before, 'left')
    beforeR_freq = _political_word_freq(before, 'right')
    afterL_freq = _political_word_freq(after, 'left')
    afterR_freq = _political_word_freq(after, 'right')

    print(len(beforeL_freq),len(beforeR_freq),len(afterL_freq),len(afterR_freq))

    # count_freq_diff(R, L) keeps the words the left uses more often than the right
    diff_freq_RL = count_freq_diff(count_freq_diff(beforeR_freq,beforeL_freq),count_freq_diff(afterR_freq,afterL_freq))
    _show_wordcloud_or_skip(diff_freq_RL, title+', Left says more')

    diff_freq_LR = count_freq_diff(count_freq_diff(beforeL_freq,beforeR_freq),count_freq_diff(afterL_freq,afterR_freq))
    _show_wordcloud_or_skip(diff_freq_LR, title+', Right says more')

In [None]:
# Word cloud of the words that became more frequent after each measured change point.
for _, change in relevant_changes.iterrows():
    event_date = change['event_date']
    if event_date > df['date'].max():
        # the loaded tweets end before this change point
        continue

    title = ''.join([
        change['concern'], ' - ', change['variable'], ', ',
        str(event_date.date()),
        ', divergence (L-R) change=', str(round(change['%change_in_mean'],2)), '%',
    ])

    gen_wordcloud_comparison(df,event_date,title)
    plt.show()
    

In [None]:
# Per change point: words whose left-vs-right frequency gap grew after the event
# (and the same with the sides swapped), using all loaded tweets.
for _, change in relevant_changes.iterrows():
    event_date = change['event_date']
    if event_date > df['date'].max():
        # the loaded tweets end before this change point
        continue

    title = ''.join([
        change['concern'], ' - ', change['variable'], ', ',
        str(event_date.date()),
    ])
    pct_change = str(round(change['%change_in_mean'],2))
    print('divergence (L-R) change='+pct_change+'%')
    gen_wordcloud_LRcomparison(df,event_date,title)
    plt.show()

In [None]:
# Per change point: words whose left-vs-right frequency gap grew after the event
# (and the same with the sides swapped), restricted to tweets flagged (== 1) for
# the change point's variable.
for _, change in relevant_changes.iterrows():
    event_date = change['event_date']
    if event_date > df['date'].max():
        # the loaded tweets end before this change point
        continue
    df_tmp = df[df[change['variable']]==1]

    title = ''.join([
        change['concern'], ' - ', change['variable'], ', ',
        str(event_date.date()),
    ])
    pct_change = str(round(change['%change_in_mean'],2))
    print('divergence (L-R) change='+pct_change+'%')
    gen_wordcloud_LRcomparison(df_tmp,event_date,title)
    plt.show()