In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from linearmodels import PanelOLS
from sklearn.decomposition import PCA

In [3]:
# Load the most recent final product (45-topic export).
df = pd.read_csv('/home/twalton_umass_edu/Political Polarization Project/final/final_product_topic_45_20210427.csv')

# Schools looped over by the per-school model cells.
schools = ['s1', 's2', 's3', 's4', 's5', 's6', 's8']

# Keep only rows flagged dem_org == 1.
# NOTE(review): the original comment said "only want republicans", but the flag name and the
# dem_*.csv export elsewhere in this notebook suggest this is the Democratic sample -- confirm.
df = df[df['dem_org'] == 1]

# Number of data points before any period/activity filtering.
print('total original data points: ' + str(len(df)))

# Drop the first three periods due to missing ties.
df = df[(df['period'] >= 4)]

# Keep only user/school pairs that have more than 25 rows.
pair_keys = ['user_id', 'school']
df = df.groupby(pair_keys).filter(lambda x: len(x) > 25)

# Fill NaNs with 0.
df = df.fillna(0)

# Per user/school pair: variance of rep_count and dem_count, and total number of tweets.
# The joins add the columns rep_count_var, dem_count_var and num_tweets_sum.
df = df.join(df.groupby(pair_keys)['rep_count'].var(), on=pair_keys, rsuffix='_var')
df = df.join(df.groupby(pair_keys)['dem_count'].var(), on=pair_keys, rsuffix='_var')
df = df.join(df.groupby(pair_keys)['num_tweets'].sum(), on=pair_keys, rsuffix='_sum')

# Keep users that have at least one tweet, or non-zero variance in both counts.
df = df[(df['num_tweets_sum'] != 0) | ((df['dem_count_var'] != 0) & (df['rep_count_var'] != 0))]

# Human-readable names for the topic-model columns X0..X45.
topics = {
    "X0": "education", "X1": "names", "X2": "punish_law", "X3": "racism",
    "X4": "covid", "X5": "event", "X6": "trump", "X7": "verbs", "X8": "congress_bills",
    "X9": "spam", "X10": "right_outrage", "X11": "primaries", "X12": "vp_nomination",
    "X13": "russia", "X14": "hero_RBG_BLM_Lewis", "X15": "arguments", "X16": "fed_investigation",
    "X17": "social_media", "X18": "healthcare", "X19": "senate_race", "X20": "contractions",
    "X21": "vote_tomorrow", "X22": "mixture", "X23": "protest", "X24": "scotus",
    "X25": "institutions", "X26": "climate", "X27": "economy", "X28": "vote_methods",
    "X29": "knew_had", "X30": "family", "X31": "date_time", "X32": "grateful",
    "X33": "slang", "X34": "celebrate", "X35": "progress_fight", "X36": "geographic_location",
    "X37": "food", "X38": "slang_2", "X39": "numbers", "X40": "covid_restrictions", "X41": "trump_2",
    "X42": "holiday", "X43": "sport", "X44": "news_media", "X45": "no_topic",
}

# Rename topic variables.
df = df.rename(columns=topics)

# log(x + 1) transforms of the count variables.
df['log_dem_count'] = np.log(df['dem_count'] + 1)
df['log_rep_count'] = np.log(df['rep_count'] + 1)
df['log_tot_pol'] = np.log(df['rep_count'] + df['dem_count'] + 1)
df['log_total_count'] = np.log(df['total_count'] + 1)
df['log_num_tweets'] = np.log(df['num_tweets'] + 1)

# Z-score each emot_<window>_mean column over the whole filtered frame.
for window in (50, 100, 150, 200):
    emot_mean = df['emot_{}_mean'.format(window)]
    df['norm_emot_{}'.format(window)] = (emot_mean - emot_mean.mean()) / emot_mean.std()

# log(x + 1) transform of the topic columns used as regressors; order fixes the column order.
logged_topics = [
    'punish_law', 'racism', 'covid', 'trump', 'congress_bills', 'right_outrage',
    'primaries', 'vp_nomination', 'russia', 'fed_investigation', 'arguments',
    'social_media', 'healthcare', 'senate_race', 'vote_tomorrow', 'protest',
    'scotus', 'institutions', 'climate', 'economy', 'vote_methods',
    'progress_fight', 'covid_restrictions', 'trump_2', 'news_media',
]
for topic in logged_topics:
    df[topic + '_log'] = np.log(df[topic] + 1)

total original data points: 181770


In [3]:
# Row count of df at the time this cell is run.
print(df.shape[0])

83432


In [5]:
# Order rows by user and period once, so the grouped diffs below are period-over-period.
# Rebinding (instead of inplace=True) avoids mutating a filtered frame in place, and the
# original's second, identical sort was redundant.
df = df.sort_values(['user_id', 'period'])

# Change in rep_count / dem_count relative to the previous row of the same user/school pair.
# The first row of each pair has no predecessor, so its change is NaN.
pair_keys = ['user_id', 'school']
df['rep_change'] = df.groupby(pair_keys)['rep_count'].diff()
df['dem_change'] = df.groupby(pair_keys)['dem_count'].diff()

In [6]:
# Rows where rep_count or dem_count changed from the previous period, or the user tweeted.
# NOTE(review): the first period of each user/school pair has a NaN change, and NaN != 0 is
# True, so those rows always pass this filter -- confirm that is intended.
new = df.loc[(df['rep_change'] != 0) | (df['dem_change'] != 0) | (df['num_tweets'] != 0)]

# Rows with a non-zero emot_100_mean (the original comment describes this as "at least one
# tweet" -- presumably because missing scores were filled with 0; confirm).
# The original `new2 = df = ...` also overwrote df, which made re-running this cell (or the
# change-variable cell) operate on already-filtered data; only new2 is bound here.
new2 = df.loc[df['emot_100_mean'] != 0]

print(len(new))
print(len(new2))

57283
54935


In [7]:
# Write the new2 subset to disk (index included, pandas defaults).
dem_export_path = '/home/twalton_umass_edu/Political Polarization Project/dem_20210428.csv'
new2.to_csv(dem_export_path)

In [6]:
# Base two-way fixed-effects model, fitted separately for each school in `schools`.
# Dependent variable: norm_emot_100; regressors: log_rep_count and log_dem_count.
base_formula = 'norm_emot_100 ~ + log_rep_count + log_dem_count + TimeEffects + EntityEffects'

for s in schools:
    # Panel for this school, indexed by (entity, time) = (user_id, period).
    school = new.loc[new['school'] == s].set_index(['user_id', 'period'])

    # Standard errors clustered by entity.
    mod = PanelOLS.from_formula(base_formula, school)
    res = mod.fit(cov_type='clustered', cluster_entity=True)

    print('####### ' + s + ' ########')
    print(res)

####### s1 ########
                          PanelOLS Estimation Summary                           
Dep. Variable:          norm_emot_100   R-squared:                        0.0004
Estimator:                   PanelOLS   R-squared (Between):              0.0134
No. Observations:                9779   R-squared (Within):               0.0005
Date:                Wed, Apr 28 2021   R-squared (Overall):              0.0022
Time:                        13:23:29   Log-likelihood                -1.131e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      1.9432
Entities:                         443   P-value                           0.1433
Avg Obs:                       22.074   Distribution:                  F(2,9304)
Min Obs:                       2.0000                                           
Max Obs:                       31.000   F-statistic (robust):             2.1311
        

In [7]:
# Topic-augmented two-way fixed-effects model, fitted separately for each school.
# The original formula string listed vp_nomination_log, social_media_log, senate_race_log and
# progress_fight_log twice; the fitted model already had 27 regressors (F(27, ...) in the
# output), so building the formula from a de-duplicated list leaves the estimates unchanged.
topic_regressors = [
    'punish_law_log', 'racism_log', 'covid_log', 'trump_log', 'congress_bills_log',
    'right_outrage_log', 'primaries_log', 'vp_nomination_log', 'russia_log',
    'fed_investigation_log', 'arguments_log', 'social_media_log', 'healthcare_log',
    'senate_race_log', 'vote_tomorrow_log', 'protest_log', 'scotus_log',
    'institutions_log', 'climate_log', 'economy_log', 'vote_methods_log',
    'progress_fight_log', 'covid_restrictions_log', 'trump_2_log', 'news_media_log',
]
regressors = ['log_rep_count', 'log_dem_count'] + topic_regressors
formula = 'norm_emot_100 ~ ' + ' + '.join(regressors) + ' + TimeEffects + EntityEffects'

for s in schools:
    # Panel for this school, indexed by (entity, time) = (user_id, period).
    school = new[new['school'] == s].set_index(['user_id', 'period'])

    # Standard errors clustered by entity.
    mod = PanelOLS.from_formula(formula, school)
    res = mod.fit(cov_type='clustered', cluster_entity=True)

    print('####### ' + s + ' ########')
    print(res)

####### s1 ########
                          PanelOLS Estimation Summary                           
Dep. Variable:          norm_emot_100   R-squared:                        0.0367
Estimator:                   PanelOLS   R-squared (Between):              0.1288
No. Observations:                9779   R-squared (Within):               0.0473
Date:                Wed, Apr 28 2021   R-squared (Overall):              0.1068
Time:                        13:23:43   Log-likelihood                -1.113e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      13.109
Entities:                         443   P-value                           0.0000
Avg Obs:                       22.074   Distribution:                 F(27,9279)
Min Obs:                       2.0000                                           
Max Obs:                       31.000   F-statistic (robust):             12.784
        

In [32]:
# Load the 2021-04-07 final product.
df = pd.read_csv('/home/twalton_umass_edu/Political Polarization Project/final/final_product_20210407.csv')

# Schools looped over by the per-school model cells.
schools = ['s1', 's2', 's3', 's4', 's5', 's6', 's8']

# Keep only rows flagged dem_org == 1.
# NOTE(review): the original comment said "only want republicans", but the flag name and the
# dem_*.csv export elsewhere in this notebook suggest this is the Democratic sample -- confirm.
df = df[df['dem_org'] == 1]

# Number of data points before any period/activity filtering.
print('total original data points: ' + str(len(df)))

# Drop the first three periods due to missing ties.
df = df[(df['period'] >= 4)]

# Keep only user/school pairs that have more than 25 rows.
pair_keys = ['user_id', 'school']
df = df.groupby(pair_keys).filter(lambda x: len(x) > 25)

# Fill NaNs with 0.
df = df.fillna(0)

# Per user/school pair: variance of rep_count and dem_count, and total number of tweets.
# The joins add the columns rep_count_var, dem_count_var and num_tweets_sum.
df = df.join(df.groupby(pair_keys)['rep_count'].var(), on=pair_keys, rsuffix='_var')
df = df.join(df.groupby(pair_keys)['dem_count'].var(), on=pair_keys, rsuffix='_var')
df = df.join(df.groupby(pair_keys)['num_tweets'].sum(), on=pair_keys, rsuffix='_sum')

# Keep users that have at least one tweet, or non-zero variance in both counts.
df = df[(df['num_tweets_sum'] != 0) | ((df['dem_count_var'] != 0) & (df['rep_count_var'] != 0))]

# Human-readable names for the topic-model columns X0..X45.
topics = {
    "X0": "education", "X1": "names", "X2": "punish_law", "X3": "racism",
    "X4": "covid", "X5": "event", "X6": "trump", "X7": "verbs", "X8": "congress_bills",
    "X9": "spam", "X10": "right_outrage", "X11": "primaries", "X12": "vp_nomination",
    "X13": "russia", "X14": "hero_RBG_BLM_Lewis", "X15": "arguments", "X16": "fed_investigation",
    "X17": "social_media", "X18": "healthcare", "X19": "senate_race", "X20": "contractions",
    "X21": "vote_tomorrow", "X22": "mixture", "X23": "protest", "X24": "scotus",
    "X25": "institutions", "X26": "climate", "X27": "economy", "X28": "vote_methods",
    "X29": "knew_had", "X30": "family", "X31": "date_time", "X32": "grateful",
    "X33": "slang", "X34": "celebrate", "X35": "progress_fight", "X36": "geographic_location",
    "X37": "food", "X38": "slang_2", "X39": "numbers", "X40": "covid_restrictions", "X41": "trump_2",
    "X42": "holiday", "X43": "sport", "X44": "news_media", "X45": "no_topic",
}

# Rename topic variables.
df = df.rename(columns=topics)

# log(x + 1) transforms of the count variables.
df['log_dem_count'] = np.log(df['dem_count'] + 1)
df['log_rep_count'] = np.log(df['rep_count'] + 1)
df['log_tot_pol'] = np.log(df['rep_count'] + df['dem_count'] + 1)
df['log_total_count'] = np.log(df['total_count'] + 1)
df['log_num_tweets'] = np.log(df['num_tweets'] + 1)

# Z-score each emot_<window>_mean column over the whole filtered frame.
for window in (50, 100, 150, 200):
    emot_mean = df['emot_{}_mean'.format(window)]
    df['norm_emot_{}'.format(window)] = (emot_mean - emot_mean.mean()) / emot_mean.std()

# log(x + 1) transform of the topic columns used as regressors; order fixes the column order.
logged_topics = [
    'punish_law', 'racism', 'covid', 'trump', 'congress_bills', 'right_outrage',
    'primaries', 'vp_nomination', 'russia', 'fed_investigation', 'arguments',
    'social_media', 'healthcare', 'senate_race', 'vote_tomorrow', 'protest',
    'scotus', 'institutions', 'climate', 'economy', 'vote_methods',
    'progress_fight', 'covid_restrictions', 'trump_2', 'news_media',
]
for topic in logged_topics:
    df[topic + '_log'] = np.log(df[topic] + 1)

total original data points: 181770


In [33]:
# Order rows by user and period once, so the grouped diffs below are period-over-period.
# Rebinding (instead of inplace=True) avoids mutating a filtered frame in place, and the
# original's second, identical sort was redundant.
df = df.sort_values(['user_id', 'period'])

# Change in rep_count / dem_count relative to the previous row of the same user/school pair.
# The first row of each pair has no predecessor, so its change is NaN.
pair_keys = ['user_id', 'school']
df['rep_change'] = df.groupby(pair_keys)['rep_count'].diff()
df['dem_change'] = df.groupby(pair_keys)['dem_count'].diff()

# Rows where rep_count or dem_count changed from the previous period, or the user tweeted.
# NOTE(review): NaN != 0 is True, so each pair's first-period row always passes this filter --
# confirm that is intended.
new = df.loc[(df['rep_change'] != 0) | (df['dem_change'] != 0) | (df['num_tweets'] != 0)]

# Rows with a non-zero emot_100_mean (the original comment describes this as "at least one
# tweet" -- presumably because missing scores were filled with 0; confirm).
# The original `new2 = df = ...` also overwrote df, which made re-running this cell operate on
# already-filtered data; only new2 is bound here.
new2 = df.loc[df['emot_100_mean'] != 0]

print(len(new))
print(len(new2))

57283
54935


In [34]:
# Base two-way fixed-effects model, fitted separately for each school in `schools`.
# Dependent variable: emot_100_mean (un-normalised); regressors: log_rep_count and log_dem_count.
base_formula = 'emot_100_mean ~ + log_rep_count + log_dem_count + TimeEffects + EntityEffects'

for s in schools:
    # Panel for this school, indexed by (entity, time) = (user_id, period).
    school = new.loc[new['school'] == s].set_index(['user_id', 'period'])

    # Standard errors clustered by entity.
    mod = PanelOLS.from_formula(base_formula, school)
    res = mod.fit(cov_type='clustered', cluster_entity=True)

    print('####### ' + s + ' ########')
    print(res)

####### s1 ########
                          PanelOLS Estimation Summary                           
Dep. Variable:          emot_100_mean   R-squared:                        0.0004
Estimator:                   PanelOLS   R-squared (Between):              0.4051
No. Observations:                9779   R-squared (Within):               0.0005
Date:                Fri, Apr 16 2021   R-squared (Overall):              0.3046
Time:                        18:33:54   Log-likelihood                 1.007e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      1.9432
Entities:                         443   P-value                           0.1433
Avg Obs:                       22.074   Distribution:                  F(2,9304)
Min Obs:                       2.0000                                           
Max Obs:                       31.000   F-statistic (robust):             2.1311
        

In [35]:
# Topic-augmented two-way fixed-effects model on emot_100_mean, fitted separately per school.
# The original formula string listed vp_nomination_log, social_media_log, senate_race_log and
# progress_fight_log twice; the fitted model already had 27 regressors (F(27, ...) in the
# output), so building the formula from a de-duplicated list leaves the estimates unchanged.
topic_regressors = [
    'punish_law_log', 'racism_log', 'covid_log', 'trump_log', 'congress_bills_log',
    'right_outrage_log', 'primaries_log', 'vp_nomination_log', 'russia_log',
    'fed_investigation_log', 'arguments_log', 'social_media_log', 'healthcare_log',
    'senate_race_log', 'vote_tomorrow_log', 'protest_log', 'scotus_log',
    'institutions_log', 'climate_log', 'economy_log', 'vote_methods_log',
    'progress_fight_log', 'covid_restrictions_log', 'trump_2_log', 'news_media_log',
]
regressors = ['log_rep_count', 'log_dem_count'] + topic_regressors
formula = 'emot_100_mean ~ ' + ' + '.join(regressors) + ' + TimeEffects + EntityEffects'

for s in schools:
    # Panel for this school, indexed by (entity, time) = (user_id, period).
    school = new[new['school'] == s].set_index(['user_id', 'period'])

    # Standard errors clustered by entity.
    mod = PanelOLS.from_formula(formula, school)
    res = mod.fit(cov_type='clustered', cluster_entity=True)

    print('####### ' + s + ' ########')
    print(res)

####### s1 ########
                          PanelOLS Estimation Summary                           
Dep. Variable:          emot_100_mean   R-squared:                        0.0478
Estimator:                   PanelOLS   R-squared (Between):              0.4559
No. Observations:                9779   R-squared (Within):               0.0608
Date:                Fri, Apr 16 2021   R-squared (Overall):              0.3633
Time:                        18:36:01   Log-likelihood                  1.03e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      17.255
Entities:                         443   P-value                           0.0000
Avg Obs:                       22.074   Distribution:                 F(27,9279)
Min Obs:                       2.0000                                           
Max Obs:                       31.000   F-statistic (robust):             13.734
        