In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from linearmodels import PanelOLS
from sklearn.decomposition import PCA

In [None]:
# Build the Republican user/period panel from the 45-topic final product.
# Leaves three notebook-level names for the cells below: `df`, `schools`, `topics`.

#read in most recent final product
# NOTE(review): absolute, user-specific path -- consider a configurable data directory
df = pd.read_csv('/home/twalton_umass_edu/Political Polarization Project/final/final_product_topic_45_20210427.csv')

#list of schools to loop through
schools = ['s1', 's2', 's3', 's4', 's5', 's6', 's8']

#only want republicans
df = df[df['rep_org'] == 1]

#number of data points
print('total original data points: ' + str(len(df)))

#get rid of first three periods due to missing ties
df = df[df['period'] >= 4]

#keep only user_id/school pairs that still have more than 25 rows
df = df.groupby(['user_id', 'school']).filter(lambda x: len(x) > 25)

#fill in nans with 0
df = df.fillna(0)

#per user_id/school pair: variance of rep_count and dem_count, and total number of tweets,
#broadcast back onto every row of the pair
pair_groups = df.groupby(['user_id', 'school'])
df = df.assign(
    rep_count_var=pair_groups['rep_count'].transform('var'),
    dem_count_var=pair_groups['dem_count'].transform('var'),
    num_tweets_sum=pair_groups['num_tweets'].transform('sum'),
)

#keep users that have variance in both counts, or at least one tweet
df = df[(df['num_tweets_sum'] != 0) | ((df['dem_count_var'] != 0) & (df['rep_count_var'] != 0))]

#topic column labels; several values are misspelled ('law_constituion', 'awesom', ...) but are
#kept as-is because they become column names that the lookups below (and the saved CSV) rely on
topics = {'X0': 'news_media', 'X1': 'location', 'X2': 'numerics', 'X3': 'contractions', 'X4': 'names'
         ,'X5': 'religion', 'X6': 'derogatory_media', 'X7': 'covid_restrictions', 'X8': 'innovate'
         ,'X9': 'republicans', 'X10': 'economics', 'X11': 'family', 'X12': 'police_violence', 'X13': 'date_time'
         ,'X14': 'argument', 'X15': 'fed_investigation', 'X16': 'spam', 'X17': 'covid', 'X18': 'misc'
         ,'X19': 'law_constituion', 'X20': 'riot', 'X21': 'scotus', 'X22': 'outdoors', 'X23': 'sports'
         ,'X24': 'racism', 'X25': 'voting', 'X26': 'ideology', 'X27': 'movement', 'X28': 'social_media'
         ,'X29': 'democracy_future', 'X30': 'holiday', 'X31': 'possiblity', 'X32': 'awesom', 'X33': 'prayer'
         ,'X34': 'social_media_derogatory', 'X35': 'foreign_enemies', 'X36': 'small_talk', 'X37': 'election'
         ,'X38': 'calendar_day', 'X39': 'globalism', 'X40': 'food', 'X41': 'random _objects', 'X42': 'biden_campaign'
         ,'X43': 'yeah', 'X44': 'sexual_violence', 'X45': 'no_topic'}

#rename topic variables
df = df.rename(columns = topics)

#log(x + 1) of the count variables
df['log_dem_count'] = np.log(df['dem_count'] + 1)
df['log_rep_count'] = np.log(df['rep_count'] + 1)
df['log_tot_pol'] = np.log(df['rep_count'] + df['dem_count'] + 1)
df['log_total_count'] = np.log(df['total_count'] + 1)
df['log_num_tweets'] = np.log(df['num_tweets'] + 1)

#z-score each emotion measure over the rows kept above
# NOTE(review): mean/std are taken after fillna(0), so missing scores count as 0 -- confirm intended
for window in (50, 100, 150, 200):
    emot = df['emot_' + str(window) + '_mean']
    df['norm_emot_' + str(window)] = (emot - emot.mean()) / emot.std()

#log(x + 1) of selected topic columns: output column -> source column.
#Each output is listed once (the original recomputed three of them), in first-creation order.
topic_log_sources = {
    'news_media_log': 'news_media',
    'covid_restrictions_log': 'covid_restrictions',
    'republicans_log': 'republicans',
    'economics_log': 'economics',
    'police_violence_log': 'police_violence',
    'argument_log': 'argument',
    'fed_investigation_log': 'fed_investigation',
    'law_constitution_log': 'law_constituion',
    'racism_log': 'racism',
    'voting_log': 'voting',
    'ideology_log': 'ideology',
    'social_media_log': 'social_media',
    'democracy_future_log': 'democracy_future',
    'riot_log': 'riot',
    'covid_log': 'covid',
    'scotus_log': 'scotus',
    'derogatory_media_log': 'derogatory_media',
    'social_media_derogatory_log': 'social_media_derogatory',
    'foreign_enemies_log': 'foreign_enemies',
    'election_log': 'election',
    'globalism_log': 'globalism',
    'biden_campaign_log': 'biden_campaign',
    'innovate_log': 'innovate',
    #X2 is renamed to 'numerics' above; df['numbers'] does not exist and raised KeyError
    'numbers_log': 'numerics',
    'names_log': 'names',
    'location_log': 'location',
    #X32 is renamed to 'awesom' above; df['awesome'] does not exist and raised KeyError
    'awesome_log': 'awesom',
    'sexual_violence_log': 'sexual_violence',
}
for log_name, topic_name in topic_log_sources.items():
    df[log_name] = np.log(df[topic_name] + 1)

In [3]:
print(len(df))

111360


In [3]:
#get rep/dem change variables: period-over-period difference of rep_count and dem_count
#within each user_id/school pair (the first row of every pair gets NaN from diff()).
#One non-mutating sort is enough; the original sorted twice on the same keys, in place.
df = df.sort_values(['user_id', 'period'])
pair_changes = df.groupby(['user_id', 'school'])[['rep_count', 'dem_count']].diff()
df['rep_change'] = pair_changes['rep_count']
df['dem_change'] = pair_changes['dem_count']

In [5]:
#rows where rep_change or dem_change is non-zero, or the row has at least one tweet
# NOTE(review): diff() leaves NaN on each pair's first row and NaN != 0 is True,
# so those first rows always pass this filter -- confirm that is intended
rep_moved = df['rep_change'].ne(0)
dem_moved = df['dem_change'].ne(0)
tweeted = df['num_tweets'].ne(0)
new = df.loc[rep_moved | dem_moved | tweeted]

In [4]:
#at least one tweet: keep rows whose emot_100_mean is non-zero
has_emotion_score = df['emot_100_mean'].ne(0)
new2 = df.loc[has_emotion_score]

In [7]:
#row counts of the two filtered samples: `new` first, then `new2`
for sample in (new, new2):
    print(len(sample))

80355
74420


In [5]:
#write the `new2` sample to disk (index included, as with to_csv defaults)
rep_csv_path = '/home/twalton_umass_edu/Political Polarization Project/rep_20210428.csv'
new2.to_csv(rep_csv_path)

In [8]:
#get base models for all 7 schools: normalised emotion on the two log counts,
#with entity and time effects, fitted separately per school on `new2`
base_formula = 'norm_emot_100 ~ + log_rep_count + log_dem_count + TimeEffects + EntityEffects'
for school_id in schools:
    #panel for this school, indexed by (user_id, period)
    panel = new2.loc[new2['school'] == school_id].set_index(['user_id', 'period'])

    #rep model, covariance clustered by entity
    fitted = PanelOLS.from_formula(base_formula, panel).fit(cov_type = 'clustered', cluster_entity = True)
    print('####### ' + school_id + ' ########')
    print(fitted)

####### s1 ########
                          PanelOLS Estimation Summary                           
Dep. Variable:          norm_emot_100   R-squared:                        0.0024
Estimator:                   PanelOLS   R-squared (Between):             -1.6611
No. Observations:                4763   R-squared (Within):               0.0012
Date:                Wed, Apr 28 2021   R-squared (Overall):             -1.3368
Time:                        13:20:57   Log-likelihood                   -6526.2
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      5.5170
Entities:                         233   P-value                           0.0040
Avg Obs:                       20.442   Distribution:                  F(2,4498)
Min Obs:                       1.0000                                           
Max Obs:                       31.000   F-statistic (robust):             2.9323
        

In [9]:
#get topic-augmented models for all 7 schools: the base regressors plus the
#log topic columns, with entity and time effects, fitted per school on `new2`
topic_terms = [
    'log_rep_count', 'log_dem_count',
    'covid_restrictions_log', 'economics_log', 'argument_log', 'fed_investigation_log',
    'law_constitution_log', 'racism_log', 'voting_log', 'ideology_log', 'riot_log',
    'covid_log', 'police_violence_log', 'derogatory_media_log', 'foreign_enemies_log',
    'globalism_log',
    'TimeEffects', 'EntityEffects',
]
#joins to exactly the formula string the cell used before
topic_formula = 'norm_emot_100 ~ + ' + ' + '.join(topic_terms)
for school_id in schools:
    #panel for this school, indexed by (user_id, period)
    panel = new2.loc[new2['school'] == school_id].set_index(['user_id', 'period'])

    #rep model, covariance clustered by entity
    fitted = PanelOLS.from_formula(topic_formula, panel).fit(cov_type = 'clustered', cluster_entity = True)
    print('####### ' + school_id + ' ########')
    print(fitted)

####### s1 ########
                          PanelOLS Estimation Summary                           
Dep. Variable:          norm_emot_100   R-squared:                        0.0194
Estimator:                   PanelOLS   R-squared (Between):             -1.7551
No. Observations:                4763   R-squared (Within):               0.0222
Date:                Wed, Apr 28 2021   R-squared (Overall):             -1.4452
Time:                        13:21:49   Log-likelihood                   -6485.4
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      5.5470
Entities:                         233   P-value                           0.0000
Avg Obs:                       20.442   Distribution:                 F(16,4484)
Min Obs:                       1.0000                                           
Max Obs:                       31.000   F-statistic (robust):             5.3895
        

In [10]:
# Build the Republican user/period panel from the 20210402 final product.
# Rebinds the notebook-level names `df`, `schools` and `topics`.

#read in most recent final product
# NOTE(review): absolute, user-specific path -- consider a configurable data directory
df = pd.read_csv('/home/twalton_umass_edu/Political Polarization Project/final/final_product_20210402.csv')

#list of schools to loop through
schools = ['s1', 's2', 's3', 's4', 's5', 's6', 's8']

#only want republicans
df = df[df['rep_org'] == 1]

#number of data points
print('total original data points: ' + str(len(df)))

#get rid of first three periods due to missing ties
df = df[df['period'] >= 4]

#keep only user_id/school pairs that still have more than 25 rows
df = df.groupby(['user_id', 'school']).filter(lambda x: len(x) > 25)

#fill in nans with 0
df = df.fillna(0)

#per user_id/school pair: variance of rep_count and dem_count, and total number of tweets,
#broadcast back onto every row of the pair
pair_groups = df.groupby(['user_id', 'school'])
df = df.assign(
    rep_count_var=pair_groups['rep_count'].transform('var'),
    dem_count_var=pair_groups['dem_count'].transform('var'),
    num_tweets_sum=pair_groups['num_tweets'].transform('sum'),
)

#keep users that have variance in both counts, or at least one tweet
df = df[(df['num_tweets_sum'] != 0) | ((df['dem_count_var'] != 0) & (df['rep_count_var'] != 0))]

#topic column labels; several values are misspelled ('law_constituion', 'awesom', ...) but are
#kept as-is because they become column names that the lookups below rely on
topics = {'X0': 'news_media', 'X1': 'location', 'X2': 'numerics', 'X3': 'contractions', 'X4': 'names'
         ,'X5': 'religion', 'X6': 'derogatory_media', 'X7': 'covid_restrictions', 'X8': 'innovate'
         ,'X9': 'republicans', 'X10': 'economics', 'X11': 'family', 'X12': 'police_violence', 'X13': 'date_time'
         ,'X14': 'argument', 'X15': 'fed_investigation', 'X16': 'spam', 'X17': 'covid', 'X18': 'misc'
         ,'X19': 'law_constituion', 'X20': 'riot', 'X21': 'scotus', 'X22': 'outdoors', 'X23': 'sports'
         ,'X24': 'racism', 'X25': 'voting', 'X26': 'ideology', 'X27': 'movement', 'X28': 'social_media'
         ,'X29': 'democracy_future', 'X30': 'holiday', 'X31': 'possiblity', 'X32': 'awesom', 'X33': 'prayer'
         ,'X34': 'social_media_derogatory', 'X35': 'foreign_enemies', 'X36': 'small_talk', 'X37': 'election'
         ,'X38': 'calendar_day', 'X39': 'globalism', 'X40': 'food', 'X41': 'random _objects', 'X42': 'biden_campaign'
         ,'X43': 'yeah', 'X44': 'sexual_violence', 'X45': 'no_topic'}

#rename topic variables
df = df.rename(columns = topics)

#log(x + 1) of the count variables
df['log_dem_count'] = np.log(df['dem_count'] + 1)
df['log_rep_count'] = np.log(df['rep_count'] + 1)
df['log_tot_pol'] = np.log(df['rep_count'] + df['dem_count'] + 1)
df['log_total_count'] = np.log(df['total_count'] + 1)
df['log_num_tweets'] = np.log(df['num_tweets'] + 1)

#z-score each emotion measure over the rows kept above
# NOTE(review): mean/std are taken after fillna(0), so missing scores count as 0 -- confirm intended
for window in (50, 100, 150, 200):
    emot = df['emot_' + str(window) + '_mean']
    df['norm_emot_' + str(window)] = (emot - emot.mean()) / emot.std()

#log(x + 1) of selected topic columns: output column -> source column.
#Each output is listed once (the original computed police_violence_log twice).
topic_log_sources = {
    'news_media_log': 'news_media',
    'covid_restrictions_log': 'covid_restrictions',
    #was written as 'republicnas_log'; spelled correctly here
    'republicans_log': 'republicans',
    'economics_log': 'economics',
    'police_violence_log': 'police_violence',
    'argument_log': 'argument',
    'fed_investigation_log': 'fed_investigation',
    'law_constitution_log': 'law_constituion',
    'racism_log': 'racism',
    'voting_log': 'voting',
    'ideology_log': 'ideology',
    'social_media_log': 'social_media',
    'democracy_future_log': 'democracy_future',
    'riot_log': 'riot',
    'covid_log': 'covid',
    'scotus_log': 'scotus',
    'derogatory_media_log': 'derogatory_media',
    'social_media_derogatory_log': 'social_media_derogatory',
    'foreign_enemies_log': 'foreign_enemies',
    'election_log': 'election',
    'globalism_log': 'globalism',
    'biden_campaign_log': 'biden_campaign',
}
for log_name, topic_name in topic_log_sources.items():
    df[log_name] = np.log(df[topic_name] + 1)

total original data points: 224969


In [74]:
#get rep/dem change variables: period-over-period difference of rep_count and dem_count
#within each user_id/school pair (the first row of every pair gets NaN from diff()).
#One non-mutating sort is enough; the original sorted twice on the same keys, in place.
df = df.sort_values(['user_id', 'period'])
pair_changes = df.groupby(['user_id', 'school'])[['rep_count', 'dem_count']].diff()
df['rep_change'] = pair_changes['rep_count']
df['dem_change'] = pair_changes['dem_count']

#rows where rep_change or dem_change is non-zero, or the row has at least one tweet
# NOTE(review): NaN != 0 is True, so each pair's first row (NaN diff) always passes -- confirm intended
new = df.loc[(df['rep_change'] != 0) | (df['dem_change'] != 0) | (df['num_tweets'] != 0)]

#at least one tweet
#`df` is no longer overwritten here (was `new2 = df = ...`), so re-running this cell
#recomputes the changes from the full frame instead of the already-filtered one
new2 = df.loc[df['emot_100_mean'] != 0]

print(len(new))

80355


In [77]:
#get base models for all 7 schools: raw emot_100_mean on the two log counts,
#with entity and time effects, fitted separately per school on `new`
base_formula = 'emot_100_mean ~ + log_rep_count + log_dem_count + TimeEffects + EntityEffects'
for school_id in schools:
    #panel for this school, indexed by (user_id, period)
    panel = new.loc[new['school'] == school_id].set_index(['user_id', 'period'])

    #rep model, covariance clustered by entity
    fitted = PanelOLS.from_formula(base_formula, panel).fit(cov_type = 'clustered', cluster_entity = True)
    print('####### ' + school_id + ' ########')
    print(fitted)

####### s1 ########
                          PanelOLS Estimation Summary                           
Dep. Variable:          emot_100_mean   R-squared:                        0.0025
Estimator:                   PanelOLS   R-squared (Between):             -2.1113
No. Observations:                5108   R-squared (Within):               0.0011
Date:                Fri, Apr 16 2021   R-squared (Overall):             -1.5147
Time:                        18:44:19   Log-likelihood                    4868.7
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      6.0386
Entities:                         247   P-value                           0.0024
Avg Obs:                       20.680   Distribution:                  F(2,4829)
Min Obs:                       2.0000                                           
Max Obs:                       31.000   F-statistic (robust):             4.0692
        

In [78]:
#get topic-augmented models for all 7 schools: the base regressors plus the
#log topic columns, with entity and time effects, fitted per school on `new`.
#Each regressor is listed exactly once: the original formula named police_violence_log
#twice. The stray '+' directly after '~' is dropped as well.
regressors = [
    'log_rep_count', 'log_dem_count',
    'covid_restrictions_log', 'economics_log', 'police_violence_log', 'argument_log',
    'fed_investigation_log', 'law_constitution_log', 'racism_log', 'voting_log',
    'ideology_log', 'riot_log', 'covid_log', 'derogatory_media_log',
    'foreign_enemies_log', 'globalism_log',
]
topic_formula = 'emot_100_mean ~ ' + ' + '.join(regressors + ['TimeEffects', 'EntityEffects'])

for s in schools:
    #panel for this school, indexed by (user_id, period)
    school = new[new['school'] == s].set_index(['user_id', 'period'])

    #rep model, covariance clustered by entity
    mod = PanelOLS.from_formula(topic_formula, school)
    res = mod.fit(cov_type = 'clustered', cluster_entity = True)
    print('####### ' + s + ' ########')
    print(res)

####### s1 ########
                          PanelOLS Estimation Summary                           
Dep. Variable:          emot_100_mean   R-squared:                        0.0137
Estimator:                   PanelOLS   R-squared (Between):             -2.1885
No. Observations:                5108   R-squared (Within):               0.0147
Date:                Fri, Apr 16 2021   R-squared (Overall):             -1.5970
Time:                        18:44:56   Log-likelihood                    4897.5
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      4.1748
Entities:                         247   P-value                           0.0000
Avg Obs:                       20.680   Distribution:                 F(16,4815)
Min Obs:                       2.0000                                           
Max Obs:                       31.000   F-statistic (robust):             4.0166
        