In [1]:
# Loading packages

import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
# Load the DataFrame from a local pickle file (presumably one row per tweet,
# given the tweet_id / text columns listed by info() -- confirm).
# NOTE(review): unpickling can execute arbitrary code, so this file must only
# ever come from a trusted source.
df = pd.read_pickle('documents_topics.pkl')
# Print the schema (columns, non-null counts, dtypes) as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57790 entries, 0 to 57789
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      57790 non-null  object 
 1   text                          57790 non-null  object 
 2   created_at                    57790 non-null  object 
 3   campaign_week                 57790 non-null  int64  
 4   process_text_check            57790 non-null  object 
 5   name                          57790 non-null  object 
 6   handle                        57790 non-null  object 
 7   party                         57790 non-null  object 
 8   state_code                    57790 non-null  object 
 9   state_name                    57790 non-null  object 
 10  result_pctg                   57790 non-null  float64
 11  result_votes                  57790 non-null  int64  
 12  position                      57790 non-null  int64  
 13  t

## Test for H1a and H1b

In [3]:
# Work on an independent copy: a plain assignment would only alias `df`, so the
# columns added further down in this cell would silently modify the raw frame too.
df_h1a_h1b = df.copy()

def is_empty_or_nan(x):
    """Return True when ``x`` is NaN or falsy, and False otherwise.

    NaN is detected through self-inequality (``x != x``); every other falsy
    value (``None``, ``''``, an empty list, ``0`` ...) is caught by the
    truthiness check.
    """
    # NaN is the one value that does not compare equal to itself.
    if x != x:
        return True
    return not x

# Flag every row as 'local' (check_city_filtered holds something) or 'national'
# (check_city_filtered is empty/NaN). The two indicators are complementary:
# exactly one of them is 1 on each row.
# NOTE(review): presumably check_city_filtered lists the cities detected in the tweet -- confirm.
city_refs = df_h1a_h1b['check_city_filtered']
df_h1a_h1b['local'] = city_refs.apply(lambda cities: 0 if is_empty_or_nan(cities) else 1)
df_h1a_h1b['national'] = city_refs.apply(lambda cities: 1 if is_empty_or_nan(cities) else 0)

In [4]:
# Check the schema again: 'local' and 'national' should now be present (29 -> 31 columns).
df_h1a_h1b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57790 entries, 0 to 57789
Data columns (total 31 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      57790 non-null  object 
 1   text                          57790 non-null  object 
 2   created_at                    57790 non-null  object 
 3   campaign_week                 57790 non-null  int64  
 4   process_text_check            57790 non-null  object 
 5   name                          57790 non-null  object 
 6   handle                        57790 non-null  object 
 7   party                         57790 non-null  object 
 8   state_code                    57790 non-null  object 
 9   state_name                    57790 non-null  object 
 10  result_pctg                   57790 non-null  float64
 11  result_votes                  57790 non-null  int64  
 12  position                      57790 non-null  int64  
 13  t

In [5]:
# Keep only the columns needed for the race-level regression and statistical tests;
# everything else (tweet text, topic flags, candidate metadata, raw vote counts, ...)
# is discarded.
# Selecting the five wanted columns replaces the former 26-name drop list, which
# passed a redundant axis=1 next to columns= and raised a KeyError when the cell
# was run a second time (the columns were already gone). The resulting frame has
# the same columns in the same order.
df_h1a_h1b = df_h1a_h1b[
    ['state_name', 'position', 'winner_margin_for_runners_up', 'local', 'national']
].copy()
df_h1a_h1b.head(10)

Unnamed: 0,state_name,position,winner_margin_for_runners_up,local,national
0,New Hampshire,2,0.090608,1,0
1,New Hampshire,2,0.090608,1,0
2,New Hampshire,2,0.090608,1,0
3,New Hampshire,2,0.090608,0,1
4,New Hampshire,2,0.090608,0,1
5,New Hampshire,2,0.090608,0,1
6,New Hampshire,2,0.090608,1,0
7,New Hampshire,2,0.090608,0,1
8,New Hampshire,2,0.090608,1,0
9,New Hampshire,2,0.090608,1,0


In [6]:
# Collapse the tweet-level frame to one row per race, using state_name as the race key.
# Columns that are not listed in agg_dict (such as position) do not appear in the result.
agg_dict = dict(
    # tweet counts per race
    local='sum',
    national='sum',
    # per the original note this value is the same for every tweet of a race,
    # so the mean simply recovers it
    winner_margin_for_runners_up='mean',
)

# Build the race-level frame; reset_index turns state_name back into a regular column
df_h1a_h1b = (
    df_h1a_h1b
    .groupby(['state_name'])
    .agg(agg_dict)
    .reset_index()
)
df_h1a_h1b.head(50)


Unnamed: 0,state_name,local,national,winner_margin_for_runners_up
0,Alabama,459,457,0.358326
1,Alaska,572,400,0.07146
2,Arizona,400,924,0.048874
3,Arkansas,390,1113,0.348388
4,California,337,1244,0.182714
5,Colorado,541,924,0.146179
6,Connecticut,113,297,0.157094
7,Florida,718,1610,0.16413
8,Georgia,662,854,0.028061
9,Hawaii,1,4,0.521716


In [7]:
# Re-display the race-level frame (shows the same rows as the previous cell's output).
df_h1a_h1b.head(50)

Unnamed: 0,state_name,local,national,winner_margin_for_runners_up
0,Alabama,459,457,0.358326
1,Alaska,572,400,0.07146
2,Arizona,400,924,0.048874
3,Arkansas,390,1113,0.348388
4,California,337,1244,0.182714
5,Colorado,541,924,0.146179
6,Connecticut,113,297,0.157094
7,Florida,718,1610,0.16413
8,Georgia,662,854,0.028061
9,Hawaii,1,4,0.521716


In [8]:
# Turn the raw tweet counts into shares of each race's total tweet volume.
# The two shares add up to 1 for every race by construction.
tweets_per_race = df_h1a_h1b['local'] + df_h1a_h1b['national']
df_h1a_h1b['national_proportion'] = df_h1a_h1b['national'] / tweets_per_race
df_h1a_h1b['local_proportion'] = df_h1a_h1b['local'] / tweets_per_race

# Keep only the regression inputs: the raw counts and the state label are dropped
race = df_h1a_h1b.drop(columns=['local', 'national', 'state_name'])
race.head(10)

Unnamed: 0,winner_margin_for_runners_up,national_proportion,local_proportion
0,0.358326,0.498908,0.501092
1,0.07146,0.411523,0.588477
2,0.048874,0.697885,0.302115
3,0.348388,0.740519,0.259481
4,0.182714,0.786844,0.213156
5,0.146179,0.630717,0.369283
6,0.157094,0.72439,0.27561
7,0.16413,0.691581,0.308419
8,0.028061,0.563325,0.436675
9,0.521716,0.8,0.2


In [9]:
# Standardise each proportion to a z-score: subtract the column mean, then divide
# by the column standard deviation.
for raw_col, z_col in (
    ('national_proportion', 'normalized_national_proportion'),
    ('local_proportion', 'normalized_local_proportion'),
):
    centered = race[raw_col] - race[raw_col].mean()
    race[z_col] = centered / race[raw_col].std()

# Only the standardised versions are kept from here on
race = race.drop(columns=['local_proportion', 'national_proportion'])
race.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 3 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   winner_margin_for_runners_up    35 non-null     float64
 1   normalized_national_proportion  35 non-null     float64
 2   normalized_local_proportion     35 non-null     float64
dtypes: float64(3)
memory usage: 968.0 bytes


In [10]:
# Define a function to perform linear regression and print the results
def perform_linear_regression(data, dependent_var, independent_vars):
    """Fit an OLS model with an intercept and print its summary table.

    Parameters
    ----------
    data : DataFrame holding every column referenced below.
    dependent_var : name of the response column.
    independent_vars : list of predictor column names.

    Returns
    -------
    None. The statsmodels summary is printed, not returned.
    """
    response = data[dependent_var]
    # add_constant adds the intercept column, which OLS does not include on its own
    design_matrix = sm.add_constant(data[independent_vars])
    fitted = sm.OLS(response, design_matrix).fit()
    print(fitted.summary())


# Regress the winner's margin on each standardised tweet proportion in turn,
# using the race-level frame (one row per state). Each run prints a heading
# followed by the OLS summary.
regression_runs = (
    ("Race with Normalized Variables - National Proportion:", 'normalized_national_proportion'),
    ("Race with Normalized Variables - Local Proportion:", 'normalized_local_proportion'),
)
for heading, predictor in regression_runs:
    print(heading)
    perform_linear_regression(race, 'winner_margin_for_runners_up', [predictor])


Race with Normalized Variables - National Proportion:
                                 OLS Regression Results                                 
Dep. Variable:     winner_margin_for_runners_up   R-squared:                       0.120
Model:                                      OLS   Adj. R-squared:                  0.093
Method:                           Least Squares   F-statistic:                     4.499
Date:                          Tue, 09 May 2023   Prob (F-statistic):             0.0415
Time:                                  21:44:49   Log-Likelihood:                 22.498
No. Observations:                            35   AIC:                            -41.00
Df Residuals:                                33   BIC:                            -37.89
Df Model:                                     1                                         
Covariance Type:                      nonrobust                                         
                                     coef    std err    

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)





## Results




In [11]:
# Here the standardised tweet proportion is the outcome and
# winner_margin_for_runners_up is the predictor.
# OLS does not add an intercept on its own, so a column of ones is supplied.
race['const'] = 1
margin_design = race[['const', 'winner_margin_for_runners_up']]

# Local proportion ~ margin: build, fit, report
local_model = sm.OLS(race['normalized_local_proportion'], margin_design)
local_results = local_model.fit()
print(local_results.summary())

# National proportion ~ margin: build, fit, report
national_model = sm.OLS(race['normalized_national_proportion'], margin_design)
national_results = national_model.fit()
print(national_results.summary())

                                 OLS Regression Results                                
Dep. Variable:     normalized_local_proportion   R-squared:                       0.120
Model:                                     OLS   Adj. R-squared:                  0.093
Method:                          Least Squares   F-statistic:                     4.499
Date:                         Tue, 09 May 2023   Prob (F-statistic):             0.0415
Time:                                 21:44:51   Log-Likelihood:                -46.919
No. Observations:                           35   AIC:                             97.84
Df Residuals:                               33   BIC:                             100.9
Df Model:                                    1                                         
Covariance Type:                     nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
-----------------------

In [14]:
from stargazer.stargazer import Stargazer
# HTML comes from IPython.display, the public home of the display classes
# (IPython.core.display is an internal module).
from IPython.display import HTML

# Put both fitted models side by side in a single regression table
stargazer = Stargazer([national_results, local_results])
# Show the constant term under the label 'Intercept'
stargazer.rename_covariates({'const': 'Intercept'})

# Label the model columns; each label spans exactly one model
stargazer.custom_columns(['National Tweets', 'Local Tweets'], [1, 1])
HTML(stargazer.render_html())

0,1,2
,,
,,
,National Tweets,Local Tweets
,(1),(2)
,,
Intercept,-0.502*,0.502*
,(0.286),(0.286)
winner_margin_for_runners_up,2.517**,-2.517**
,(1.187),(1.187)
Observations,35,35


In [15]:
# Save the rendered regression table to disk. The encoding is set explicitly so
# the written file does not depend on the platform's default text encoding.
with open('regression_table.html', 'w', encoding='utf-8') as f:
    f.write(stargazer.render_html())