In [62]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [63]:
dem_df = pd.read_csv('../data/post_dem_candidates_with_tweet_topics.csv', encoding="ISO-8859-1").dropna(subset=['primary_pctg'])
rep_df = pd.read_csv('../data/post_rep_candidates_with_tweet_topics.csv', encoding="ISO-8859-1").dropna(subset=['primary_pctg'])
sa_all_tweets = pd.read_csv('../data/tweet_data/sa_all_tweets_post.csv', index_col=False)

print(str(len(dem_df[(dem_df['won_primary'] == 'Yes') & (dem_df['num_tweets'] >= 10)])) + " : " + str(len(dem_df[dem_df['won_primary'] == 'Yes'])))
print(str(len(dem_df[(dem_df['won_primary'] == 'No') & (dem_df['num_tweets'] >= 10)])) + " : " + str(len(dem_df[dem_df['won_primary'] == 'No'])))

# Add these lines if you only want to include candidates who have tweeted 10+ times
dem_df = dem_df[dem_df['num_tweets'] >= 10]
rep_df = rep_df[rep_df['num_tweets'] >= 10]

all_tweets_no_topics = pd.read_csv('../data/tweet_data/all_tweets_cleaned.csv', index_col=False)
all_tweets_topics = pd.read_csv('../data/tweet_data/tweets_with_topics.csv', index_col=False)

rep_df.columns

188 : 202
467 : 543


Index(['candidate', 'twitter_handle', 'twitter_handle2', 'state', 'district',
       'office_type', 'race_type', 'race_primary_election_date',
       'primary_status', 'primary_runoff_status', 'general_status',
       'primary_pctg', 'won_primary', 'rep_party_support', 'trump_endorsed',
       'bannon_endorsed', 'great_america_endorsed', 'nra_endorsed',
       'right_to_life_endorsed', 'susan_b_anthony_endorsed',
       'club_for_growth_endorsed', 'koch_support', 'house_freedom_support',
       'tea_party_endorsed', 'main_street_endorsed', 'chamber_endorsed',
       'no_labels_support', 'has_twitter', 'num_endorsements',
       'gun_control_tweet_count', 'health_care_tweet_count',
       'abortion_tweet_count', 'immigration_tweet_count',
       'kavanaugh_tweet_count', 'education_tweet_count', 'jobs_tweet_count',
       'tax_tweet_count', 'fake_news_tweet_count', 'environment_tweet_count',
       'sexual_assault_tweet_count', 'gender_tweet_count', 'women_tweet_count',
       'lgbt_twee

In [64]:
# topics = ['gun_control', 'health_care',
#        'abortion', 'immigration', 'kavanaugh', 'education', 'jobs', 'tax',
#        'fake_news', 'environment', 'sexual_assault', 'gender', 'women', 'lgbt',
#        'freedom', 'trump']

topics = ['gun_control', 'health_care',
       'abortion', 'immigration', 'education', 'jobs'
          , 'environment','women', 'lgbt', 'freedom', 'trump']

In [65]:
rep_senators = rep_df[rep_df['office_type'] == 'Senator']
rep_house = rep_df[rep_df['office_type'] == 'Representative']
rep_govs = rep_df[rep_df['office_type'] == 'Governor']
dem_senators = dem_df[dem_df['office_type'] == 'Senator']
dem_house = dem_df[dem_df['office_type'] == 'Representative']
dem_govs = dem_df[dem_df['office_type'] == 'Governor']

In [66]:
len(dem_df[dem_df['twitter_handle'].isna() == True])/ len(dem_df)

0.11572700296735905

In [67]:
# % of candidates without twitter
print(len(dem_df[dem_df['has_twitter'] == False]) / len(dem_df))
print(len(rep_df[rep_df['has_twitter'] == False]) / len(rep_df))

0.11572700296735905
0.7415730337078652


In [68]:
# add sentiment scores
sa_all_tweets.head(5)

Unnamed: 0,tweet_id,stripped_text,twitter_handle,sentiment
0,0,With corruption being one of the biggest conce...,CA_Countryman,0
1,1,With corruption being one of the biggest conce...,CA_Countryman,0
2,2,Here is an important notice concerning Christo...,CA_Countryman,2
3,3,I believe that we need to find real solutions ...,CA_Countryman,2
4,4,"It's time to turn the page on Alabama's past, ...",CA_Countryman,2


## Encode data

In [69]:
drop_cols = ['primary_status', 'general_status', 'primary_runoff_status', 'twitter_handle', 'twitter_handle2']

# encode data
encode_dem_cols = ['won_primary', 'candidate','has_twitter', 'state',  'district'
                   , 'office_type', 'race_type', 'race', 'veteran', 'race_primary_election_date',
    'lgbtq', 'elected_official', 'self_funder', 'stem', 'obama_alum', 'dem_party_support', 'emily_endorsed',
    'gun_sense_candidate', 'biden_endorsed', 'warren_endorsed', 'sanders_endorsed', 'our_revolution_endorsed',
    'justice_dems_endorsed', 'pccc_endorsed', 'indivisible_endorsed', 'wfp_endorsed', 'votevets_endorsed', 'no_labels_support']

encode_rep_cols = ['won_primary','candidate', 'has_twitter', 'state',  'district'
                   , 'office_type', 'race_type', 'race_primary_election_date',
    'rep_party_support', 'trump_endorsed', 'bannon_endorsed', 'great_america_endorsed', 'nra_endorsed',
    'right_to_life_endorsed', 'susan_b_anthony_endorsed', 'club_for_growth_endorsed', 'koch_support', 'house_freedom_support',
    'tea_party_endorsed', 'main_street_endorsed','chamber_endorsed', 'no_labels_support']

topics_col = ['{}_tweet_count'.format(t) for t in topics]

model_dem_cols = encode_dem_cols.copy()
model_dem_cols.extend(['num_endorsements', 'num_tweets', 'sentiment'])
# model_dem_cols.extend(['num_endorsements', 'num_tweets', 'count_negative', 'count_neutral', 'count_positive', 'sentiment'])
model_dem_cols.extend(topics_col)

model_rep_cols = encode_rep_cols.copy()
model_rep_cols.extend(['num_endorsements', 'num_tweets', 'sentiment'])
# model_rep_cols.extend(['num_endorsements', 'num_tweets', 'count_negative', 'count_neutral', 'count_positive', 'sentiment'])
model_rep_cols.extend(topics_col)

label_encoder = LabelEncoder()
state_map = {}
def label_encode_data(df, encode_cols):
    for col in encode_cols:
        print(col)
        df[col] = label_encoder.fit_transform(df[col].values.astype(str))
        le_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
        if col == 'state':
            state_map = le_mapping
            print(state_map)
        if col not in ['candidate', 'state', 'district', 'race_primary_election_date']:
            print(le_mapping)

    return df

dem_df_enc = label_encode_data(dem_df, encode_dem_cols)
rep_df_enc = label_encode_data(rep_df, encode_rep_cols)

# dem_df_enc.to_csv('../data/encoded_dem_data.csv', index=False)
# rep_df_enc.to_csv('../data/encoded_rep_data.csv', index=False)


won_primary
{'No': 0, 'Yes': 1, 'nan': 2}
candidate
has_twitter
{'False': 0, 'True': 1}
state
{'AL': 0, 'AR': 1, 'AZ': 2, 'CA': 3, 'CO': 4, 'GA': 5, 'IA': 6, 'ID': 7, 'IL': 8, 'IN': 9, 'KS': 10, 'KY': 11, 'MD': 12, 'ME': 13, 'MI': 14, 'MO': 15, 'MS': 16, 'MT': 17, 'NC': 18, 'ND': 19, 'NE': 20, 'NJ': 21, 'NM': 22, 'NV': 23, 'NY': 24, 'OH': 25, 'OK': 26, 'OR': 27, 'PA': 28, 'SC': 29, 'SD': 30, 'TN': 31, 'TX': 32, 'UT': 33, 'VA': 34, 'WV': 35}
district
office_type
{'Governor': 0, 'Representative': 1, 'Senator': 2}
race_type
{'Regular': 0, 'Special': 1}
race
{'Nonwhite': 0, 'White': 1, 'nan': 2}
veteran
{'No': 0, 'Yes': 1, 'nan': 2}
race_primary_election_date
lgbtq
{'No': 0, 'Yes': 1, 'nan': 2}
elected_official
{'No': 0, 'Yes': 1, 'nan': 2}
self_funder
{'No': 0, 'Yes': 1}
stem
{'No': 0, 'Yes': 1, 'nan': 2}
obama_alum
{'No': 0, 'Yes': 1, 'nan': 2}
dem_party_support
{'No': 0, 'Yes': 1, 'nan': 2}
emily_endorsed
{'No': 0, 'Yes': 1, 'nan': 2}
gun_sense_candidate
{'No': 0, 'Yes': 1, 'nan': 2}
bi

In [71]:
dem_df_enc = dem_df_enc.drop(columns=['count_negative', 'count_neutral', 'count_positive'])
dem_df_enc.columns.values

array(['candidate', 'twitter_handle', 'twitter_handle2', 'state',
       'district', 'office_type', 'race_type',
       'race_primary_election_date', 'primary_status',
       'primary_runoff_status', 'general_status', 'partisan_lean',
       'primary_pctg', 'won_primary', 'race', 'veteran', 'lgbtq',
       'elected_official', 'self_funder', 'stem', 'obama_alum',
       'dem_party_support', 'emily_endorsed', 'gun_sense_candidate',
       'biden_endorsed', 'warren_endorsed', 'sanders_endorsed',
       'our_revolution_endorsed', 'justice_dems_endorsed',
       'pccc_endorsed', 'indivisible_endorsed', 'wfp_endorsed',
       'votevets_endorsed', 'no_labels_support', 'has_twitter',
       'num_endorsements', 'gun_control_tweet_count',
       'health_care_tweet_count', 'abortion_tweet_count',
       'immigration_tweet_count', 'kavanaugh_tweet_count',
       'education_tweet_count', 'jobs_tweet_count', 'tax_tweet_count',
       'fake_news_tweet_count', 'environment_tweet_count',
       'sexual

## Democratic - mixedlm

In [72]:
dem_df_enc.head(5)

Unnamed: 0,candidate,twitter_handle,twitter_handle2,state,district,office_type,race_type,race_primary_election_date,primary_status,primary_runoff_status,...,fake_news_tweet_count,environment_tweet_count,sexual_assault_tweet_count,gender_tweet_count,women_tweet_count,lgbt_tweet_count,freedom_tweet_count,trump_tweet_count,num_tweets,sentiment
1,106,CA_Countryman,,0,0,0,0,8,Lost,,...,0,8,0,0,0,10,1,15,614,2.628664
3,248,jamesfields_al,,0,0,0,0,8,Lost,,...,0,1,0,0,0,0,0,0,83,3.373494
4,594,SueBellCobb,,0,0,0,0,8,Lost,,...,0,3,0,0,1,0,2,0,392,3.285714
5,651,WaltMaddox,,0,0,0,0,8,Advanced,,...,0,2,0,0,0,0,5,1,1256,3.372611
6,397,LizzettaMcConn1,,0,21,1,0,8,Lost,,...,0,0,0,0,0,0,0,0,116,3.37931


In [73]:
dem_x_cols = " + ".join(model_dem_cols[1:])
dem_model_data = dem_df_enc[model_dem_cols] # includes candidate
print(len(dem_model_data.columns))
print(len(dem_x_cols.split('+')))

print(len(dem_model_data['won_primary']))
print(len(dem_model_data['candidate']))

print(dem_x_cols.split('+'))
print(dem_model_data.columns)

model = smf.mixedlm("won_primary ~ " + dem_x_cols, dem_model_data, groups=dem_model_data["candidate"]).fit()
print(model.summary())
model.summary().tables[1].to_csv('dem_mixedlm_won_primary.csv')

42
41
674
674
['candidate ', ' has_twitter ', ' state ', ' district ', ' office_type ', ' race_type ', ' race ', ' veteran ', ' race_primary_election_date ', ' lgbtq ', ' elected_official ', ' self_funder ', ' stem ', ' obama_alum ', ' dem_party_support ', ' emily_endorsed ', ' gun_sense_candidate ', ' biden_endorsed ', ' warren_endorsed ', ' sanders_endorsed ', ' our_revolution_endorsed ', ' justice_dems_endorsed ', ' pccc_endorsed ', ' indivisible_endorsed ', ' wfp_endorsed ', ' votevets_endorsed ', ' no_labels_support ', ' num_endorsements ', ' num_tweets ', ' sentiment ', ' gun_control_tweet_count ', ' health_care_tweet_count ', ' abortion_tweet_count ', ' immigration_tweet_count ', ' education_tweet_count ', ' jobs_tweet_count ', ' environment_tweet_count ', ' women_tweet_count ', ' lgbt_tweet_count ', ' freedom_tweet_count ', ' trump_tweet_count']
Index(['won_primary', 'candidate', 'has_twitter', 'state', 'district',
       'office_type', 'race_type', 'race', 'veteran',
       'r

IndexError: index 671 is out of bounds for axis 0 with size 596

### w/ primary pctg

In [12]:
# model_dem_cols2 = encode_dem_cols[1:].copy() # remove won_primary
# model_dem_cols2.extend(['num_endorsements', 'num_tweets','primary_pctg'])
# # topics_col = ['{}_tweet_count'.format(t) for t in topics]
# # model_dem_cols2.extend(topics_col)

# dem_x_cols2 = " + ".join(model_dem_cols2[2:])
# # dem_x_cols2 += ' + num_endorsements'
# # data_cols2 = encode_dem_cols[1:].copy() # remove won_primary, has candidate
# # data_cols2.extend(['num_endorsements', 'primary_pctg'])
# dem_model_data2 = dem_df_enc[model_dem_cols2]

# print(model_dem_cols2)
# # print(len(dem_x_cols2.split('+')))
# # print(len(dem_model_data2.columns))
# # print(len(dem_model_data2))
# # print(len(dem_model_data2['candidate']))
# # print(len(dem_model_data2['primary_pctg']))

In [13]:
# # NOTE: would not converge with added topics & sentiment

# model = smf.mixedlm("primary_pctg ~ " + dem_x_cols2, dem_model_data2, groups=dem_model_data2["candidate"]).fit()
# results_summary = model.summary()
# dem_mixedlm_primary_pctg = results_summary.tables[1]
# print(model.summary())

In [14]:
# #  {'Governor': 0, 'Representative': 1, 'Senator': 2}
# exclude_cols = set(['office_type', 'primary_pctg'])
# c = [e for e in model_dem_cols if e not in exclude_cols]
# # print(c)
# model_cols = " + ".join(c)
# print(model_cols)
# dem_senators_model_data = dem_df_enc[dem_df_enc['office_type'] == 1][c]
# model = smf.mixedlm("won_primary ~ " + model_cols, dem_senators_model_data, groups=dem_senators_model_data["candidate"]).fit()
# results_summary = model.summary()
# dem_mixedlm_won_primary = results_summary.tables[1]
# print(results_summary)

## Republican - mixedlm

In [15]:
rep_df_enc.columns

Index(['candidate', 'twitter_handle', 'twitter_handle2', 'state', 'district',
       'office_type', 'race_type', 'race_primary_election_date',
       'primary_status', 'primary_runoff_status', 'general_status',
       'primary_pctg', 'won_primary', 'rep_party_support', 'trump_endorsed',
       'bannon_endorsed', 'great_america_endorsed', 'nra_endorsed',
       'right_to_life_endorsed', 'susan_b_anthony_endorsed',
       'club_for_growth_endorsed', 'koch_support', 'house_freedom_support',
       'tea_party_endorsed', 'main_street_endorsed', 'chamber_endorsed',
       'no_labels_support', 'has_twitter', 'num_endorsements',
       'gun_control_tweet_count', 'health_care_tweet_count',
       'abortion_tweet_count', 'immigration_tweet_count',
       'kavanaugh_tweet_count', 'education_tweet_count', 'jobs_tweet_count',
       'tax_tweet_count', 'fake_news_tweet_count', 'environment_tweet_count',
       'sexual_assault_tweet_count', 'gender_tweet_count', 'women_tweet_count',
       'lgbt_twee

In [16]:
rep_x_cols = " + ".join(model_rep_cols[1:])
rep_model_data = rep_df_enc[model_rep_cols] # includes candidate

In [17]:
model = smf.mixedlm("won_primary ~ " + rep_x_cols, rep_model_data, groups=rep_model_data["candidate"]).fit()
results_summary = model.summary()
rep_mixedlm_won_primary = results_summary.tables[1]
results_summary.tables[1].to_csv('rep_mixedlm_won_primary.csv')
print(results_summary)

                Mixed Linear Model Regression Results
Model:                MixedLM     Dependent Variable:     won_primary
No. Observations:     534         Method:                 REML       
No. Groups:           521         Scale:                  0.0008     
Min. group size:      1           Likelihood:             -311.3076  
Max. group size:      2           Converged:              Yes        
Mean group size:      1.0                                            
---------------------------------------------------------------------
                           Coef.  Std.Err.   z    P>|z| [0.025 0.975]
---------------------------------------------------------------------
Intercept                  -0.879    0.215 -4.099 0.000 -1.300 -0.459
candidate                  -0.000    0.000 -0.263 0.792 -0.000  0.000
has_twitter                 0.119    0.154  0.770 0.441 -0.184  0.422
state                       0.002    0.003  0.793 0.428 -0.003  0.007
district                   -0.000   

In [18]:
# model_rep_cols2 = encode_rep_cols[1:].copy()
# model_rep_cols2.extend(['num_endorsements', 'num_tweets', 'primary_pctg'])
# model_rep_cols2.extend(topics_col)

# rep_model_data = rep_df_enc[model_rep_cols2] 
# dem_x_cols2 = " + ".join(model_rep_cols2[2:])
# dem_x_cols2 += ' + num_endorsements'

In [19]:
# model = smf.mixedlm("primary_pctg ~ " + dem_x_cols2, rep_model_data, groups=rep_model_data["candidate"]).fit()
# results_summary = model.summary()
# rep_mixedlm_primary_pctg = results_summary.tables[1]
# print(model.summary())

# Understanding Democratic Factors

- Examine factors with low p-values
- Of those factors, which ones actually have substantial data available?

In [73]:
print(dem_mixedlm_won_primary.columns)
# dem_mixedlm_won_primary.drop(dem_mixedlm_won_primary.tail(1).index,inplace=True) # drops last, useless column
dem_mixedlm_won_primary['P>|z|'] = dem_mixedlm_won_primary['P>|z|'].astype('float')
low_p = dem_mixedlm_won_primary[dem_mixedlm_won_primary['P>|z|'] < 0.05]
low_p.sort_values('Coef.', ascending=False)

NameError: name 'dem_mixedlm_won_primary' is not defined

In [56]:
binary_col = ['has_twitter', 'veteran', 'lgbtq','elected_official', 'self_funder', 'stem',
       'obama_alum', 'emily_endorsed', 'race',
       'gun_sense_candidate', 'biden_endorsed', 'warren_endorsed',
       'sanders_endorsed', 'our_revolution_endorsed', 'justice_dems_endorsed',
       'pccc_endorsed', 'indivisible_endorsed', 'wfp_endorsed',
       'votevets_endorsed', 'no_labels_support']

results = []

for col in binary_col:
    num_yes = len(dem_df_enc[dem_df_enc[col] == 1])
    num_no = len(dem_df_enc[dem_df_enc[col] == 0])
    num_total = len(dem_df_enc)
    results.append([col
                    , round(num_yes / num_total, 3)
                    , round(num_no / num_total, 3)
                    , round((num_total - (num_yes + num_no))/num_total, 3)])

empty_ratio = pd.DataFrame(results, columns=['col_name', 'num_yes', 'num_no', 'num_none'])
empty_ratio.sort_values('num_none')

Unnamed: 0,col_name,num_yes,num_no,num_none
0,has_twitter,0.881,0.119,0.0
4,self_funder,0.053,0.947,0.0
6,obama_alum,0.043,0.956,0.001
1,veteran,0.154,0.832,0.014
2,lgbtq,0.048,0.937,0.014
3,elected_official,0.166,0.82,0.014
5,stem,0.175,0.811,0.014
8,race,0.553,0.25,0.197
9,gun_sense_candidate,0.237,0.228,0.535
13,our_revolution_endorsed,0.106,0.302,0.592


In [57]:
# low_p_cols = low_p.index.tolist() # ommitted "race_primary" col - all of it is filled out

# yes_ratio = []
# no_ratio = []
# none_ratio = []
# for c in low_p_cols:
#     print(c)
#     print(empty_ratio[empty_ratio['col_name'] == c].num_yes.values[0])
#     yes_ratio.append(empty_ratio[empty_ratio['col_name'] == c].num_yes.values[0])
#     no_ratio.append(empty_ratio[empty_ratio['col_name'] == c].num_no.values[0])
#     none_ratio.append(empty_ratio[empty_ratio['col_name'] == c].num_none.values[0])

In [None]:
# low_p['num_none'] = none_ratio
# low_p['num_yes'] = yes_ratio
# low_p['num_no'] = no_ratio
# low_p.sort_values('num_none')

In [None]:
# total_dems = len(dem_df)
# # print(total_dems)
# print('previously elected official & won: ', round(len(dem_df[(dem_df['elected_official'] == 1) & (dem_df['won_primary'] == 1)])  / total_dems, 2))
# print('previously elected official & lost: ', round(len(dem_df[(dem_df['elected_official'] == 1) & (dem_df['won_primary'] == 0)]) / total_dems, 2))
# print('not previously elected official & lost: ', round(len(dem_df[(dem_df['elected_official'] == 0) & (dem_df['won_primary'] == 0)]) / total_dems, 2))
# print('not prexviously elected official & won: ', round(len(dem_df[(dem_df['elected_official'] == 0) & (dem_df['won_primary'] == 1)]) / total_dems, 2))
# print('-----')
# print('stem & won: ', round(len(dem_df[(dem_df['stem'] == 1) & (dem_df['won_primary'] == 1)])  / total_dems, 2))
# print('stem & lost: ', round(len(dem_df[(dem_df['stem'] == 1) & (dem_df['won_primary'] == 0)]) / total_dems, 2))
# print('not stem & lost: ', round(len(dem_df[(dem_df['stem'] == 0) & (dem_df['won_primary'] == 0)]) / total_dems, 2))
# print('not stem & won: ', round(len(dem_df[(dem_df['stem'] == 0) & (dem_df['won_primary'] == 1)]) / total_dems, 2))
# print('-----')
# print('white & won: ', round(len(dem_df[(dem_df['race'] == 1) & (dem_df['won_primary'] == 1) ])  / total_dems, 2))
# print('white & lost: ', round(len(dem_df[(dem_df['race'] == 1) & (dem_df['won_primary'] == 0)]) / total_dems, 2))
# print('non-white & lost: ', round(len(dem_df[(dem_df['race'] == 0) & (dem_df['won_primary'] == 0)]) / total_dems, 2))
# print('non-white & won: ', round(len(dem_df[(dem_df['race'] == 0) & (dem_df['won_primary'] == 1)]) / total_dems, 2))

# print('-----')
# print('white & won: ', round(len(dem_df[(dem_df['race'] == 1) 
#                                         & (dem_df['won_primary'] == 1) 
#                                         & (dem_df['emily_endorsed'] == 1) ])  / total_dems, 2))
# print('white & lost: ', round(len(dem_df[(dem_df['race'] == 1) & (dem_df['won_primary'] == 0)
#                                         & (dem_df['emily_endorsed'] == 1)]) / total_dems, 2))
# print('non-white & lost: ', round(len(dem_df[(dem_df['race'] == 0) & (dem_df['won_primary'] == 0)
#                                             & (dem_df['emily_endorsed'] == 1)]) / total_dems, 2))
# print('non-white & won: ', round(len(dem_df[(dem_df['race'] == 0) & (dem_df['won_primary'] == 1)
#                                            & (dem_df['emily_endorsed'] == 1)]) / total_dems, 2))







In [None]:
# # emily endorsed?
# print("total emily endorsed: ", len(dem_df[(dem_df['emily_endorsed'] == 1)]))
# print("emily endorsed & won: ", len(dem_df[(dem_df['emily_endorsed'] == 1) & (dem_df['won_primary'] == 1)]))
# print('---')
# print("emily endorsed & white: ", len(dem_df[(dem_df['emily_endorsed'] == 1) & (dem_df['race'] == 0)]))
# print("emily endorsed & non-white: ", len(dem_df[(dem_df['emily_endorsed'] == 1) & (dem_df['race'] == 1)]))
# print('\n==========')
# print("total NOT emily endorsed: ", len(dem_df[(dem_df['emily_endorsed'] == 0)]))
# print("NOT emily endorsed & won: ", len(dem_df[(dem_df['emily_endorsed'] == 0) & (dem_df['won_primary'] == 1)]))

# print('\nstates where emily endorsed:')
# states = []
# print(state_map)
# for s in dem_df[(dem_df['emily_endorsed'] == 1)].state.unique():
#     states.append(state_map[s])
# print(states)
# # print(dem_df[(dem_df['emily_endorsed'] == 1)].state.unique())

### conclusions
- existence of twitter_handle is strongest signal
    - is this because it's the most filled out?
- elected official
    - non-previously elected officials (696) performed better than previous elected (150), though group size different
    - not a huge coef. makes sense
- stem
    - is stem bad..?
    - stem is bad :(
- emily endorsd
    - high win percentage if you're emily endorsed, high loss % if you were not
    - but only 30% of candidates were endorsed/not-endorsed

# Republican Factors

In [None]:
print(rep_mixedlm_won_primary.columns)
# rep_mixedlm_won_primary.drop(rep_mixedlm_won_primary.tail(1).index,inplace=True) # drops last, useless column
rep_mixedlm_won_primary['P>|z|'] = rep_mixedlm_won_primary['P>|z|'].astype('float')
low_p = rep_mixedlm_won_primary[rep_mixedlm_won_primary['P>|z|'] < 0.07]
low_p.sort_values('Coef.', ascending=False)

In [None]:
binary_col = [ 'has_twitter',
       'rep_party_support', 'trump_endorsed', 'bannon_endorsed',
       'great_america_endorsed', 'nra_endorsed', 'right_to_life_endorsed',
       'susan_b_anthony_endorsed', 'club_for_growth_endorsed', 'koch_support',
       'house_freedom_support', 'tea_party_endorsed', 'main_street_endorsed',
       'chamber_endorsed', 'no_labels_support']

results = []

for col in binary_col:
    num_yes = len(rep_df[rep_df[col] == 1])
    num_no = len(rep_df[rep_df[col] == 0])
    num_total = len(rep_df)
    results.append([col
                    , round(num_yes / num_total, 3)
                    , round(num_no / num_total, 3)
                    , round((num_total - (num_yes + num_no))/num_total,3)])

r = pd.DataFrame(results, columns=['col_name', 'num_yes', 'num_no', 'num_none'])
r.sort_values('num_none')

In [None]:
# dem_mixedlm_won_primary.to_csv('../mixed_lm_results/dem_ml_won_primary.csv')
# dem_mixedlm_primary_pctg.to_csv('../mixed_lm_results/dem_mixedlm_primary_pctg.csv')
# rep_mixedlm_won_primary.to_csv('../mixed_lm_results/rep_mixedlm_won_primary.csv')
# rep_mixedlm_primary_pctg.to_csv('../mixed_lm_results/rep_mixedlm_primary_pctg.csv')