# Project- 5

### Import Libraries

In [273]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder

### Load Data

In [274]:
# read data
df=pd.read_csv('./data/2018_gov.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,dist,cong,gender,birthyr,age,educ,race,faminc,marstat,newsint,approval_gov,ideo5,voted_gov
0,392782,19,115,Female,1964,54,4-Year,White,50k - 60k,Single / Never Married,Most of the time,Strongly Approve,Liberal,[Democrat / Candidate 1]
1,392786,52,115,Female,1990,28,Some College,White,20k - 30k,Single / Never Married,Most of the time,Strongly Approve,Liberal,[Democrat / Candidate 1]
2,392800,49,115,Male,1932,86,Post-Grad,White,120k - 150k,Widowed,Most of the time,Strongly Disapprove,Very Conservative,[Republican / Candidate 2]
3,392832,36,115,Male,1938,80,Some College,White,20k - 30k,Widowed,Most of the time,Strongly Disapprove,Conservative,[Republican / Candidate 2]
4,392860,36,115,Male,1935,83,4-Year,White,Prefer not to say,Married,Most of the time,Strongly Disapprove,Conservative,[Republican / Candidate 2]


In [275]:
df.shape

(3752, 14)

In [276]:
# drop unnamed: 0, birthyr and cong columns 
# (age column and birthyr show the same information, cong has one category so can't be used as a variable)
df=df.drop(columns=['Unnamed: 0', 'birthyr', 'cong'])

In [277]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3752 entries, 0 to 3751
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   dist          3752 non-null   int64 
 1   gender        3752 non-null   object
 2   age           3752 non-null   int64 
 3   educ          3752 non-null   object
 4   race          3752 non-null   object
 5   faminc        3752 non-null   object
 6   marstat       3752 non-null   object
 7   newsint       3752 non-null   object
 8   approval_gov  3752 non-null   object
 9   ideo5         3752 non-null   object
 10  voted_gov     3752 non-null   object
dtypes: int64(2), object(9)
memory usage: 322.6+ KB


In [278]:
# baseline accuracy
df['voted_gov'].value_counts(normalize=True)

[Democrat / Candidate 1]       0.612207
[Republican / Candidate 2]     0.367537
I Did Not Vote In This Race    0.009861
Not Sure                       0.006663
Other                          0.003731
Name: voted_gov, dtype: float64

Baseline accuracy is 61.2%. We have unimbalanced data.

## Pre-processing

In [279]:
# transform voted_gov column to categorical dtype and save it to a new column
df['voted_gov_num'] = pd.Categorical(
    df['voted_gov'], 
    categories= ['[Democrat / Candidate 1]', 
                 '[Republican / Candidate 2]', 
                 'I Did Not Vote In This Race', 
                 'Not Sure', 
                 'Other']
)
codes, uniques = pd.factorize(df['voted_gov_num'])
df['voted_gov_num']=codes
df.head()

Unnamed: 0,dist,gender,age,educ,race,faminc,marstat,newsint,approval_gov,ideo5,voted_gov,voted_gov_num
0,19,Female,54,4-Year,White,50k - 60k,Single / Never Married,Most of the time,Strongly Approve,Liberal,[Democrat / Candidate 1],0
1,52,Female,28,Some College,White,20k - 30k,Single / Never Married,Most of the time,Strongly Approve,Liberal,[Democrat / Candidate 1],0
2,49,Male,86,Post-Grad,White,120k - 150k,Widowed,Most of the time,Strongly Disapprove,Very Conservative,[Republican / Candidate 2],1
3,36,Male,80,Some College,White,20k - 30k,Widowed,Most of the time,Strongly Disapprove,Conservative,[Republican / Candidate 2],1
4,36,Male,83,4-Year,White,Prefer not to say,Married,Most of the time,Strongly Disapprove,Conservative,[Republican / Candidate 2],1


In [280]:
# set up X and y
X = df.drop(columns =['voted_gov', 'voted_gov_num'])
y = df['voted_gov_num']

In [281]:
X= pd.get_dummies(X, drop_first=True)

In [282]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, stratify=y)

## Modeling

#### RandomForest

In [283]:
rf = RandomForestClassifier()

In [284]:
et = ExtraTreesClassifier()

In [285]:
cross_val_score(rf, X_train, y_train, cv=5).mean()

0.8901891873099752

In [286]:
cross_val_score(et, X_train, y_train, cv=5).mean()

0.8862802854560281

Random forest and extremely randomized trees performed pretty much the same, random forest is slightly better.

In [175]:
rf.get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [267]:
# gridsearch 
rf_params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 2, 3, 4, 5],
    'max_features': ['sqrt', .5],
    'bootstrap': [True, False],
    'random_state': [42]
}

gs_rf = GridSearchCV(rf, rf_params, cv=5, n_jobs=-1, verbose=1)
gs_rf.fit(X_train, y_train)
print(gs.best_score_)
gs_rf.best_params_

Fitting 5 folds for each of 72 candidates, totalling 360 fits
0.8951657048222852


{'bootstrap': True,
 'max_depth': None,
 'max_features': 0.5,
 'n_estimators': 100,
 'random_state': 42}

In [291]:
# gridsearch optimization
print('best score:', gs_rf.best_score_)
print('best parameters:', gs_rf.best_params_)
print('accuracy on train data:', gs_rf.score(X_train, y_train))
print('accuracy on test data:', gs_rf.score(X_val, y_val))

best score: 0.8948104650354292
best parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 0.5, 'n_estimators': 100, 'random_state': 42}
accuracy on train data: 0.9996446339729922
accuracy on test data: 0.8901918976545842


It is overfit.

In [288]:
# get feature importances
pd.Series(gs_rf.best_estimator_.feature_importances_, X.columns).sort_values(ascending=False)

approval_gov_Strongly Disapprove                 0.451253
age                                              0.064271
dist                                             0.059220
approval_gov_Strongly Approve                    0.046886
ideo5_Liberal                                    0.043347
ideo5_Very Conservative                          0.035639
ideo5_Very Liberal                               0.030772
approval_gov_Disapprove / Somewhat Disapprove    0.029810
ideo5_Moderate                                   0.018984
race_White                                       0.013969
gender_Male                                      0.012325
ideo5_Not Sure                                   0.010158
race_Black                                       0.010144
educ_4-Year                                      0.009864
marstat_Single / Never Married                   0.009439
marstat_Married                                  0.008832
newsint_Most of the time                         0.008799
race_Hispanic 

-  For a classfication random forest model, feature_importances_ is based on the average decrease in Gini impurity over the decision trees. 
- Strongly disapprove category of approval_gov feature has a 45.1% importance in the random forest model. For this variable, individuals were asked "Do you approve of the way each is doing their job". 
- approval_gov_Strongly Approve has a 4.7% importance.
- Age has 6.4% and dist has 5.9%. (dist is Congressional district number in current Congress)

In [289]:
# approval_gov column categories
df['approval_gov'].value_counts()

Strongly Disapprove                 1197
Approve / Somewhat Approve          1128
Strongly Approve                     895
Disapprove / Somewhat Disapprove     385
Never Heard / Not Sure               147
Name: approval_gov, dtype: int64