In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, recall_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
def class_metrics(X_train, y_train, X_test, y_test, model):
    """
    This Function takes in X, and y train 
    test data and prints out three metrics
    that we want to optimize for.
    """
    #Training metrics
    y_preds_train = model.predict(X_train)
    sens_tr = recall_score(y_train, y_preds_train, pos_label=1)
    spec_tr = recall_score(y_train, y_preds_train, pos_label=0)
    train_acc = model.score(X_train, y_train)
    
    #Test metrics
    y_preds_test = model.predict(X_test)
    sens_tst = recall_score(y_test, y_preds_test, pos_label=1)
    spec_tst = recall_score(y_test, y_preds_test, pos_label=0)
    test_acc = model.score(X_test , y_test)
    
    print('Train Accuracy - Correctly Classified:', round(train_acc,2))
    print('Test Accuracy - Correctly Classified:', round(test_acc,2))
    
    print('\nTrain Sensitivity - True Positvive Rate:', round(sens_tr,2))
    print('Test Sensitivity - True Positvive Rate:', round(sens_tst,2))
    
    print('\nTrain Specificity - True Negative Rate:', round(spec_tr,2))
    print('Test Specificity - True Negative Rate:', round(spec_tst,2))
    

In [3]:
#read in csv of feature selected from EDA notbook.
df = pd.read_csv('2018_gov.csv', index_col='Unnamed: 0')
df.head()

Unnamed: 0,dist,gender,age,educ,race,faminc,marstat,newsint,approval_gov,ideo5,voted_gov
392782,19,Female,54,4-Year,White,50k - 60k,Single / Never Married,Most of the time,Strongly Approve,Liberal,[Democrat / Candidate 1]
392786,52,Female,28,Some College,White,20k - 30k,Single / Never Married,Most of the time,Strongly Approve,Liberal,[Democrat / Candidate 1]
392800,49,Male,86,Post-Grad,White,120k - 150k,Widowed,Most of the time,Strongly Disapprove,Very Conservative,[Republican / Candidate 2]
392832,36,Male,80,Some College,White,20k - 30k,Widowed,Most of the time,Strongly Disapprove,Conservative,[Republican / Candidate 2]
392860,36,Male,83,4-Year,White,Prefer not to say,Married,Most of the time,Strongly Disapprove,Conservative,[Republican / Candidate 2]


#### Feature Engineer the target column.

Well define our target to be the 'voted_gov' column. well have to clean this up this feature first.

In [4]:
# Value counts funtion shows we have more then 2 options for our target.
df['voted_gov'].value_counts()

[Democrat / Candidate 1]       2297
[Republican / Candidate 2]     1379
I Did Not Vote In This Race      37
Not Sure                         25
Other                            14
Name: voted_gov, dtype: int64

In [5]:
# Drop the other, not sure, and I did not vote in this race.
df = df[(df['voted_gov'] == '[Democrat / Candidate 1]') | (df['voted_gov'] == '[Republican / Candidate 2]')]

After some debate whether to encode the non Republican or Democrat catagroies and include in our target, we decided to drop them. The reasoning for this is that these categories represent around 3% of our target data, we feel that this is a heavily underrepresented class and our classification models wouldn't be able to classify them with any accuracy. Also, the purpose of this project is to accurately predict whether a Republican or Democrat would win the election we don't see these votes having an effect on our classification.

#### Dummify Catagorical Columns

In [6]:
# Use pd.get_dummies to dummify the catagoricalcolums.
df = pd.get_dummies(df, columns=['gender', 'educ', 'race', 'faminc', 'marstat', 'newsint', 'approval_gov','ideo5', 'voted_gov'], drop_first=True)

#### Create our Target column.

In [7]:
# Relabling our target colum to signify the positive class is voting republican.
df.rename(columns={'voted_gov_[Republican / Candidate 2]': 'Voted Republican'}, inplace=True)

In [8]:
# Create X and y variables
X = df[df.columns[:-1]]
y = df['Voted Republican']

# Create a train test split with a 30% test size.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y, test_size=0.3)

### Grid Search: Logistic Regression / Knn / Niave Baise
---

Performed 3 Grid searches to find the best hyperparameters and we took those hyperparameters and added them to our final voting classifer, we used these models in order for our coefficcients to be interpretable. perhapes more value could be gained from these models.

In [9]:
# scale our data for a logistic regression model
lr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LogisticRegression()),
])

# find the optimal hyperparameters.
pipe_params ={
    'lr__C' : [1.0, 0.1, 0.01, 0.001],
    'lr__penalty': ['l1','l2','none'],

}

# Gridsearch using the specified hyper parameters.
grid_model = GridSearchCV(
    lr_pipe,
    pipe_params,
    cv = 5,
    n_jobs = -1,
    verbose = 2
)

# Fit our training data to the Grisearch.
grid_model.fit(X_train, y_train)
print(grid_model.best_params_)
print(grid_model.best_estimator_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'lr__C': 0.1, 'lr__penalty': 'l2'}
Pipeline(steps=[('ss', StandardScaler()), ('lr', LogisticRegression(C=0.1))])


        nan 0.90633297 0.91178044        nan 0.89739111 0.91178044]


In [10]:
class_metrics(X_train, y_train, X_test,y_test, grid_model)

Train Accuracy - Correctly Classified: 0.92
Test Accuracy - Correctly Classified: 0.91

Train Sensitivity - True Positvive Rate: 0.87
Test Sensitivity - True Positvive Rate: 0.86

Train Specificity - True Negative Rate: 0.95
Test Specificity - True Negative Rate: 0.94


Ok results would hope to improve the sensitivity let not so over fit. lets try KNN and see the results.

In [11]:
# Create a pipleline to scale our data fro KNN Classification.
lr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])


# find the optimal hyperparameters.
pipe_params ={
    'knn__n_neighbors' : [1 ,2, 3, 4, 5, 6, 7],
}

# Create a gridsearch model to find our Hyperparameters
grid_model = GridSearchCV(
    lr_pipe,
    pipe_params,
    cv = 5,
    n_jobs = -1,
    verbose = 2
)


# Fit our training data to the Grisearch.
grid_model.fit(X_train, y_train)
print(grid_model.best_params_)
print(grid_model.best_estimator_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
{'knn__n_neighbors': 7}
Pipeline(steps=[('ss', StandardScaler()),
                ('knn', KNeighborsClassifier(n_neighbors=7))])


In [12]:
class_metrics(X_train, y_train, X_test,y_test, grid_model)

Train Accuracy - Correctly Classified: 0.91
Test Accuracy - Correctly Classified: 0.84

Train Sensitivity - True Positvive Rate: 0.85
Test Sensitivity - True Positvive Rate: 0.73

Train Specificity - True Negative Rate: 0.94
Test Specificity - True Negative Rate: 0.9


In the above KNN model results seem to be a little overfit and some misclassification in the posisitve class. Run a Naive Baise model and evaluate the results.

In [43]:
# find the optimal hyperparameters.
grid_params ={
    'alpha': list(np.linspace(0.0, 1.0, num=5)),
    'fit_prior': [True, False]
}

# Fit our training data to the Grisearch.
grid_model = GridSearchCV(
    MultinomialNB(),
    grid_params,
    cv = 5,
    n_jobs = -1,
    verbose = 2
)

# Fit our training data to the Grisearch.
grid_model.fit(X_train, y_train)
print(grid_model.best_params_)
print(grid_model.best_estimator_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'alpha': 1.0, 'fit_prior': True}
MultinomialNB()


In [44]:
class_metrics(X_train, y_train, X_test,y_test, grid_model)

Train Accuracy - Correctly Classified: 0.92
Test Accuracy - Correctly Classified: 0.91

Train Sensitivity - True Positvive Rate: 0.89
Test Sensitivity - True Positvive Rate: 0.87

Train Specificity - True Negative Rate: 0.94
Test Specificity - True Negative Rate: 0.93


### Voting Classifer: Knn, Logistic Regression, MN Naive Baise
___

In [51]:
#Knn scaled pipeline
knn_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=7))
])


#logistic regression scaled pipeline.
lr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LogisticRegression(C=0.01, penalty='l2'))
])

#Create our voting classifer with knn
# logisitic regression, and MN Naive Baise models.
vote = VotingClassifier([
    ('knn_pipe', knn_pipe),
    ('lr_pipe', lr_pipe),
    ('nb', MultinomialNB(alpha=1, fit_prior=True))
])

# Fit our training data to our Voting classifer.
vote.fit(X_train, y_train)

VotingClassifier(estimators=[('knn_pipe',
                              Pipeline(steps=[('ss', StandardScaler()),
                                              ('knn',
                                               KNeighborsClassifier(n_neighbors=7))])),
                             ('lr_pipe',
                              Pipeline(steps=[('ss', StandardScaler()),
                                              ('lr',
                                               LogisticRegression(C=0.01))])),
                             ('nb', MultinomialNB(alpha=1))])

In [52]:
class_metrics(X_train, y_train, X_test,y_test, vote)

Train Accuracy - Correctly Classified: 0.93
Test Accuracy - Correctly Classified: 0.92

Train Sensitivity - True Positvive Rate: 0.89
Test Sensitivity - True Positvive Rate: 0.87

Train Specificity - True Negative Rate: 0.96
Test Specificity - True Negative Rate: 0.94


The Above voting classifer has improved all three of our metrics. Throughout this initial process we have had trouble imporving our sensitivity since our target class is unevenly balanced. We should note that our negative class those that vote democrat is 62% of our target, and our positive class vote republican, has 37% of our target data this value reflects the split of regestered democrats to republucan in the state of california. Lets see if adding more relivant data hel

### Lets include more relevant features to our model to improve performance.

To imporove our model we decided to incorporate more features into our model to see it these would imporve our classification metrics. We will now incoorporate, home ownership, and party indentifier. 

In [17]:
# Including new feature dataframe, index is voter ID
df_more = pd.read_csv('2018_gov_features.csv', index_col='Unnamed: 0')
df_more.head()

Unnamed: 0,dist,ownhome,pid3,gender,age,educ,race,faminc,marstat,newsint,approval_gov,ideo5,voted_gov
392782,19,Other,Democrat,Female,54,4-Year,White,50k - 60k,Single / Never Married,Most of the time,Strongly Approve,Liberal,[Democrat / Candidate 1]
392786,52,Rent,Democrat,Female,28,Some College,White,20k - 30k,Single / Never Married,Most of the time,Strongly Approve,Liberal,[Democrat / Candidate 1]
392800,49,Own,Other,Male,86,Post-Grad,White,120k - 150k,Widowed,Most of the time,Strongly Disapprove,Very Conservative,[Republican / Candidate 2]
392832,36,Own,Republican,Male,80,Some College,White,20k - 30k,Widowed,Most of the time,Strongly Disapprove,Conservative,[Republican / Candidate 2]
392860,36,Own,Independent,Male,83,4-Year,White,Prefer not to say,Married,Most of the time,Strongly Disapprove,Conservative,[Republican / Candidate 2]


In [18]:
# Only include those who voted for democrat or republican.
df_more = df_more[(df_more['voted_gov'] == '[Democrat / Candidate 1]') | (df_more['voted_gov'] == '[Republican / Candidate 2]')]

# Dummify catagorical columns
df_more = pd.get_dummies(df_more, columns=['dist', 'ownhome', 'pid3', 'gender', 'educ', 'race', 'faminc', 'marstat', 'newsint', 'approval_gov','ideo5', 'voted_gov'], drop_first=True)

In [19]:
#changing our target column to better indicate target "Voted Republican"
df_more.rename(columns={'voted_gov_[Republican / Candidate 2]': 'Voted Republican'}, inplace=True)

In [20]:
# Create X and y variables
X = df_more[df_more.columns[:-1]]
y = df_more['Voted Republican']

# Train Test split our X and y Variables.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [21]:
# scale our data for a logistic regression model
lr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LogisticRegression()),
])

# find the optimal hyperparameters.
pipe_params ={
    'lr__C' : [1.0, 0.1, 0.01, 0.001],
    'lr__penalty': ['l1','l2','none'],

}

# Gridsearch using the specified hyper parameters.
grid_model = GridSearchCV(
    lr_pipe,
    pipe_params,
    cv = 5,
    n_jobs = -1,
    verbose = 2
)

# Fit our training data to the Grisearch.
grid_model.fit(X_train, y_train)
print(grid_model.best_params_)
print(grid_model.best_estimator_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


        nan 0.92791985 0.92100282        nan 0.90825832 0.92100282]


{'lr__C': 0.01, 'lr__penalty': 'l2'}

In [22]:
class_metrics(X_train, y_train, X_test,y_test, grid_model)

Train Accuracy - Correctly Classified: 0.94
Test Accuracy - Correctly Classified: 0.92

Train Sensitivity - True Positvive Rate: 0.89
Test Sensitivity - True Positvive Rate: 0.86

Train Specificity - True Negative Rate: 0.97
Test Specificity - True Negative Rate: 0.96


In [23]:
# Create a pipleline to scale our data fro KNN Classification.
lr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])


# find the optimal hyperparameters.
pipe_params ={
    'knn__n_neighbors' : [1 ,2, 3, 4, 5, 6, 7],
}

# Create a gridsearch model to find our Hyperparameters
grid_model = GridSearchCV(
    lr_pipe,
    pipe_params,
    cv = 5,
    n_jobs = -1,
    verbose = 2
)


# Fit our training data to the Grisearch.
grid_model.fit(X_train, y_train)
print(grid_model.best_params_)
print(grid_model.best_estimator_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
{'knn__n_neighbors': 5}
Pipeline(steps=[('ss', StandardScaler()), ('knn', KNeighborsClassifier())])


In [24]:
class_metrics(X_train, y_train, X_test,y_test, grid_model)

Train Accuracy - Correctly Classified: 0.88
Test Accuracy - Correctly Classified: 0.8

Train Sensitivity - True Positvive Rate: 0.81
Test Sensitivity - True Positvive Rate: 0.67

Train Specificity - True Negative Rate: 0.93
Test Specificity - True Negative Rate: 0.87


In [41]:
# Listing our Naive Baise hyperparameters.
grid_params ={
    'alpha': list(np.linspace(0.0, 1.0, num=5)),
    'fit_prior': [True, False]
}

# Fit our training data to the Grisearch.
grid_model = GridSearchCV(
    MultinomialNB(),
    grid_params,
    cv = 5,
    n_jobs = -1,
    verbose = 2
)

# Fit our training data to the Grisearch.
grid_model.fit(X_train, y_train)
print(grid_model.best_params_)
print(grid_model.best_estimator_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'alpha': 1.0, 'fit_prior': True}
MultinomialNB()


In [26]:
class_metrics(X_train, y_train, X_test,y_test, grid_model)

Train Accuracy - Correctly Classified: 0.92
Test Accuracy - Correctly Classified: 0.91

Train Sensitivity - True Positvive Rate: 0.89
Test Sensitivity - True Positvive Rate: 0.87

Train Specificity - True Negative Rate: 0.94
Test Specificity - True Negative Rate: 0.93


### Try again with the voting classifer

In [34]:
#Knn scaled pipeline
knn_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5))
])


#logistic regression scaled pipeline.
lr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LogisticRegression(C=0.01, penalty='l2'))
])


vote = VotingClassifier([
    ('knn_pipe', knn_pipe),
    ('lr_pipe', lr_pipe),
    ('nb', MultinomialNB(alpha=1, fit_prior=True))
    
])

vote.fit(X_train, y_train)

VotingClassifier(estimators=[('knn_pipe',
                              Pipeline(steps=[('ss', StandardScaler()),
                                              ('knn',
                                               KNeighborsClassifier())])),
                             ('lr_pipe',
                              Pipeline(steps=[('ss', StandardScaler()),
                                              ('lr',
                                               LogisticRegression(C=0.01))])),
                             ('nb', MultinomialNB(alpha=1))])

In [35]:
class_metrics(X_train, y_train, X_test,y_test, vote);

Train Accuracy - Correctly Classified: 0.93
Test Accuracy - Correctly Classified: 0.92

Train Sensitivity - True Positvive Rate: 0.89
Test Sensitivity - True Positvive Rate: 0.87

Train Specificity - True Negative Rate: 0.96
Test Specificity - True Negative Rate: 0.95


#### Lets look at coefficients.
Lets look into the coefficients of our model and see what determines a voter to vote democrat or republican

In [29]:
lr = LogisticRegression(max_iter=10_000)
lr.fit(X_train, y_train)
lr.score(X_train, y_train)

0.9377502730251183

In [30]:
coeffs = pd.DataFrame(lr.coef_[0], index=X.columns, columns=['coeffs'])
coeffs.sort_values(by= 'coeffs', ascending=False, inplace=True)
coeffs

Unnamed: 0,coeffs
approval_gov_Strongly Disapprove,3.475903
pid3_Republican,3.442369
pid3_Independent,1.625932
pid3_Other,1.273321
approval_gov_Disapprove / Somewhat Disapprove,1.197618
...,...
approval_gov_Strongly Approve,-1.174109
ideo5_Moderate,-1.204761
ideo5_Not Sure,-1.701976
ideo5_Liberal,-2.283260


Looking at the chart above of our top 5 coeffients and our bottom 5 coeffients. Its intresting to observe that a strong indicator of voting for the positive class "Republican" 

#### Lets take a look at the moderates.

In [31]:
probas = pd.DataFrame(lr.predict_proba(X), index=X.index)
mod_index = probas[(probas[0] < .6) & (probas[1] < .6)]

These are the selct the 119 that the model had lower than 60% probabality to classify.

In [32]:
X.loc[mod_index.index, :]

Unnamed: 0,age,dist_2,dist_3,dist_4,dist_5,dist_6,dist_7,dist_8,dist_9,dist_10,...,newsint_Some of the time,approval_gov_Disapprove / Somewhat Disapprove,approval_gov_Never Heard / Not Sure,approval_gov_Strongly Approve,approval_gov_Strongly Disapprove,ideo5_Liberal,ideo5_Moderate,ideo5_Not Sure,ideo5_Very Conservative,ideo5_Very Liberal
393977,84,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
395498,55,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
395945,50,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
397987,63,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,1,0,0,0
398253,59,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450050,33,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,0,0,0
450279,65,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,0
451378,50,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,0,0,0
452156,53,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
