In [None]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

import math
import warnings

from IPython.display import display

import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import ensemble
from sklearn.preprocessing import StandardScaler  
from sklearn.svm import SVC
from sklearn import svm
from sklearn import linear_model

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

np.random.seed(seed=456613)


# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)



In [None]:
# Default figure size and font size applied to every matplotlib figure.
plt.rcParams.update({
    'figure.figsize': (20.0, 10.0),
    'font.size': 22,
})


In [None]:
# Load the raw data; the path is relative to the notebook's working directory.
ballot = pd.read_csv('test_data_2.csv')
# Preview the first rows.
ballot.head()

In [None]:
# Number of missing values in each column.
ballot.isnull().sum()

In [None]:
# (rows, columns) of the loaded frame.
ballot.shape

In [None]:
# Summary statistics produced by pandas' describe().
ballot.describe()

In [None]:
# Distribution of voter birth years, drawn on an explicitly created axes.
birth_year_ax = plt.figure(figsize=(20, 10)).gca()
sns.distplot(ballot['birth_year'], ax=birth_year_ax)
birth_year_ax.set(title='Birth Year', xlabel='Birth Year', ylabel='Frequency')
plt.show()



In [None]:
# Side-by-side distributions of the income and activist percentages.
# One subplots() call replaces the original figure() + figure() pair, which
# left an empty first figure behind and only filled the top row of a 2x2 grid.
fig, (income_ax, activist_ax) = plt.subplots(1, 2, figsize=(20, 10))

sns.distplot(ballot['income_pct'], ax=income_ax)
income_ax.set(title='Income percent', xlabel='income percent', ylabel='Frequency')

sns.distplot(ballot['activist_pct'], ax=activist_ax)
activist_ax.set(title='Activist', xlabel='activist percent', ylabel='Frequency')

# Keep the two panels' labels from overlapping.
fig.tight_layout()
plt.show()

**Vote is overwhelmingly associated with party: Democrats vote no while Republicans vote yes.**

In [None]:
# Vote counts split by party.
vote_party_ax = plt.figure(figsize=(20, 10)).gca()
sns.countplot(x='target', hue='party', data=ballot, ax=vote_party_ax)
vote_party_ax.set(title='Vote by Party', xlabel='Vote', ylabel='Frequency')
plt.show()

**Gender is split for yes votes. The majority of no voters are women.**

In [None]:
# Vote counts split by gender.
vote_gender_ax = plt.figure(figsize=(20, 10)).gca()
sns.countplot(x='target', hue='gender', data=ballot, ax=vote_gender_ax)
vote_gender_ax.set(title='Vote by gender', xlabel='Vote', ylabel='Frequency')
plt.show()

**Yes voters tend to report they are not ideological whereas no voters tend to report they are ideological.**

In [None]:
# Vote counts split by the `ideo` column.
vote_ideo_ax = plt.figure(figsize=(20, 10)).gca()
sns.countplot(x='target', hue='ideo', data=ballot, ax=vote_ideo_ax)
vote_ideo_ax.set(title='Vote by ideology', xlabel='Vote', ylabel='Frequency')
plt.show()

**Republicans in general and particularly women are a bit older on average than democratic voters.  Republican women tend to be older than Republican men voters.**

In [None]:
# Birth-year spread per party, split by gender.
ax = plt.figure(figsize=(20, 10)).gca()
sns.boxplot(x='party', y='birth_year', hue='gender', data=ballot, ax=ax)
ax.set_title('Age and Party by Gender')
# Offset and trim the spines after drawing so trimming follows the final ticks.
sns.despine(offset=10, trim=True)
ax.set(xlabel='', ylabel='birth_year')
plt.show()

**Black Republicans are the youngest group of voters.**

In [None]:
# Birth-year spread per party, split by race.
plt.figure(figsize=(20,10))
ax_1 = sns.boxplot(x='party', y='birth_year', hue='race_4', data=ballot)
plt.title('Age and Party by Race')
sns.despine(offset=10, trim=True)
# Label this plot's own axes; the original call used `ax`, which belongs to the
# earlier gender boxplot, so the labels never reached this figure.
ax_1.set(xlabel='', ylabel='birth_year')
plt.show()

**Democrats have more college education than Republicans.**

In [None]:
# Education level counts split by party.
edu_party_ax = plt.figure(figsize=(20, 10)).gca()
sns.countplot(x='edu', hue='party', data=ballot, ax=edu_party_ax)
edu_party_ax.set(title='Party by Education', xlabel='Education', ylabel='Frequency')
plt.show()

**In the Democratic party the majority of voters are white women. In all racial categories in the Democratic party women are the majority of voters. In the Republican party men are the majority of voters, although Republicans have more gender balance than Democrats. Black Republicans make up a very small portion of voters.**  

In [None]:
# Count of voters per race, split by gender, with one facet per party.
# catplot is figure-level and builds its own figure, so the original
# plt.figure(...) call only produced an empty extra figure, and plt.title()
# titled just the last facet. Title the whole grid instead.
party_race_grid = sns.catplot(x='race_4', hue='gender', col='party', data=ballot, kind='count')
party_race_grid.fig.suptitle('Party by Race and Gender', y=1.03)
plt.show()


**When accounting for race and education, Republican white voters have more education than white Democrats. Black and Latinx Democratic voters have more education than their Republican counterparts, with the largest educational difference among Black voters.**

In [None]:
# Race counts split by party.
race_party_ax = plt.figure(figsize=(20, 10)).gca()
sns.countplot(x='race_4', hue='party', data=ballot, ax=race_party_ax)
race_party_ax.set(title='Party by Race', xlabel='Race', ylabel='Frequency')
plt.show()

**Most voters report either strong ideology or weak ideology - fewer voters find themselves in the middle.**

In [None]:
# Counts of the `ideo` column split by party.
ideo_party_ax = plt.figure(figsize=(20, 10)).gca()
sns.countplot(x='ideo', hue='party', data=ballot, ax=ideo_party_ax)
ideo_party_ax.set(title='Ideology by Party', xlabel='Ideology', ylabel='Frequency')
plt.show()

**There is no relationship between income, race and activist.**

In [None]:
# Income against activist percentage, coloured by race.
income_activist_ax = plt.figure(figsize=(20, 10)).gca()
sns.scatterplot(x='activist_pct', y='income_pct', hue='race_4', data=ballot,
                ax=income_activist_ax)
income_activist_ax.set_title('Activist by income and race')
plt.show()

**There is no relationship between Activist and income**

In [None]:
# Income against activist percentage with a fitted regression line.
income_reg_ax = plt.figure(figsize=(20, 10)).gca()
sns.regplot(x='activist_pct', y='income_pct', data=ballot, ax=income_reg_ax)
income_reg_ax.set_title('Activist by income')
plt.show()

**Black Republicans (who voted yes) tend to have lower income**

In [None]:
# Income spread per vote, split by race.
vote_income_ax = plt.figure(figsize=(20, 10)).gca()
sns.boxplot(x='target', y='income_pct', hue='race_4', data=ballot, ax=vote_income_ax)
vote_income_ax.set_title('Vote by income and race')
plt.show()

**People who voted yes tend to be more identified with some sort of activism than those who voted no. This is true across parties.**

In [None]:
# Activist percentage spread per vote, split by party.
vote_activist_ax = plt.figure(figsize=(20, 10)).gca()
sns.boxplot(x='target', y='activist_pct', hue='party', data=ballot, ax=vote_activist_ax)
vote_activist_ax.set_title('Vote by activist and party')
plt.show()

In [None]:
# Pairwise correlations between the numeric (and boolean) columns.
# The frame still holds string columns here (party, gender, edu, ...); selecting
# the numeric ones explicitly gives the same table older pandas produced by
# silently skipping them, and avoids the error pandas >= 2.0 raises instead.
corrmat = ballot.select_dtypes(include=['number', 'bool']).corr()
print(corrmat)

In [None]:
# Annotated heatmap of the correlation matrix.
# Create the figure and axes in one call: the original plt.figure(...) followed
# by plt.subplots() left an empty extra figure behind.
fig, ax = plt.subplots(figsize=(20, 10))
# Restrict to numeric/boolean columns so string columns cannot break corr().
numeric_corr = ballot.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr, ax=ax, annot=True, linewidths=0.05, fmt='.2f', cmap="magma")
plt.show()

In [None]:
def age_range(ballot):
    """Tag one row with a `generation` label derived from its birth year.

    `ballot` is a single row exposing `birth_year`. The row is modified in
    place and also returned.

    Labels:
      * 1940-1964 inclusive -> 'Boomer'
      * 1965-1979 inclusive -> 'Gen_X'
      * anything else (later years, years before 1940, or a missing value)
        -> 'Millenial'
    """
    year = ballot.birth_year
    if 1940 <= year <= 1964:
        label = 'Boomer'
    elif 1965 <= year <= 1979:
        label = 'Gen_X'
    else:
        label = 'Millenial'

    ballot['generation'] = label
    return ballot

# Add the `generation` column; apply(axis=1) calls age_range once per row.
ballot = ballot.apply(age_range, axis=1)
ballot.head()

**Baby boomers are more likely to be voters. Among Democrats, women across all generations are more likely to be voters. Among Millennials in the Democratic party, gender difference matters less. Among Republicans gender matters less, except for Millennials in the Republican party, where men are more likely to vote.**  

In [None]:
# Count of voters per generation, split by gender, with one facet per party.
# catplot is figure-level and builds its own figure, so the original
# plt.figure(...) call only produced an empty extra figure, and plt.title()
# titled just the last facet. Title the whole grid instead.
generation_grid = sns.catplot(x='generation', hue='gender', col='party', data=ballot, kind='count')
generation_grid.fig.suptitle('Party by Generation and Gender', y=1.03)
plt.show()

In [None]:
# One-hot encode gender, party and education, dropping the first level of each.
# dtype=int keeps the indicators numeric: pandas >= 2.0 would otherwise emit
# bool columns, which statsmodels rejects once they are mixed with floats.
ballot = pd.get_dummies(ballot, columns=['gender', 'party', 'edu'], drop_first=True, dtype=int)

In [None]:
# Check the frame after encoding gender, party and education.
ballot.head()

In [None]:
# One-hot encode race, keeping every level (the column names are used later).
# dtype=int keeps the indicators numeric: pandas >= 2.0 would otherwise emit
# bool columns, which statsmodels rejects once they are mixed with floats.
ballot = pd.get_dummies(ballot, columns=['race_4'], dtype=int)

In [None]:
# Check the frame after encoding race.
ballot.head()

In [None]:
# One-hot encode generation, keeping every level (the column names are used later).
# dtype=int keeps the indicators numeric: pandas >= 2.0 would otherwise emit
# bool columns, which statsmodels rejects once they are mixed with floats.
ballot = pd.get_dummies(ballot, columns=['generation'], dtype=int)

In [None]:
# Check the frame after encoding generation.
ballot.head()

In [None]:
# Drop rows with missing values so that only rows with a known vote remain.
# NOTE(review): dropna() without `subset` removes a row when ANY column is
# missing, not only `target` — confirm that is the intended filter.
ballot = ballot.dropna()

In [None]:
# Check the frame after dropping rows with missing values.
ballot.head()

In [None]:
# Remove the `key` column before modelling.
# Rebinding (instead of inplace=True) with errors='ignore' lets the cell be
# re-run safely: a second run no longer raises KeyError for the missing column.
ballot = ballot.drop(columns='key', errors='ignore')

In [None]:
# Check the frame after removing `key`.
ballot.head()

In [None]:
# Final column list available to the models below.
ballot.columns

# Logistic Regression

In [None]:
# Feature matrix: every column except the target and the raw birth year.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated and then removed in 2.0.
X = ballot.drop(columns=['target', 'birth_year'])
# Target variable.
y = ballot['target']

In [None]:
# Fit a statsmodels logit of `target` on all features and print the summary.
# NOTE(review): sm.Logit does not add an intercept on its own, and X holds every
# race_4_* and every generation_* dummy (neither set had a level dropped) —
# check the summary for signs of collinearity.
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

In [None]:
# Logistic regression with a very large C, i.e. effectively no regularisation.
lr = LogisticRegression(C=1e9)
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated and then removed in 2.0.
X = ballot.drop(columns=['target', 'birth_year'])
y = ballot['target']


# Fit the model.
fit = lr.fit(X, y)

# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr.predict(X)

# Predicted (rows) versus actual (columns) votes on the training data.
print('\n Accuracy by vote status')
print(pd.crosstab(pred_y_sklearn, y))

# In-sample accuracy: scored on the same rows the model was fitted on.
print('\n Percentage accuracy')
print(lr.score(X, y))

# 10-fold cross-validated accuracy.
scores = cross_val_score(lr, X, y, cv=10)

print(scores)
print(scores.mean())

# Mean fold accuracy with a +/- 2 standard deviation spread across folds
# (a rough indication of variability, not a formal confidence interval).
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Rank features by absolute coefficient, scaled so the largest equals 100.
feature_importance = np.abs(lr.coef_[0])
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
# Bar centres sit at half-integer positions.
pos = np.arange(len(sorted_idx)) + .5

featfig, featax = plt.subplots()
featax.barh(pos, feature_importance[sorted_idx], align='center')
featax.set_yticks(pos)
featax.set_yticklabels(X.columns.values[sorted_idx], fontsize=8)
featax.set_xlabel('Relative Feature Importance')

featfig.tight_layout()
plt.show()

In [None]:
# Hold out 30% of the rows for testing (fixed random_state for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Logistic regression with scikit-learn's default settings, fitted on the training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
# Predict the held-out rows and report the accuracy on them.
y_pred = logreg.predict(X_test)
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: %.2f' % test_accuracy)

In [None]:
# Confusion matrix on the test split (rows: actual, columns: predicted).
# Stored under its own name: rebinding `confusion_matrix` would shadow the
# sklearn function and make this cell fail with a TypeError when re-run.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
#dropping statistically insignificant features

# Model Used For The Test

In [None]:
# Reduced feature matrix: drop the target, the raw birth year and every
# race_4_* / generation_* indicator.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated and then removed in 2.0.
X_1 = ballot.drop(columns=['target', 'birth_year',
                           'race_4_Black', 'race_4_Latinx', 'race_4_Other', 'race_4_White',
                           'generation_Boomer', 'generation_Gen_X', 'generation_Millenial'])
y = ballot['target']

In [None]:
# Fit a statsmodels logit of `target` on the reduced features and print the summary.
# statsmodels does not add an intercept automatically, and with the race and
# generation indicators removed no remaining group of columns stands in for
# one, so add the constant explicitly. This also matches the sklearn model,
# which fits an intercept by default.
logit_model = sm.Logit(y, sm.add_constant(X_1))
result = logit_model.fit()
print(result.summary2())

In [None]:
# Refit the classifier `lr` (created in an earlier cell) on the reduced features.
fit = lr.fit(X_1, y)

# Report the fitted parameters, one item per line.
print('Coefficients', fit.coef_, fit.intercept_, sep='\n')
pred_y_sklearn = lr.predict(X_1)

# Predicted (rows) versus actual (columns) votes on the training data.
print('\n Accuracy by vote status', pd.crosstab(pred_y_sklearn, y), sep='\n')

# In-sample accuracy.
print('\n Percentage accuracy', lr.score(X_1, y), sep='\n')

# 10-fold cross-validated accuracy.
scores = cross_val_score(estimator=lr, X=X_1, y=y, cv=10)
print(scores, scores.mean(), sep='\n')

# Mean fold accuracy with a +/- 2 standard deviation spread across folds.
print("Accuracy: {:0.2f} (+/- {:0.2f})".format(scores.mean(), scores.std() * 2))

In [None]:
# Rank the reduced model's features by absolute coefficient, scaled so the
# largest equals 100.
feature_importance = abs(lr.coef_[0])
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

featfig = plt.figure()
featax = featfig.add_subplot(1, 1, 1)
featax.barh(pos, feature_importance[sorted_idx], align='center')
featax.set_yticks(pos)
# `lr` was last fitted on X_1, so the tick labels must come from X_1's columns.
# The original indexed X.columns, which only lined up because the dropped
# columns happened to sit at the end of X.
featax.set_yticklabels(np.array(X_1.columns)[sorted_idx], fontsize=8)
featax.set_xlabel('Relative Feature Importance')

plt.tight_layout()
plt.show()

In [None]:
# Hold out 30% of the reduced-feature rows for testing (fixed random_state).
X_1train, X_1test, y_train, y_test = train_test_split(X_1, y, test_size=0.3, random_state=0)
# Logistic regression with scikit-learn's default settings, fitted on the training split.
logreg = LogisticRegression()
logreg.fit(X_1train, y_train)

In [None]:
# Predict the held-out rows and report the accuracy on them.
y_pred = logreg.predict(X_1test)
test_accuracy = logreg.score(X_1test, y_test)
print('Accuracy of logistic regression classifier on test set: %.2f' % test_accuracy)

In [None]:
# Per-class precision, recall and F1 for the reduced-feature model on the test split.
print(classification_report(y_test, y_pred))

# F1 score: 0.83

# Random Forest

In [None]:
# Random forest with scikit-learn's default settings, fitted on all features.
rfc = ensemble.RandomForestClassifier()
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated and then removed in 2.0.
X = ballot.drop(columns=['target', 'birth_year'])
y = ballot['target']
rfc.fit(X, y)

In [None]:
# 10-fold cross-validated accuracy of the random forest.
scores = cross_val_score(estimator=rfc, X=X, y=y, cv=10)
print(scores, scores.mean(), sep='\n')

# Mean fold accuracy with a +/- 2 standard deviation spread across folds.
print("Accuracy: {:0.2f} (+/- {:0.2f})".format(scores.mean(), scores.std() * 2))

In [None]:
# Split the data into 40% test and 60% training (fixed random_state for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [None]:
# Feature names in the exact order the forest saw them.
# Deriving them from X keeps labels and importances aligned; the previous
# hard-coded list held 11 names while X also contains the generation_*
# indicators, so zip() silently dropped the trailing importances.
feat_labels = list(X.columns)

In [None]:
# Print one (label, importance) tuple per line.
for label, importance in zip(feat_labels, rfc.feature_importances_):
    print((label, importance))

## Most important features are Political Party and Ideology

In [None]:
# Create a selector object that uses the random forest classifier to keep
# features whose importance is at least 0.15
sfm = SelectFromModel(rfc, threshold=0.15)

# Train the selector on the training split
sfm.fit(X_train, y_train)

In [None]:
# Transform the data to create a new dataset containing only the selected features
# Note: the same fitted selector is applied to both the training X and test X data.
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

In [None]:
# Create a new random forest classifier for the selected features
# (10000 trees, fixed random_state, n_jobs=-1 to use all available cores)
rfc_important = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the new classifier on the reduced training set
rfc_important.fit(X_important_train, y_train)

In [None]:
# Apply the reduced-feature classifier (rfc_important) to the reduced test data
y_important_pred = rfc_important.predict(X_important_test)

# Accuracy of the reduced-feature model on the held-out test split
# NOTE(review): the original comment said "2 features"; the actual number
# depends on which features pass the selector threshold — confirm.
accuracy_score(y_test, y_important_pred)

# SVM

In [None]:
# Fit model - Support Vector Machine with a linear kernel, on all features.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated and then removed in 2.0.
X = ballot.drop(columns=['target', 'birth_year'])
y = ballot['target']
# NOTE: this name shadows the `svm` module imported from sklearn at the top of
# the notebook; it is kept because the following cells refer to `svm`.
svm = SVC(kernel = 'linear')
svm.fit(X, y)

In [None]:
# In-sample accuracy: scored on the same rows the SVM was fitted on.
svm.score(X, y)

In [None]:
# 10-fold cross-validated accuracy of the linear SVM.
scores = cross_val_score(estimator=svm, X=X, y=y, cv=10)
print(scores, scores.mean(), sep='\n')

# Mean fold accuracy with a +/- 2 standard deviation spread across folds.
print("Accuracy: {:0.2f} (+/- {:0.2f})".format(scores.mean(), scores.std() * 2))

In [None]:
def f_importances(coef, names, top=-1):
    """Draw a horizontal bar chart of feature importances, largest first.

    Parameters
    ----------
    coef : sequence of numbers
        One importance value per feature.
    names : sequence
        Labels paired positionally with `coef`; if the two differ in length,
        only the pairs that zip() can form are plotted.
    top : int, optional
        Number of bars to draw. -1 (the default) draws every feature. Negative
        values and values larger than the number of paired features are
        treated as "all" instead of failing with a shape mismatch.
    """
    # Sort (importance, name) pairs in descending order; ties fall back to name.
    ranked = sorted(zip(coef, names), reverse=True)

    # Show all features when requested or when `top` is out of range.
    if top < 0 or top > len(ranked):
        top = len(ranked)

    shown = ranked[:top]
    positions = range(len(shown))
    plt.barh(positions, [value for value, _ in shown], align='center')
    plt.yticks(positions, [label for _, label in shown])
    plt.show()

# Feature names in the exact order the SVM saw them.
# Deriving them from X keeps labels aligned with svm.coef_; the previous
# hard-coded list held 11 names while X also contains the generation_*
# indicators, so their coefficients were silently left out of the ranking.
features_names = list(X.columns)
svm = SVC(kernel = 'linear')
svm.fit(X, y)

# Specify top n features to visualize (ranked by absolute coefficient).
f_importances(abs(svm.coef_[0]), features_names, top=10)