# Election Results from the 1992 Presidential Election

This dataset contains county-level demographics and whether or not Bill Clinton won each county during the 1992 U.S. presidential election. The goal is to predict whether Bill Clinton won a county using the demographic variables.

Data from Larry Winner and Jared Thacker.
https://www.kaggle.com/datasets/jwt0024/1992-us-presidential-election

In [None]:
import numpy as np
import pandas as pd

# Read the election data

In [None]:
# Load the county-level election table from the CSV file in the working directory
election_data = pd.read_csv(filepath_or_buffer="election_dataset.csv")
election_data

In [None]:
# Count how many counties fall in each class (truthy clinton_win = win, False = loss)

wins = int(np.count_nonzero(election_data['clinton_win']))
losses = int(np.count_nonzero(election_data['clinton_win'] == False))
print(f'Wins: {wins}, Losses: {losses}')

In [None]:
# Build the feature table: everything except the target column.
# county_name and state are removed as well so the model sees demographic data only.

features = election_data.drop(columns=['clinton_win', 'county_name', 'state'])
features.transpose()

In [None]:
# Pull out the target column (whether Clinton won the county) as the value to predict

predictions = election_data.loc[:, 'clinton_win']
predictions

In [None]:
# One-hot encode the target: one indicator column per distinct clinton_win value

predictions = pd.get_dummies(data=predictions)
predictions

In [None]:
# Only use the True column from the predictions (the False column is redundant).
# get_dummies orders its columns by sorted category value, so the False/0 column comes
# first and the True/1 column second. Selecting position 1 keeps "1 = Clinton won",
# which is what the 'Lose'/'Win' labels used later in the notebook assume.
# (Indexing with [0] picked the False column and silently inverted every label.)

predictions = predictions.iloc[:, 1]
predictions

# Visualize the data
Check for clustering, if any (the better clustered the data, the better the classification model is likely to be).

In [None]:
import graphviz
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# t-SNE (t-Distributed Stochastic Neighbor Embedding) gives a low-dimensional view that preserves clustering of the data.
# NOTE(review): no random_state is passed, so the embedding may differ from run to run.
visual_model = TSNE(init='pca', learning_rate=100)
visual_transformation = visual_model.fit_transform(features)

# Keep the first two embedding columns as plotting coordinates
x_data, y_data = visual_transformation[:, 0], visual_transformation[:, 1]
transformed_data = pd.DataFrame({'x': x_data, 'y': y_data})

In [None]:
# Scatter the embedded points, one colour per class value (0 drawn first, then 1)
plt.figure(figsize=(10, 10))
for _class_value, _colour in ((0, 'tab:red'), (1, 'tab:blue')):
    _class_points = transformed_data.loc[predictions == _class_value]
    plt.scatter(_class_points['x'], _class_points['y'], c=_colour)
# Legend entries follow the drawing order above
plt.legend(loc='lower left', labels=['Lose', 'Win'])

plt.show()

# Build and test a Decision Tree model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; random_state fixes the shuffle so the split is repeatable
features_train, features_test, predictions_train, predictions_test = train_test_split(
    features,
    predictions,
    test_size=0.33,
    random_state=13,
)
print(
    f'features_train: {features_train.shape}\n'
    f'predictions_train: {predictions_train.shape}'
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with default settings on the training split.
# Assigning the return value to _ keeps the cell from echoing the fitted estimator.
tree_model = DecisionTreeClassifier()
_ = tree_model.fit(X=features_train, y=predictions_train)

In [None]:
# Render the fitted decision tree as a Graphviz diagram

from sklearn.tree import export_graphviz

# out_file=None makes export_graphviz return the DOT source as a string
view_data = export_graphviz(
    tree_model,
    out_file=None,
    feature_names=features.columns,
    class_names=['Lose', 'Win'],
)
graph = graphviz.Source(view_data)
graph

# Test the model

In [None]:
# Class labels the decision tree assigns to the held-out rows
tree_test_results = tree_model.predict(X=features_test)

In [None]:
# Print the first 100 model outputs ("Test Results") next to the first 100 true labels ("Predictions")

print(f'Test Results: {tree_test_results[:100]}\n')
print(f'Predictions: {predictions_test[:100].values}')

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of true labels versus decision-tree output on the test split
_ = ConfusionMatrixDisplay.from_predictions(
    y_true=predictions_test,
    y_pred=tree_test_results,
    display_labels=['Lose', 'Win'],
)

In [None]:
from sklearn.metrics import accuracy_score

# normalize=False yields the number of correct predictions; subtracting it from the
# number of test rows gives the number of misclassified rows
errors = tree_test_results.size - accuracy_score(
    y_true=predictions_test, y_pred=tree_test_results, normalize=False
)
# Fraction of test rows classified correctly
accuracy = accuracy_score(y_true=predictions_test, y_pred=tree_test_results)

print(f'Accuracy Score: {accuracy}\n')
print(f'Number of classification errors: {errors}')

In [None]:
# Generate the ROC curve for the decision tree

from sklearn import metrics

# from_predictions takes the true labels first and a continuous score second.
# Use the predicted probability of the larger class label (second predict_proba column)
# as the score, so the curve is traced over thresholds instead of hard 0/1 outputs.
_ = metrics.RocCurveDisplay.from_predictions(predictions_test, tree_model.predict_proba(features_test)[:, 1])

In [None]:
# Per-class probabilities the decision tree assigns to the first 50 test rows

tree_model.predict_proba(X=features_test)[:50]

# Perform cross-validation on the training data with 10 folds

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a fresh default decision tree on the training split
cross_val_score(
    DecisionTreeClassifier(),
    features_train,
    predictions_train,
    cv=10,
    scoring='accuracy',
)

# Build, test, and score a model built using a random forest

In [None]:
# Train on the same training split that was used for the decision tree
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees; _ swallows the return value of fit
forest_model = RandomForestClassifier(n_estimators=100)
_ = forest_model.fit(X=features_train, y=predictions_train)

In [None]:
# Predict the held-out rows, then show element-wise which predictions match the true labels
forest_test_results = forest_model.predict(X=features_test)
forest_test_results == predictions_test.to_numpy()

In [None]:
# Confusion matrix of true labels versus random-forest output on the test split
_ = ConfusionMatrixDisplay.from_predictions(
    y_true=predictions_test,
    y_pred=forest_test_results,
    display_labels=['Lose', 'Win'],
)

In [None]:
# Misclassified rows = test rows minus the count of correct predictions (normalize=False)
errors = forest_test_results.size - accuracy_score(
    y_true=predictions_test, y_pred=forest_test_results, normalize=False
)
# Fraction of test rows classified correctly
accuracy = accuracy_score(y_true=predictions_test, y_pred=forest_test_results)

print(f'Accuracy Score: {accuracy}\n')
print(f'Number of classification errors: {errors}')

In [None]:
# ROC curve for the random forest: true labels first, then a continuous score
# (predicted probability of the larger class label) rather than hard 0/1 outputs
_ = metrics.RocCurveDisplay.from_predictions(predictions_test, forest_model.predict_proba(features_test)[:, 1])

In [None]:
# Per-class probabilities the random forest assigns to the first 50 test rows
forest_model.predict_proba(X=features_test)[:50]

In [None]:
# 10-fold cross-validated accuracy of a fresh 100-tree random forest on the training split
cross_val_score(
    RandomForestClassifier(n_estimators=100),
    features_train,
    predictions_train,
    cv=10,
    scoring='accuracy',
)

# Build, test, and score a model built using a gradient boosted tree

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted ensemble: 100 boosting stages of depth-3 trees with a 0.1 learning rate
gbt_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
)
_ = gbt_model.fit(X=features_train, y=predictions_train)

In [None]:
# Predict the held-out rows, then show element-wise which predictions match the true labels
gbt_test_results = gbt_model.predict(X=features_test)
gbt_test_results == predictions_test.to_numpy()

In [None]:
# Confusion matrix of true labels versus gradient-boosting output on the test split
_ = ConfusionMatrixDisplay.from_predictions(
    y_true=predictions_test,
    y_pred=gbt_test_results,
    display_labels=['Lose', 'Win'],
)

In [None]:
# Misclassified rows = test rows minus the count of correct predictions (normalize=False)
errors = gbt_test_results.size - accuracy_score(
    y_true=predictions_test, y_pred=gbt_test_results, normalize=False
)
# Fraction of test rows classified correctly
accuracy = accuracy_score(y_true=predictions_test, y_pred=gbt_test_results)

print(f'Accuracy Score: {accuracy}\n')
print(f'Number of classification errors: {errors}')

In [None]:
# ROC curve for the gradient-boosted model: true labels first, then a continuous score
# (predicted probability of the larger class label) rather than hard 0/1 outputs
_ = metrics.RocCurveDisplay.from_predictions(predictions_test, gbt_model.predict_proba(features_test)[:, 1])

In [None]:
# Per-class probabilities the gradient-boosted model assigns to the first 50 test rows
gbt_model.predict_proba(X=features_test)[:50]

In [None]:
# 10-fold cross-validated accuracy of a fresh gradient-boosted model (same hyperparameters) on the training split
cross_val_score(
    GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3),
    features_train,
    predictions_train,
    cv=10,
    scoring='accuracy',
)

# Build, test, and score a model built using Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the lbfgs solver, capped at 200 iterations with a 1e-4 stopping tolerance
logit_model = LogisticRegression(
    solver="lbfgs",
    max_iter=200,
    tol=1e-4,
)
_ = logit_model.fit(X=features_train, y=predictions_train)

In [None]:
# Predict the held-out rows, then show element-wise which predictions match the true labels
logit_test_results = logit_model.predict(X=features_test)
logit_test_results == predictions_test.to_numpy()

In [None]:
# Confusion matrix of true labels versus logistic-regression output on the test split
_ = ConfusionMatrixDisplay.from_predictions(
    y_true=predictions_test,
    y_pred=logit_test_results,
    display_labels=['Lose', 'Win'],
)

In [None]:
# Misclassified rows = test rows minus the count of correct predictions (normalize=False)
errors = logit_test_results.size - accuracy_score(
    y_true=predictions_test, y_pred=logit_test_results, normalize=False
)
# Fraction of test rows classified correctly
accuracy = accuracy_score(y_true=predictions_test, y_pred=logit_test_results)

print(f'Accuracy Score: {accuracy}\n')
print(f'Number of classification errors: {errors}')

In [None]:
# ROC curve for the logistic regression: true labels first, then a continuous score
# (predicted probability of the larger class label) rather than hard 0/1 outputs
_ = metrics.RocCurveDisplay.from_predictions(predictions_test, logit_model.predict_proba(features_test)[:, 1])

In [None]:
# Per-class probabilities the logistic regression assigns to the first 50 test rows
logit_model.predict_proba(X=features_test)[:50]

In [None]:
# 10-fold cross-validated accuracy on the training split.
# Use the same hyperparameters as logit_model (solver "lbfgs") so the scores describe
# the model that was fitted and tested above rather than a differently configured one.
cross_val_score(LogisticRegression(max_iter=200, solver="lbfgs", tol=1e-4), features_train, predictions_train, cv=10, scoring='accuracy')

# Build, test, and score a model built using the K-Nearest Neighbours algorithm

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training rows
# NOTE(review): the features are passed in unscaled, as they are for the other models
knn_model = KNeighborsClassifier(n_neighbors=5)
_ = knn_model.fit(X=features_train, y=predictions_train)

In [None]:
# Predict the held-out rows, then show element-wise which predictions match the true labels
knn_test_results = knn_model.predict(X=features_test)
knn_test_results == predictions_test.to_numpy()

In [None]:
# Confusion matrix of true labels versus k-nearest-neighbours output on the test split
_ = ConfusionMatrixDisplay.from_predictions(
    y_true=predictions_test,
    y_pred=knn_test_results,
    display_labels=['Lose', 'Win'],
)

In [None]:
# Misclassified rows = test rows minus the count of correct predictions (normalize=False)
errors = knn_test_results.size - accuracy_score(
    y_true=predictions_test, y_pred=knn_test_results, normalize=False
)
# Fraction of test rows classified correctly
accuracy = accuracy_score(y_true=predictions_test, y_pred=knn_test_results)

print(f'Accuracy Score: {accuracy}\n')
print(f'Number of classification errors: {errors}')

In [None]:
# ROC curve for k-nearest neighbours: true labels first, then a continuous score
# (predicted probability of the larger class label) rather than hard 0/1 outputs
_ = metrics.RocCurveDisplay.from_predictions(predictions_test, knn_model.predict_proba(features_test)[:, 1])

In [None]:
# Per-class probabilities k-nearest neighbours assigns to the first 50 test rows
knn_model.predict_proba(X=features_test)[:50]

In [None]:
# 10-fold cross-validated accuracy of a fresh 5-neighbour classifier on the training split
cross_val_score(
    KNeighborsClassifier(n_neighbors=5),
    features_train,
    predictions_train,
    cv=10,
    scoring='accuracy',
)

# Generalized framework for testing classification algorithms

In [None]:
def test_model(model, features_train, predictions_train, features_test, predictions_test):
    """Fit a classifier, report its test-set performance and cross-validate it.

    The model is (re)fitted in place on the training data, then evaluated on the
    test data with a confusion matrix, the accuracy, the number of
    misclassifications and a ROC curve. Finally, 10-fold cross-validation is run
    on the training data.

    Args:
        model: Estimator providing ``fit`` and ``predict``. ``predict_proba`` or
            ``decision_function`` is used for the ROC curve when available.
        features_train: Feature rows used for fitting and cross-validation.
        predictions_train: True labels for ``features_train``.
        features_test: Feature rows used for evaluation.
        predictions_test: True labels for ``features_test``.

    Returns:
        The per-fold accuracy scores produced by ``cross_val_score``.
    """
    _ = model.fit(features_train, predictions_train)
    test_results = model.predict(features_test)

    _ = ConfusionMatrixDisplay.from_predictions(
        predictions_test, test_results, display_labels=['Lose', 'Win']
    )

    # Metrics take (y_true, y_pred); normalize=False gives the count of correct rows.
    accuracy = accuracy_score(predictions_test, test_results)
    errors = test_results.size - accuracy_score(predictions_test, test_results, normalize=False)

    print(f'Accuracy Score: {accuracy}\n')
    print(f'Number of classification errors: {errors}\n')

    # A ROC curve needs the true labels first and a continuous score second.
    # Prefer the probability of the larger class label, then a decision
    # function, and only fall back to hard labels when neither is available.
    if hasattr(model, 'predict_proba'):
        roc_scores = model.predict_proba(features_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        roc_scores = model.decision_function(features_test)
    else:
        roc_scores = test_results
    _ = metrics.RocCurveDisplay.from_predictions(predictions_test, roc_scores)

    # Return the fold scores; previously they were computed and then discarded.
    return cross_val_score(model, features_train, predictions_train, cv=10, scoring='accuracy')

In [None]:
# Run the generic evaluation routine; swap in whichever model should be tested

test_model(
    model=forest_model,
    features_train=features_train,
    predictions_train=predictions_train,
    features_test=features_test,
    predictions_test=predictions_test,
)

# Use the framework to build, test, and score a model built using the Gaussian Naive Bayes algorithm

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, fitted on the training split
# NOTE(review): test_model also calls fit on the model it receives, so this fit looks redundant
bayes_model = GaussianNB()
_ = bayes_model.fit(X=features_train, y=predictions_train)

In [None]:
# Evaluate the Naive Bayes model with the generic evaluation routine
test_model(
    model=bayes_model,
    features_train=features_train,
    predictions_train=predictions_train,
    features_test=features_test,
    predictions_test=predictions_test,
)