### Import Necessary Libraries

In [1]:
# for data import and management
import pandas as pd
import numpy as np

from collections import Counter

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, roc_curve
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import StratifiedKFold

# ploting
import matplotlib.pyplot as plt

In [2]:
# Read data from google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Read Datasets

In [3]:
# # read data
# df2012 = pd.read_excel("/content/drive/MyDrive/FYP/FINAL_DATA_1/Excels (Checked)/xlsx/election2012.xlsx")
# df2016 = pd.read_excel("/content/drive/MyDrive/FYP/FINAL_DATA_1/Excels (Checked)/xlsx/election2016.xlsx")
# df2020 = pd.read_excel("/content/drive/MyDrive/FYP/FINAL_DATA_1/Excels (Checked)/xlsx/election2020.xlsx")

# df2012 = pd.read_excel("/content/drive/MyDrive/FYP/FINAL_DATA_1/Excels (polling = Oct.)/election2012b")
# df2016 = pd.read_excel("/content/drive/MyDrive/FYP/FINAL_DATA_1/Excels (polling = Oct.)/election2016b")
# df2020 = pd.read_excel("/content/drive/MyDrive/FYP/FINAL_DATA_1/Excels (polling = Oct.)/election2020b")

df2012 = pd.read_excel("/content/drive/MyDrive/FYP/FINAL_DATA_1/Excels (Checked)/PNvT/election2012c")
df2016 = pd.read_excel("/content/drive/MyDrive/FYP/FINAL_DATA_1/Excels (Checked)/PNvT/election2016c")
df2020 = pd.read_excel("/content/drive/MyDrive/FYP/FINAL_DATA_1/Excels (Checked)/PNvT/election2020c")

## Feature Engineering (for 2020)

In [4]:
trainset = pd.concat([df2012, df2016]).reset_index(drop=True)

# train set
features_1216 = trainset.iloc[:,2:23]
labels_1216 = trainset['non_onehot']
# test set
features_2020 = df2020.iloc[:,2:23]
labels_2020 = df2020['non_onehot']

In [5]:
TRAIN = pd.concat([df2012, df2016, df2020]).reset_index(drop=True)

# Data Normalization

In [60]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
X = TRAIN.iloc[:,2:23]
Y = TRAIN['non_onehot']

X_ws = TRAIN.iloc[:,2:19]
Y_ws = TRAIN['non_onehot']

scaler = StandardScaler()

X = scaler.fit_transform(X)
X_ws = scaler.fit_transform(X_ws)

train = scaler.fit_transform(features_1216)
train_y = labels_1216

test = scaler.fit_transform(features_2020)
test_y = labels_2020

### Evaluation Method

In [7]:
def eval(pred_labels):
  Final = df2020[['state','Candidates']].copy()  # Create a copy of the DataFrame
  Final['True'] = test_y  # Use assignment without loc to set values
  Final['Pred'] = pred_labels  # Use assignment without loc to set values

  true_0_sum = Final[Final['True'] == 0]['Candidates'].sum()
  true_1_sum = Final[Final['True'] == 1]['Candidates'].sum()
  pred_0_sum = Final[Final['Pred'] == 0]['Candidates'].sum()
  pred_1_sum = Final[Final['Pred'] == 1]['Candidates'].sum()

  print("The true votes of Democrat ({0}), and Republican({1}) \n".format(true_0_sum, true_1_sum))

  print("The predicted votes of Democrat ({0}), and Republican({1}) \n".format(pred_0_sum, pred_1_sum))

  # create a boolean mask to select rows where 'True' and 'Pred' are not equal
  mask = (Final['True'] != Final['Pred'])

  # use the boolean mask to select the rows from the DataFrame
  result = Final[mask]

  # print the selected rows
  print(result)
  print('\n')

# Model Predictions

## XGBoost

In [8]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Use the best parameters to train the final model
xgb = XGBClassifier(max_depth=2,
                    gamma=2,
                    eta=0.8,
                    reg_alpha=0.5,
                    reg_lambda=0.5)

xgb.fit(train, train_y)

In [9]:
scores = cross_val_score(xgb, X, Y, cv=10)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.93 accuracy with a standard deviation of 0.06


In [10]:
y_pred = xgb.predict_proba(test)
threshold = 0.9
prob_array = np.where(y_pred > threshold, 1, 0)
y_pred = prob_array[:, 1]

In [11]:
# Calculate and print the classification report
classification_report_result = classification_report(test_y, y_pred)
print("Classification Report:")
print(classification_report_result)

eval(y_pred)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        26
           1       0.93      1.00      0.96        25

    accuracy                           0.96        51
   macro avg       0.96      0.96      0.96        51
weighted avg       0.96      0.96      0.96        51

The true votes of Democrat (306), and Republican(232) 

The predicted votes of Democrat (279), and Republican(259) 

      state  Candidates  True  Pred
2   Arizona          11     0     1
10  Georgia          16     0     1




## Decision Tree

In [12]:
from sklearn import tree
from sklearn.feature_selection import RFE

dt = tree.DecisionTreeClassifier()

dt = dt.fit(train, train_y)

In [13]:
scores = cross_val_score(dt, X, Y, cv=10)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.95 accuracy with a standard deviation of 0.05


In [14]:
y_pred = dt.predict(test)

# Calculate and print the classification report
classification_report_result = classification_report(test_y, y_pred)
print("Classification Report:")
print(classification_report_result)

eval(y_pred)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.81      0.89        26
           1       0.83      1.00      0.91        25

    accuracy                           0.90        51
   macro avg       0.92      0.90      0.90        51
weighted avg       0.92      0.90      0.90        51

The true votes of Democrat (306), and Republican(232) 

The predicted votes of Democrat (233), and Republican(305) 

           state  Candidates  True  Pred
2        Arizona          11     0     1
10       Georgia          16     0     1
22      Michigan          16     0     1
38  Pennsylvania          20     0     1
49     Wisconsin          10     0     1




## Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

rf.fit(train, train_y)

In [16]:
scores = cross_val_score(rf, X_ws, Y_ws, cv=10)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.94 accuracy with a standard deviation of 0.06


In [17]:
y_pred = rf.predict_proba(test)
threshold = 0.9
prob_array = np.where(y_pred > threshold, 1, 0)
y_pred = prob_array[:, 1]

In [18]:
# Calculate and print the classification report
classification_report_result = classification_report(test_y, y_pred)
print("Classification Report:")
print(classification_report_result)

eval(y_pred)

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.96      0.78        26
           1       0.92      0.48      0.63        25

    accuracy                           0.73        51
   macro avg       0.79      0.72      0.71        51
weighted avg       0.79      0.73      0.71        51

The true votes of Democrat (306), and Republican(232) 

The predicted votes of Democrat (433), and Republican(105) 

             state  Candidates  True  Pred
1           Alaska           3     1     0
3         Arkansas           6     1     0
9          Florida          29     1     0
10         Georgia          16     0     1
15            Iowa           6     1     0
17        Kentucky           8     1     0
33  North Carolina          15     1     0
34    North Dakota           3     1     0
35            Ohio          18     1     0
41    South Dakota           3     1     0
43           Texas          38     1     0
44            Uta

### Importance

In [19]:
importances = pd.DataFrame({'feature':features_1216.columns,'importance':np.round(rf.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')

print(importances)

                                                    importance
feature                                                       
Polling_Democrat(%)                                      0.238
Polling_Republican(%)                                    0.139
Management, business, science, and arts occupat...       0.119
Median Family Income                                     0.078
Production, transportation, and material moving...       0.073
DNC_neg                                                  0.062
35-59 years (%)                                          0.036
Natural resources, construction, and maintenanc...       0.031
DNC_pos                                                  0.029
Service occupations                                      0.028
PI_Q3                                                    0.028
Unemployment Rate                                        0.021
20-34 years (%)                                          0.019
GOP_pos                                                

## Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

logreg = LogisticRegression(max_iter=1000,
                            random_state = 42)

logreg.fit(train, train_y)

In [21]:
y_pred = logreg.predict_proba(test)
threshold = 0.9
prob_array = np.where(y_pred > threshold, 1, 0)
y_pred = prob_array[:, 1]

In [22]:
# Calculate and print the classification report
classification_report_result = classification_report(test_y, y_pred)
print("Classification Report:")
print(classification_report_result)

eval(y_pred)

Classification Report:
              precision    recall  f1-score   support

           0       0.84      1.00      0.91        26
           1       1.00      0.80      0.89        25

    accuracy                           0.90        51
   macro avg       0.92      0.90      0.90        51
weighted avg       0.92      0.90      0.90        51

The true votes of Democrat (306), and Republican(232) 

The predicted votes of Democrat (397), and Republican(141) 

             state  Candidates  True  Pred
1           Alaska           3     1     0
9          Florida          29     1     0
33  North Carolina          15     1     0
43           Texas          38     1     0
44            Utah           6     1     0




## Naive Bayes

In [23]:
# training the model on training set
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

gnb.fit(train, train_y)

In [24]:
y_pred = gnb.predict_proba(test)
threshold = 0.9
prob_array = np.where(y_pred > threshold, 1, 0)
y_pred = prob_array[:, 1]

In [25]:
# Calculate and print the classification report
classification_report_result = classification_report(test_y, y_pred)
print("Classification Report:")
print(classification_report_result)

eval(y_pred)

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.85      0.88        26
           1       0.85      0.92      0.88        25

    accuracy                           0.88        51
   macro avg       0.88      0.88      0.88        51
weighted avg       0.88      0.88      0.88        51

The true votes of Democrat (306), and Republican(232) 

The predicted votes of Democrat (320), and Republican(218) 

        state  Candidates  True  Pred
2     Arizona          11     0     1
9     Florida          29     1     0
10    Georgia          16     0     1
22   Michigan          16     0     1
43      Texas          38     1     0
49  Wisconsin          10     0     1




## SVC

In [76]:
from sklearn import svm

svc = svm.SVC(kernel='rbf',
              gamma='auto',
              random_state = 42,
              probability=True)

svc.fit(train, train_y)

In [73]:
from sklearn.model_selection import ShuffleSplit

svc.fit(X, Y)

cv = ShuffleSplit(n_splits=10, test_size=0.4, random_state=42)
scores = cross_val_score(svc, X, Y, cv=cv)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.89 accuracy with a standard deviation of 0.02


In [77]:
y_pred = svc.predict_proba(test)
threshold = 0.9
prob_array = np.where(y_pred > threshold, 1, 0)
y_pred = prob_array[:, 1]

In [78]:
# Calculate and print the classification report
classification_report_result = classification_report(test_y, y_pred)
print("Classification Report:")
print(classification_report_result)

eval(y_pred)

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.96      0.88        26
           1       0.95      0.76      0.84        25

    accuracy                           0.86        51
   macro avg       0.88      0.86      0.86        51
weighted avg       0.88      0.86      0.86        51

The true votes of Democrat (306), and Republican(232) 

The predicted votes of Democrat (372), and Republican(166) 

           state  Candidates  True  Pred
1         Alaska           3     1     0
9        Florida          29     1     0
22      Michigan          16     0     1
41  South Dakota           3     1     0
43         Texas          38     1     0
44          Utah           6     1     0
50       Wyoming           3     1     0




## MLP

In [64]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(solver='adam',
                    alpha=0.1,
                    hidden_layer_sizes=(7,),
                    random_state=42,
                    max_iter=1000)

mlp.fit(train, train_y)

In [69]:
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=10, test_size=0.4, random_state=42)
scores = cross_val_score(mlp, X_ws, Y_ws, cv=cv)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.89 accuracy with a standard deviation of 0.02


In [56]:
y_pred = mlp.predict_proba(test)
threshold = 0.9
prob_array = np.where(y_pred > threshold, 1, 0)
y_pred = prob_array[:, 1]

In [57]:
# Calculate and print the classification report
classification_report_result = classification_report(test_y, y_pred)
print("Classification Report:")
print(classification_report_result)

eval(y_pred)

Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        26
           1       1.00      0.92      0.96        25

    accuracy                           0.96        51
   macro avg       0.96      0.96      0.96        51
weighted avg       0.96      0.96      0.96        51

The true votes of Democrat (306), and Republican(232) 

The predicted votes of Democrat (338), and Republican(200) 

     state  Candidates  True  Pred
1   Alaska           3     1     0
9  Florida          29     1     0




## Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier

xgb = XGBClassifier(max_depth=2,
                    gamma=2,
                    eta=0.8,
                    reg_alpha=0.5,
                    reg_lambda=0.5)

dt = tree.DecisionTreeClassifier(random_state=42)

rf = RandomForestClassifier(n_estimators = 500,
                            random_state = 42,
                            criterion='log_loss',
                            class_weight= {0: 2, 1: 1})

logreg = LogisticRegression(max_iter=1000,
                            random_state = 42)

gnb = GaussianNB()

svc = svm.SVC(kernel='rbf',
              gamma='auto',
              random_state = 42,
              probability=True)

mlp = MLPClassifier(solver='adam',
                    alpha=0.1,
                    hidden_layer_sizes=(7,),
                    random_state=42,
                    max_iter=1000)

# Create the voting classifier ensemble
ensemble = VotingClassifier(estimators=[
    ('xgb', xgb),
    ('dt', dt),
    ('rf', rf),
    ('logreg', logreg),
    ('gnb', gnb),
    ('svc', svc),
    ('mlp', mlp),],
    voting = 'soft')

ensemble.fit(train, train_y)

In [None]:
y_pred = ensemble.predict_proba(test)

threshold = 0.9
prob_array = np.where(y_pred > threshold, 1, 0)
y_pred = prob_array[:, 1]

In [None]:
# Calculate and print the classification report
classification_report_result = classification_report(test_y, y_pred)
print("Classification Report:")
print(classification_report_result)

eval(y_pred)

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

y_pred = ensemble.predict_proba(test)[:, 1]

plt.figure(figsize=(7, 6))  # Set the plot size - 7 inches wide, 6 inches high
fpr, tpr, thresholds = roc_curve(test_y, y_pred)
roc_auc = auc(fpr, tpr)  # Calculate the AUC
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')

plt.plot([0, 1], [0, 1], '--')
plt.xlabel('False Positive Rate', fontsize=12)  # Increase x-label font size
plt.ylabel('True Positive Rate', fontsize=12)  # Increase y-label font size
plt.xticks(fontsize=10)  # Increase x-axis tick font size
plt.yticks(fontsize=10)  # Increase y-axis tick font size
plt.legend(loc='lower right')  # Add legend
plt.show()

In [None]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.neural_network import MLPClassifier

# Define the classifiers
xgb = XGBClassifier(max_depth=2, gamma=2, eta=0.8, reg_alpha=0.5, reg_lambda=0.5)
rf = RandomForestClassifier(n_estimators=500, random_state=42, criterion='log_loss',
                            class_weight={0: 2, 1: 1})
logreg = LogisticRegression(max_iter=1000, random_state=42)
gnb = GaussianNB()
svc = svm.SVC(kernel='rbf', gamma='auto', random_state=42, probability=True)
mlp = MLPClassifier(solver='adam', alpha=0.1, hidden_layer_sizes=(7,), random_state=42, max_iter=1000)

# Fit the classifiers on the training data
xgb.fit(train, train_y)
rf.fit(train, train_y)
logreg.fit(train, train_y)
gnb.fit(train, train_y)
svc.fit(train, train_y)
mlp.fit(train, train_y)

# Predict probabilities on the test data
xgb_probs = xgb.predict_proba(test)[:, 1]
rf_probs = rf.predict_proba(test)[:, 1]
logreg_probs = logreg.predict_proba(test)[:, 1]
gnb_probs = gnb.predict_proba(test)[:, 1]
svc_probs = svc.predict_proba(test)[:, 1]
mlp_probs = mlp.predict_proba(test)[:, 1]

# Create a DataFrame with model names and predicted probabilities
data = {
    'Model': ['Gradient Boosting Trees', 'Naive Bayes', 'SVC', 'Feedforward Neural Network'],
    'Predicted Probabilities': [xgb_probs, gnb_probs, svc_probs, mlp_probs]
}

df = pd.DataFrame(data)

print(df)

In [None]:
# Create an empty figure
plt.figure(figsize=(7, 6))  # Set the plot size - 7 inches wide, 6 inches high

# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    model_name = row['Model']
    y_pred = row['Predicted Probabilities']

    # Calculate the ROC curve and AUC
    fpr, tpr, thresholds = roc_curve(test_y, y_pred)
    roc_auc = auc(fpr, tpr)

    # Plot the ROC curve
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')

# Add the diagonal line
plt.plot([0, 1], [0, 1], '--')

# Set labels, ticks, and legend
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.legend(loc='lower right', fontsize = 10)

# Save the plot as EPS with tight margins
plt.savefig('ROC.eps', format='eps', bbox_inches='tight')

# Show the plot
plt.show()