# Cluster US Presidential elections 1788-2024
Here, we will use machine learning to re-examine the party system in the United States through the lens of presidential election results across states.

In [114]:
import numpy as np
import pandas as pd
from sklearn import base, pipeline, impute, preprocessing, decomposition, cluster, metrics, model_selection
from typing import Any, Callable

import seaborn as sns

## 0. Load election data

In [10]:
####################
# Load election data
####################
# Read the state-level and national-level election history tables
state_results, national_results = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/state_election_history.csv",
        "../data/national_election_history.csv",
    )
)

# Lookup from full state/territory name to its two-letter postal abbreviation.
# Names that are not keys of this dict become NaN when the map is applied,
# so every key must be spelled exactly as it appears in the data.
state_abbrev_map = {
    "Alaska": "AK",
    "Alabama": "AL",
    "Arkansas": "AR",
    "Arizona": "AZ",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "District of Columbia": "DC",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Iowa": "IA",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Massachusetts": "MA",
    "Maryland": "MD",
    "Maine": "ME",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Missouri": "MO",
    "Mississippi": "MS",
    "Montana": "MT",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Nebraska": "NE",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "Nevada": "NV",
    "New York": "NY",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Virginia": "VA",
    "Vermont": "VT",
    "Washington": "WA",
    "Wisconsin": "WI",
    "West Virginia": "WV",
    "Wyoming": "WY",
    "Puerto Rico": "PR",
    # Fixed spelling: this key was previously "Virigin Islands"
    "Virgin Islands": "VI",
}
# Keep the full name in `state_full` and replace `state` with its abbreviation
# (names missing from state_abbrev_map become NaN).
state_results = state_results.assign(
    state_full=state_results["state"],
    state=state_results["state"].map(state_abbrev_map),
)

# Exclude the 2024 election from the national results for now
national_results = national_results.loc[national_results["election"].ne(2024)]

# Preview both tables
display(state_results.head(), national_results.head())

Unnamed: 0,election,state,federalist,anti_federalist,democratic_republican,democratic,national_republican,whig,republican,third_party,NOTE,state_full
0,1788,CT,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"No popular vote. Instead, vote by state legisl...",Connecticut
1,1788,DE,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Delaware
2,1788,GA,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"No popular vote. Instead, vote by state legisl...",Georgia
3,1788,MD,0.7709,0.2291,0.0,0.0,0.0,0.0,0.0,0.0,,Maryland
4,1788,MA,0.9714,0.0286,0.0,0.0,0.0,0.0,0.0,0.0,,Massachusetts


Unnamed: 0,election,federalist,anti_federalist,democratic_republican,democratic,national_republican,whig,republican,third_party,NOTE
0,1788,0.8737,0.1263,0.0,0.0,0.0,0.0,0.0,0.0,George Washington was nonpartisan and received...
1,1792,0.6063,0.0,0.3937,0.0,0.0,0.0,0.0,0.0,George Washington was nonpartisan and received...
2,1796,0.533,0.0,0.467,0.0,0.0,0.0,0.0,0.0,
3,1800,0.394,0.0,0.605,0.0,0.0,0.0,0.0,0.001,
4,1804,0.267,0.0,0.732,0.0,0.0,0.0,0.0,0.001,


## 1. Standardize political parties and vote margins

In [11]:
####################
# Create standard two-party votes across time
####################
# Column names of the historical parties folded into each of the two main party lineages
party_one = ["federalist", "national_republican", "whig", "republican"]
party_two = ["anti_federalist", "democratic_republican", "democratic"]

# National results: sum each lineage into a single vote share, take the
# party-one-minus-party-two margin, and keep only the standardized columns.
# Inside the lambdas, `party_one` / `party_two` are the lists defined above.
national = (
    national_results
    .rename(columns={"third_party": "party_three"})
    .assign(
        party_one=lambda results: results[party_one].sum(axis=1),
        party_two=lambda results: results[party_two].sum(axis=1),
        major_margin=lambda results: results["party_one"] - results["party_two"],
    )
    .loc[:, ["election", "party_one", "party_two", "major_margin", "party_three"]]
)
display(national.head())

# State results: same standardization as the national table, one row per
# (election, state). Inside the lambdas, `party_one` / `party_two` are the
# module-level lists of party column names.
state = (
    state_results
    .rename(columns={"third_party": "party_three"})
    .assign(
        party_one=lambda results: results[party_one].sum(axis=1),
        party_two=lambda results: results[party_two].sum(axis=1),
        major_margin=lambda results: results["party_one"] - results["party_two"],
    )
    .loc[:, ["election", "state", "party_one", "party_two", "major_margin", "party_three"]]
)


####################
# Calculate relative margin (i.e., partisan lean)
####################
# Attach each election's national margin to every state row.
# - on="election": join key stated explicitly instead of relying on whichever
#   columns the two frames happen to share.
# - how="inner": state rows for an election absent from `national` are dropped.
# - validate="many_to_one": fail loudly if `national` has more than one row per
#   election, which would otherwise silently duplicate state rows.
state = state.merge(
    national[["election", "major_margin"]].rename(columns={"major_margin": "national_margin"}),
    on="election",
    how="inner",
    validate="many_to_one",
)

# Relative margin = state margin minus national margin for the same election
state["relative_margin"] = state.major_margin - state.national_margin
display(state.head())

Unnamed: 0,election,party_one,party_two,major_margin,party_three
0,1788,0.8737,0.1263,0.7474,0.0
1,1792,0.6063,0.3937,0.2126,0.0
2,1796,0.533,0.467,0.066,0.0
3,1800,0.394,0.605,-0.211,0.001
4,1804,0.267,0.732,-0.465,0.001


Unnamed: 0,election,state,party_one,party_two,major_margin,party_three,national_margin,relative_margin
0,1788,CT,1.0,0.0,1.0,0.0,0.7474,0.2526
1,1788,DE,1.0,0.0,1.0,0.0,0.7474,0.2526
2,1788,GA,1.0,0.0,1.0,0.0,0.7474,0.2526
3,1788,MD,0.7709,0.2291,0.5418,0.0,0.7474,-0.2056
4,1788,MA,0.9714,0.0286,0.9428,0.0,0.7474,0.1954


In [16]:
# First few elections for Connecticut
state.loc[state["state"].eq("CT")].head()

Unnamed: 0,election,state,party_one,party_two,major_margin,party_three,national_margin,relative_margin
0,1788,CT,1.0,0.0,1.0,0.0,0.7474,0.2526
10,1792,CT,1.0,0.0,1.0,0.0,0.2126,0.7874
25,1796,CT,1.0,0.0,1.0,0.0,0.066,0.934
41,1800,CT,1.0,0.0,1.0,0.0,-0.211,1.211
57,1804,CT,1.0,0.0,1.0,0.0,-0.465,1.465


## 2. Create feature dataset

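We will create two feature sets:
1. two-party vote margin only
2. two-party vote margin and third-party vote share

Note: the code below currently uses a third variant — each state's party-one and party-two vote shares — and keeps the two feature sets above as commented-out alternatives.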

In [165]:
# # Major party features
# election_features_major = (
#     state[["election", "state", "relative_margin"]]
#     .pivot(index="election", columns="state", values="relative_margin")
#     .add_suffix("_margin")
#     .rename_axis(None, axis=1)
# )

# # Third party features
# election_features_third = (
#     state[["election", "state", "party_three"]]
#     .pivot(index="election", columns="state", values="party_three")
#     .add_suffix("_third")
#     .rename_axis(None, axis=1)
# )

# Party one / party two vote shares, each pivoted to one row per election and
# one column per state ("<state>_party1" and "<state>_party2" respectively).
election_features_major1, election_features_major2 = (
    state
    .pivot(index="election", columns="state", values=share_column)
    .add_suffix(column_suffix)
    .rename_axis(None, axis=1)
    for share_column, column_suffix in (("party_one", "_party1"), ("party_two", "_party2"))
)

# Create feature matrix and row/column labels

# election_features = election_features_major.join(election_features_third)
# election_features = election_features_major
# Side-by-side party-one and party-two columns, joined on the election index
election_features = election_features_major1.join(election_features_major2)

# Row labels (election years) and column labels (feature names) of the matrix
election_years, feature_labels = election_features.axes

In [166]:
# Preview the first rows of the feature matrix (already a labeled DataFrame)
election_features.head()

Unnamed: 0_level_0,AK_party1,AL_party1,AR_party1,AZ_party1,CA_party1,CO_party1,CT_party1,DC_party1,DE_party1,FL_party1,...,SD_party2,TN_party2,TX_party2,UT_party2,VA_party2,VT_party2,WA_party2,WI_party2,WV_party2,WY_party2
election,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1788,,,,,,,1.0,,1.0,,...,,,,,0.1202,,,,,
1792,,,,,,,1.0,,1.0,,...,,,,,1.0,0.0,,,,
1796,,,,,,,1.0,,1.0,,...,,1.0,,,0.672,0.0,,,,
1800,,,,,,,1.0,,1.0,,...,,1.0,,,0.7728,0.0,,,,
1804,,,,,,,1.0,,1.0,,...,,1.0,,,0.9886,1.0,,,,


## 3. Cluster elections

In [167]:
####################
# Define custom functions for use in modeling
####################
def silhouette_scorer_factory(step_name: str) -> Callable[[pipeline.Pipeline, np.ndarray], float]:
    """
    Factory function to create a parameterized silhouette_scorer()

    The returned callable takes ``(estimator, X)`` and can be passed as the
    ``scoring`` argument of a grid search.

    :param step_name:   name of the step containing the clustering model.
    :return:            callable silhouette scorer function.
    """
    def silhouette_scorer(estimator: pipeline.Pipeline, X: np.ndarray) -> float:
        """
        Silhouette scorer for use in grid search.

        The cluster labels are read from the ``labels_`` attribute of the
        fitted clustering step rather than predicted for ``X``.
        NOTE(review): this presumably requires ``X`` to be the same data the
        pipeline was fitted on -- confirm against the CV splitter used.

        :param estimator:   fitted modeling pipeline.
        :param X:           feature matrix.
        :return:            silhouette score of clusters, or 0.0 when the
                            labeling is degenerate (fewer than two non-outlier
                            clusters, or as many distinct labels as samples).
        """
        # Apply the transforms of all steps before the clustering step
        for name, step in estimator.named_steps.items():
            if name == step_name:
                break
            if isinstance(step, base.TransformerMixin):
                X = step.transform(X)

        # Labels assigned when the clustering step was fitted
        labels = np.asarray(estimator.named_steps[step_name].labels_)

        # Count clusters ignoring the outlier label (-1)
        n_clusters = np.unique(labels[labels != -1]).size
        # Count all distinct labels; silhouette_score raises if this reaches
        # the number of samples
        n_labels = np.unique(labels).size

        if n_clusters < 2 or n_labels >= labels.size:
            return 0.0

        # Outlier-labeled samples (if any) are scored as a group of their own,
        # since the full label array is passed through.
        return float(metrics.silhouette_score(X, labels))
    return silhouette_scorer



### 3.1 K-Means

In [184]:
####################
# Construct modeling pipeline
####################
# Fixed seed so K-Means initialisation (and PCA, if a randomized solver is
# selected) gives the same clusters and best parameters on every run.
RANDOM_STATE = 42

# Lay out pipeline steps: impute missing shares -> standardize -> PCA -> K-Means
kmeans_pipeline = pipeline.Pipeline(steps=[
    ("imputer", impute.SimpleImputer(strategy="median")),
    ("scaler", preprocessing.StandardScaler()),
    ("pca", decomposition.PCA(random_state=RANDOM_STATE)),
    ("kmeans", cluster.KMeans(random_state=RANDOM_STATE))
])

# Define our parameter grid
kmeans_grid = {
    "pca__n_components": np.arange(2, 50),
    "kmeans__n_clusters": np.arange(2, 20)
}

# Define our grid search object
scorer = silhouette_scorer_factory(step_name="kmeans")
kmeans_search = model_selection.GridSearchCV(
    kmeans_pipeline, 
    kmeans_grid, 
    scoring=scorer, 
    # Single "split" whose train and test indices are both the full dataset
    cv=[(slice(None), slice(None))]
)

####################
# Fit K-Means model
####################
# Fit the grid search
kmeans_search.fit(election_features)

# Best parameters and the corresponding score
best_params = kmeans_search.best_params_
best_score = kmeans_search.best_score_

print("Best Parameters:", best_params)
print("Best Silhouette Score:", best_score)


Best Parameters: {'kmeans__n_clusters': np.int64(11), 'pca__n_components': np.int64(2)}
Best Silhouette Score: 0.47565601885295766


  _data = np.array(data, dtype=dtype, copy=copy,


In [185]:
####################
# Show labeled elections
####################
# The fitted labels follow the row order of `election_features`, so key them by
# its election-year index and look each national row up by year. This avoids
# relying on `national` having the same rows in the same order as the features.
# An election with no row in `election_features` gets NaN as its cluster.
kmeans_labels = pd.Series(
    kmeans_search.best_estimator_["kmeans"].labels_,
    index=election_features.index,
    name="cluster",
)
elections_labeled_kmeans = national.assign(cluster=national["election"].map(kmeans_labels))
elections_labeled_kmeans

Unnamed: 0,election,party_one,party_two,major_margin,party_three,cluster
0,1788,0.8737,0.1263,0.7474,0.0,6
1,1792,0.6063,0.3937,0.2126,0.0,10
2,1796,0.533,0.467,0.066,0.0,10
3,1800,0.394,0.605,-0.211,0.001,10
4,1804,0.267,0.732,-0.465,0.001,8
5,1808,0.317,0.65,-0.333,0.033,1
6,1812,0.454,0.523,-0.069,0.023,1
7,1816,0.132,0.729,-0.597,0.139,8
8,1820,0.1894,0.8008,-0.6114,0.0098,5
9,1824,0.0,0.972,-0.972,0.028,5


### 3.2 HDBSCAN

In [188]:
####################
# Construct modeling pipeline
####################
# Pipeline steps: impute missing shares -> standardize -> PCA -> HDBSCAN
model_pipeline = pipeline.Pipeline(
    steps=[
        ("imputer", impute.SimpleImputer(strategy="median")),
        ("scaler", preprocessing.StandardScaler()),
        ("pca", decomposition.PCA()),
        ("model", cluster.HDBSCAN()),
    ]
)

# Hyperparameter values to search over
model_grid = dict(
    pca__n_components=np.arange(2, 25),
    model__min_cluster_size=np.arange(2, 8),
    model__cluster_selection_epsilon=np.arange(0, 2, 0.1),
)

# Grid search scored by silhouette; the single CV "split" uses the full
# dataset as both train and test indices.
scorer = silhouette_scorer_factory(step_name="model")
model_search = model_selection.GridSearchCV(
    estimator=model_pipeline,
    param_grid=model_grid,
    scoring=scorer,
    cv=[(slice(None), slice(None))],
)

####################
# Fit HDBSCAN model
####################
# Run the grid search
model_search.fit(election_features)

# Winning parameter combination and its score
best_params, best_score = model_search.best_params_, model_search.best_score_

print("Best Parameters:", best_params)
print("Best Silhouette Score:", best_score)


Best Parameters: {'model__cluster_selection_epsilon': np.float64(1.6), 'model__min_cluster_size': np.int64(2), 'pca__n_components': np.int64(2)}
Best Silhouette Score: 0.4399706822663427


In [189]:
####################
# Show labeled elections
####################
# The fitted labels follow the row order of `election_features`, so key them by
# its election-year index and look each national row up by year. This avoids
# relying on `national` having the same rows in the same order as the features.
# An election with no row in `election_features` gets NaN as its cluster.
model_labels = pd.Series(
    model_search.best_estimator_["model"].labels_,
    index=election_features.index,
    name="cluster",
)
elections_labeled_cluster = national.assign(cluster=national["election"].map(model_labels))
elections_labeled_cluster

Unnamed: 0,election,party_one,party_two,major_margin,party_three,cluster
0,1788,0.8737,0.1263,0.7474,0.0,8
1,1792,0.6063,0.3937,0.2126,0.0,5
2,1796,0.533,0.467,0.066,0.0,5
3,1800,0.394,0.605,-0.211,0.001,5
4,1804,0.267,0.732,-0.465,0.001,6
5,1808,0.317,0.65,-0.333,0.033,5
6,1812,0.454,0.523,-0.069,0.023,5
7,1816,0.132,0.729,-0.597,0.139,6
8,1820,0.1894,0.8008,-0.6114,0.0098,0
9,1824,0.0,0.972,-0.972,0.028,0
