# Cluster US Presidential elections 1788-2024
Here, we will use machine learning to re-examine the party system in the United States through the lens of presidential election results across states.

In [1]:
import numpy as np
import pandas as pd
from sklearn import base, pipeline, impute, preprocessing, decomposition, cluster, metrics, model_selection
from typing import Any, Callable

import seaborn as sns

## 0. Load election data

In [2]:
####################
# Load election data
####################
# Load data
state_results = pd.read_csv("../data/state_election_history.csv")
national_results = pd.read_csv("../data/national_election_history.csv")

# Prep state name as abbreviation
state_abbrev_map = {
        "Alaska": "AK",
        "Alabama": "AL",
        "Arkansas": "AR",
        "Arizona": "AZ",
        "California": "CA",
        "Colorado": "CO",
        "Connecticut": "CT",
        "District of Columbia": "DC",
        "Delaware": "DE",
        "Florida": "FL",
        "Georgia": "GA",
        "Hawaii": "HI",
        "Iowa": "IA",
        "Idaho": "ID",
        "Illinois": "IL",
        "Indiana": "IN",
        "Kansas": "KS",
        "Kentucky": "KY",
        "Louisiana": "LA",
        "Massachusetts": "MA",
        "Maryland": "MD",
        "Maine": "ME",
        "Michigan": "MI",
        "Minnesota": "MN",
        "Missouri": "MO",
        "Mississippi": "MS",
        "Montana": "MT",
        "North Carolina": "NC",
        "North Dakota": "ND",
        "Nebraska": "NE",
        "New Hampshire": "NH",
        "New Jersey": "NJ",
        "New Mexico": "NM",
        "Nevada": "NV",
        "New York": "NY",
        "Ohio": "OH",
        "Oklahoma": "OK",
        "Oregon": "OR",
        "Pennsylvania": "PA",
        "Rhode Island": "RI",
        "South Carolina": "SC",
        "South Dakota": "SD",
        "Tennessee": "TN",
        "Texas": "TX",
        "Utah": "UT",
        "Virginia": "VA",
        "Vermont": "VT",
        "Washington": "WA",
        "Wisconsin": "WI",
        "West Virginia": "WV",
        "Wyoming": "WY",
        "Puerto Rico": "PR",
        "Virigin Islands": "VI"
    }
state_results["state_full"] = state_results["state"]
state_results["state"] = state_results["state"].map(state_abbrev_map)

# Drop 2024 results for now
national_results = national_results[national_results.election != 2024]

display(state_results.head())
display(national_results.head())

Unnamed: 0,election,state,federalist,anti_federalist,democratic_republican,democratic,national_republican,whig,republican,third_party,NOTE,state_full
0,1788,CT,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"No popular vote. Instead, vote by state legisl...",Connecticut
1,1788,DE,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Delaware
2,1788,GA,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"No popular vote. Instead, vote by state legisl...",Georgia
3,1788,MD,0.7709,0.2291,0.0,0.0,0.0,0.0,0.0,0.0,,Maryland
4,1788,MA,0.9714,0.0286,0.0,0.0,0.0,0.0,0.0,0.0,,Massachusetts


Unnamed: 0,election,federalist,anti_federalist,democratic_republican,democratic,national_republican,whig,republican,third_party,NOTE
0,1788,0.8737,0.1263,0.0,0.0,0.0,0.0,0.0,0.0,George Washington was nonpartisan and received...
1,1792,0.6063,0.0,0.3937,0.0,0.0,0.0,0.0,0.0,George Washington was nonpartisan and received...
2,1796,0.533,0.0,0.467,0.0,0.0,0.0,0.0,0.0,
3,1800,0.394,0.0,0.605,0.0,0.0,0.0,0.0,0.001,
4,1804,0.267,0.0,0.732,0.0,0.0,0.0,0.0,0.001,


## 1. Standardize political parties and vote margins

In [3]:
####################
# Create standard two-party votes across time
####################
# Set up lineage of the two main parties
party_one = ["federalist", "national_republican", "whig", "republican"]
party_two = ["anti_federalist", "democratic_republican", "democratic"]

# National results: create two-party vote shares and margin
national = (
    national_results.copy()
    .rename(columns={"third_party": "party_three"})
)
national["party_one"] = national[party_one].sum(axis=1)
national["party_two"] = national[party_two].sum(axis=1)
national["major_margin"] = national.party_one - national.party_two

national = national[national.election >= 1856]

national = national[["election", "party_one", "party_two", "major_margin", "party_three"]]
display(national.head())

# State results: create two-party vote shares and margin
state = (
    state_results.copy()
    .rename(columns={"third_party": "party_three"})
)
state["party_one"] = state[party_one].sum(axis=1)
state["party_two"] = state[party_two].sum(axis=1)
state["major_margin"] = state.party_one - state.party_two
state = state[["election", "state", "party_one", "party_two", "major_margin", "party_three"]]


####################
# Calculate relative margin (i.e., partisan lean)
####################
state = state.merge(
    national[["election", "major_margin"]].rename(columns={"major_margin": "national_margin"})
)
state["relative_margin"] = state.major_margin - state.national_margin
display(state.head())

Unnamed: 0,election,party_one,party_two,major_margin,party_three
17,1856,0.331,0.453,-0.122,0.216
18,1860,0.398,0.476,-0.078,0.126
19,1864,0.551,0.449,0.102,0.0
20,1868,0.527,0.473,0.054,0.0
21,1872,0.556,0.438,0.118,0.006


Unnamed: 0,election,state,party_one,party_two,major_margin,party_three,national_margin,relative_margin
0,1856,AL,0.0,0.6208,-0.6208,0.3792,-0.122,-0.4988
1,1856,AR,0.0,0.6712,-0.6712,0.3288,-0.122,-0.5492
2,1856,CA,0.1878,0.4838,-0.296,0.3284,-0.122,-0.174
3,1856,CT,0.5318,0.4357,0.0961,0.0325,-0.122,0.2181
4,1856,DE,0.0212,0.5483,-0.5271,0.4305,-0.122,-0.4051


## 2. Create feature dataset

We will create two feature sets:
1. two-party vote margin only
2. two-party vote margin and third-party vote share

In [29]:
# Major party features
election_features_major = (
    state[["election", "state", "relative_margin"]]
    .pivot(index="election", columns="state", values="relative_margin")
    .add_suffix("_margin")
    .rename_axis(None, axis=1)
)

# Third party features
election_features_third = (
    state[["election", "state", "party_three"]]
    .pivot(index="election", columns="state", values="party_three")
    .add_suffix("_third")
    .rename_axis(None, axis=1)
)

# Party one features
election_features_major1 = (
    state[["election", "state", "party_one"]]
    .pivot(index="election", columns="state", values="party_one")
    .add_suffix("_party1")
    .rename_axis(None, axis=1)
)

# Party two features
election_features_major2 = (
    state[["election", "state", "party_two"]]
    .pivot(index="election", columns="state", values="party_two")
    .add_suffix("_party2")
    .rename_axis(None, axis=1)
)

# Create feature matrix and row/column labels
election_features = election_features_major
# election_features = election_features_major.join(election_features_third)
# election_features = (
#     election_features_major1
#     .join(election_features_major2)
#     .join(election_features_third)
# )

election_years = election_features.index
feature_labels = election_features.columns

In [30]:
# Show feature matrix
pd.DataFrame(election_features, columns=feature_labels, index=election_years).head()

Unnamed: 0_level_0,AK_margin,AL_margin,AR_margin,AZ_margin,CA_margin,CO_margin,CT_margin,DC_margin,DE_margin,FL_margin,...,SD_margin,TN_margin,TX_margin,UT_margin,VA_margin,VT_margin,WA_margin,WI_margin,WV_margin,WY_margin
election,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1856,,-0.4988,-0.5492,,-0.174,,0.2181,,-0.4051,-0.4461,...,,-0.3998,-0.5439,,-0.4776,0.6932,,0.2328,,
1860,,-0.6131,-0.5515,,-0.1995,,0.1965,,-0.2063,-0.5613,...,,-0.4447,-0.677,,-0.4532,0.6006,,0.2108,,
1864,,,,,0.0707,,-0.0744,,-0.1382,,...,,0.6122,,,,0.42,,0.0156,0.2628,
1868,,-0.029,0.0196,,-0.0492,,-0.0242,,-0.234,0.5786,...,,0.3146,,,,0.5174,,0.071,0.1226,
1872,,-0.0542,-0.0746,,0.0146,,-0.0698,,-0.0756,-0.0476,...,,-0.1612,-0.2816,,-0.1082,0.4587,,-0.0217,-0.0734,


## 3. Determine PCA dimensions

In [39]:
####################
# Construct modeling pipeline
####################
# Lay out data preparation pipeline steps
data_pipeline = pipeline.Pipeline(steps=[
    ("imputer", impute.SimpleImputer(strategy="median")),
    ("scaler", preprocessing.StandardScaler()),
    ("pca", decomposition.PCA())
])


####################
# Determine minimum amount of components
####################
# Process data
data_pipeline.fit(election_features)

# Determine minimum components
MINIMUM_VARIANCE_EXPLAINED = 0.7
cumulative_explained_variance = data_pipeline["pca"].explained_variance_ratio_.cumsum()
acceptable_components = np.where(cumulative_explained_variance > MINIMUM_VARIANCE_EXPLAINED)[0] + 1 #account for zero indexing
minimum_components = min(acceptable_components)

print(f"To get at least {MINIMUM_VARIANCE_EXPLAINED * 100}% of data variance explained, we will use a minimum of {minimum_components} components in PCA.")

To get at least 70.0% of data variance explained, we will use a minimum of 5 components in PCA.


## 4. Cluster elections

In [40]:
####################
# Define custom functions for use in modeling
####################
def silhouette_scorer_factory(step_name: str) -> Callable[[pipeline.Pipeline, np.ndarray], float]:
    """
    Factory function to create a parameterized silhouette_scorer()

    :param step_name:   name of the step containing the clustering model.
    :return:            callable silhouette scorer function.
    """
    def silhouette_scorer(estimator: pipeline.Pipeline, X: np.ndarray) -> float:
        """
        Silhouette scorer for use in grid search.

        :param estimator:   modeling pipeline.
        :param X:           feature matrix.
        :param step_name:   name of the step containing the clustering model.
        :return:            silhouette score of clusters.
        """
        # Apply the transforms of all steps before the clustering step
        for name, step in estimator.named_steps.items():
            if isinstance(step, base.TransformerMixin) and name != step_name:
                X = step.transform(X)
            if name == step_name:
                break
        # Get labels
        labels = estimator.named_steps[step_name].labels_

        # If there's more than one cluster, proceed, otherwise return 0
        labels_no_outliers = [l for l in labels if l != -1]
        if len( np.unique(labels_no_outliers) ) > 1:
            score = metrics.silhouette_score(X, labels)
        else:
            score = 0
        return score
    return silhouette_scorer



### 3.1 K-Means

In [41]:
####################
# Construct modeling pipeline
####################
# Lay out pipeline steps
kmeans_pipeline = pipeline.Pipeline(steps=[
    ("imputer", impute.SimpleImputer(strategy="median")),
    ("scaler", preprocessing.StandardScaler()),
    ("pca", decomposition.PCA()),
    ("kmeans", cluster.KMeans(max_iter=1000))
])

# Define our parameter grid
kmeans_grid = {
    "pca__n_components": np.arange(minimum_components, 25),
    "kmeans__n_clusters": np.arange(2, 20)
}

# Define our grid search object
scorer = silhouette_scorer_factory(step_name="kmeans")
kmeans_search = model_selection.GridSearchCV(
    kmeans_pipeline, 
    kmeans_grid, 
    scoring=scorer, 
    cv=[(slice(None), slice(None))]
)

####################
# Fit K-Means model
####################
# Fit the grid search
kmeans_search.fit(election_features)

# Best parameters and the corresponding score
best_params = kmeans_search.best_params_
best_score = kmeans_search.best_score_

print("Best Parameters:", best_params)
print("Best Silhouette Score:", best_score)


Best Parameters: {'kmeans__n_clusters': np.int64(9), 'pca__n_components': np.int64(5)}
Best Silhouette Score: 0.40803355234796646


In [42]:
####################
# Show labeled elections
####################
elections_labeled_kmeans = national.assign(cluster = kmeans_search.best_estimator_["kmeans"].labels_)
elections_labeled_kmeans

Unnamed: 0,election,party_one,party_two,major_margin,party_three,cluster
17,1856,0.331,0.453,-0.122,0.216,5
18,1860,0.398,0.476,-0.078,0.126,5
19,1864,0.551,0.449,0.102,0.0,2
20,1868,0.527,0.473,0.054,0.0,2
21,1872,0.556,0.438,0.118,0.006,6
22,1876,0.479,0.509,-0.03,0.012,6
23,1880,0.4832,0.4821,0.0011,0.0347,6
24,1884,0.483,0.488,-0.005,0.029,6
25,1888,0.478,0.486,-0.008,0.036,6
26,1892,0.43,0.46,-0.03,0.11,6


### 3.2 

In [352]:
####################
# Construct modeling pipeline
####################
# Lay out pipeline steps
model_pipeline = pipeline.Pipeline(steps=[
    ("imputer", impute.SimpleImputer(strategy="median")),
    ("scaler", preprocessing.StandardScaler()),
    ("pca", decomposition.PCA()),
    ("model", cluster.DBSCAN(metric="cosine"))
])

# Define our parameter grid
model_grid = {
    "pca__n_components": np.arange(minimum_components, 50),
    "model__min_samples": np.arange(2, 6),
    "model__eps": np.arange(0.1, 2, 0.1)
}

# Define our grid search object
scorer = silhouette_scorer_factory(step_name="model")
model_search = model_selection.GridSearchCV(
    model_pipeline, 
    model_grid, 
    scoring=scorer, 
    cv=[(slice(None), slice(None))]
)

####################
# Fit K-Means model
####################
# Fit the grid search
model_search.fit(election_features)

# Best parameters and the corresponding score
best_params = model_search.best_params_
best_score = model_search.best_score_

print("Best Parameters:", best_params)
print("Best Silhouette Score:", best_score)


Best Parameters: {'model__eps': np.float64(0.30000000000000004), 'model__min_samples': np.int64(2), 'pca__n_components': np.int64(3)}
Best Silhouette Score: 0.3994020672666113


532 fits failed out of a total of 3572.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
76 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ChrisTokita/Documents/Research/Independent Research/us-party-system-clustering/venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 886, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "/Users/ChrisTokita/Documents/Research/Independent Research/us-party-system-clustering/venv/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ChrisTokita/Documents/Research/Independent Research/us-party-s

In [353]:
####################
# Show labeled elections
####################
elections_labeled_cluster = national.assign(cluster = model_search.best_estimator_["model"].labels_)
elections_labeled_cluster

Unnamed: 0,election,party_one,party_two,major_margin,party_three,cluster
17,1856,0.331,0.453,-0.122,0.216,0
18,1860,0.398,0.476,-0.078,0.126,0
19,1864,0.551,0.449,0.102,0.0,1
20,1868,0.527,0.473,0.054,0.0,1
21,1872,0.556,0.438,0.118,0.006,0
22,1876,0.479,0.509,-0.03,0.012,0
23,1880,0.4832,0.4821,0.0011,0.0347,0
24,1884,0.483,0.488,-0.005,0.029,0
25,1888,0.478,0.486,-0.008,0.036,0
26,1892,0.43,0.46,-0.03,0.11,0


## 5. Save results

In [44]:
####################
# Save K-Means Election
####################
SAVE_KMEANS_RESULTS = True

if SAVE_KMEANS_RESULTS:
    elections_labeled_kmeans.to_csv("../data_derived/kmeans_clusters.csv", index=False)