In [10]:
import pathlib
from sklearn.decomposition import PCA
from tslearn.clustering import KShape
from tslearn.clustering import TimeSeriesKMeans
from sklearn.cluster import KMeans
from src.data import correlations
from src.plots import plotting_utils
import os
from plots import plot_state_map


data_dir_path = pathlib.Path("../../data")
models_path = os.path.abspath('../models')
utils_path = os.path.abspath('../utils')

In [11]:
%load_ext autoreload
%autoreload 2

### Loading data 

`voting_data_merged:` Voting trends for election years 2004-2016. Each year comprises of percentage of votes for democrates/republicans for 3 different age groups: 18-29, 30-44, 45-65.

In [12]:
voting_data_merged = correlations.merge_voting_by_years(data_dir_path)

### Interpolating data for voting 

Since we have available only the voting data for election years, in order to have more fine-grained data, we interpolate the voting trends for the years in between, to obtain the trends for each year 2004-2016.

In [13]:
interpolate_data = correlations.interpolate_votes(voting_data_merged)

### Clustering on original voting data

Clustering states based on their voting trends throughout the election years.

In [14]:
kmeans = KMeans(n_clusters=3, random_state=42, n_init="auto").fit(voting_data_merged)
states = voting_data_merged.index
cluster_labels = kmeans.labels_
state_clusters = dict(zip(states, cluster_labels))
print(state_clusters)

{'New York': 2, 'California': 2, 'New Hampshire': 0, 'Wisconsin': 0, 'Iowa': 0, 'Nevada': 0, 'Pennsylvania': 0, 'Virginia': 0, 'Ohio': 0, 'Florida': 0, 'North Carolina': 2, 'Arizona': 0, 'Indiana': 1, 'Georgia': 1, 'South Carolina': 1, 'Kentucky': 1, 'Texas': 1}


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [None]:
# Visualization of clustering on a lower dimension
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(voting_data_merged)

plotting_utils.plot_clustering(reduced_data[:, 0], reduced_data[:, 1], cluster_labels, states)

### Performing clustering on interpolated data

First, we need to prepare the data for time-series clustering. We are trying a few different clustering methods: 
- K-Shape
- K-Means
- Time Series K-Means using Dynamic Time Warping metric

In [16]:
# Drop all columns containing 'republican' in their names since they don't add more information 
interpolate_data_filt = interpolate_data.loc[:, ~interpolate_data.columns.str.contains('republican')]

# This leaves us with 17x39 dataframe: For each state we have a time series for 13 years for 3 different age groups

# Apply K-Shape
kshape = KShape(n_clusters=3, random_state=42)
kshape_labels = kshape.fit_predict(interpolate_data_filt)  
print("Cluster Labels K-Shape:", kshape_labels)

# Apply K-Means 
kmeans = KMeans(n_clusters=3, random_state=42, n_init="auto").fit(interpolate_data_filt)
states = interpolate_data_filt.index
kmeans_labels = kmeans.labels_
state_clusters = dict(zip(states, kmeans_labels))
print("Cluster Labels K-Means:", kmeans_labels)

# Apply Multivariate DTW-based clustering
tskmeans = TimeSeriesKMeans(n_clusters=3, metric="dtw", n_jobs=-1, random_state=42)
tskmeans_labels = tskmeans.fit_predict(interpolate_data_filt)  
state_clusters = dict(zip(states, tskmeans_labels))
print("Cluster Labels Time Series K-Means:", tskmeans_labels)


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Cluster Labels K-Shape: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
Cluster Labels K-Means: [2 2 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1]
Cluster Labels Time Series K-Means: [0 0 0 0 0 0 0 0 0 0 2 0 1 0 1 1 1]


### For further analysis we choose to proceed with results from K-Means

In [17]:
# Visualization of clustering on a lower dimension
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(interpolate_data_filt)

fig = plotting_utils.plot_clustering(reduced_data[:, 0], reduced_data[:, 1], kmeans_labels, states)
fig.write_html('clustering_states.html')

### Plotting the voting trends throughout the years

In [18]:
years = range(2004,2017)
election_years = [2004, 2008, 2012, 2016]
age_groups = ["18_29", "30_44", "45_64"]
reshaped_data = correlations.reshape_data(voting_data_merged, election_years, age_groups)

### For election years

In [19]:
plot_state_map.plot_vote_distribution(reshaped_data, age_groups)

### For all the years (interpolated data)

In [29]:
interpolated_data_reshaped = correlations.reshape_data(interpolate_data, years, age_groups)
plot_state_map.plot_vote_distribution(interpolated_data_reshaped, age_groups)