In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from numba import njit
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.preprocessing import LabelEncoder



@njit
def calculate_mentioned_together(ballots, num_candidates, num_ballots, num_ranks):
    """Count, for every ordered pair of candidates, their co-occurrences on ballots.

    ``ballots`` is indexed as ``ballots[ballot, rank]`` and holds integer
    candidate codes.  A code ``c`` is written to row/column ``c - 1``, i.e.
    the codes are treated as 1-based; codes greater than ``num_candidates``
    are skipped.  Every pair of rank positions on a ballot is visited,
    including a position paired with itself, so the diagonal is filled too.

    Returns a ``(num_candidates, num_candidates)`` float array of counts.
    """
    together = np.zeros((num_candidates, num_candidates))
    for ballot_idx in range(num_ballots):
        row = ballots[ballot_idx]
        for first_pos in range(num_ranks):
            first = row[first_pos]
            if first > num_candidates:
                continue
            for second_pos in range(num_ranks):
                second = row[second_pos]
                if second > num_candidates:
                    continue
                together[first - 1, second - 1] += 1
    return together


def perform_rcv_analysis(csv_file, ignore_patterns=None, metric=True):
    """Embed the candidates of one ranked-choice election with MDS.

    Ballots are read from the ``rank*`` columns of ``csv_file``.  A ballot is
    discarded when any of its entries matches one of ``ignore_patterns``.
    For every candidate pair the function counts how often the two appear in
    consecutive ranks, divides by how often they appear on the same ballot,
    averages both directions, and turns the result into the dissimilarity
    ``1 / sqrt(frequency)`` that is handed to scikit-learn's ``MDS``.

    Parameters
    ----------
    csv_file : str or file-like
        Anything accepted by ``pandas.read_csv``.
    ignore_patterns : list of str, optional
        Regular expressions tested with ``re.match`` (anchored at the start
        of the cell text).  Defaults to write-in / skipped / overvote markers.
    metric : bool, default True
        Forwarded to ``MDS``: ``True`` for metric, ``False`` for non-metric.

    Returns
    -------
    dict
        Keys ``"election"`` (the ``csv_file`` argument), ``"names"``
        (candidate labels in code order), ``"1d_mds"``, ``"2d_mds"`` (MDS
        coordinates) and ``"ballots_cast"`` (ballots kept after filtering).

    Raises
    ------
    ValueError
        If no ballots survive the filtering, or if no pair of distinct
        candidates ever occupies consecutive ranks.
    """
    if ignore_patterns is None:
        ignore_patterns = ['^WRITE-IN', '^writein', '^write in', '^Write-In', '^Write-in', '^skipped', '^overvote', '^uncommited']

    # Load the CSV file and keep only the 'rank' columns
    rank_df = pd.read_csv(csv_file).filter(regex='^rank')

    # One compiled alternation instead of re-parsing the pattern for every cell
    ignore_re = re.compile('|'.join(ignore_patterns))

    # NOTE(review): cells are cast to str first, so an empty (NaN) cell becomes
    # the literal text 'nan' and is kept as a candidate unless a pattern
    # matches it -- confirm whether blank ranks should be dropped instead.
    rank_df = rank_df.astype(str)
    for column in rank_df.columns:
        rank_df[column] = rank_df[column].map(lambda x: np.nan if ignore_re.match(x) else x)

    # Drop every ballot that contained at least one ignored entry
    rank_df = rank_df.dropna()
    if rank_df.empty:
        raise ValueError(f"No usable ballots in {csv_file} after filtering")

    # Encode candidate names as integer codes 0 .. num_candidates - 1
    raw_ballots = rank_df.to_numpy()
    label_encoder = LabelEncoder()
    ballots = (
        label_encoder.fit_transform(raw_ballots.ravel())
        .reshape(raw_ballots.shape)
        .astype(np.int64)
    )
    candidate_names = label_encoder.classes_
    num_candidates = len(candidate_names)
    num_ballots, num_ranks = ballots.shape

    # Count up frequencies of consecutive-pair ballot choices.
    # np.add.at accumulates repeated index pairs, unlike plain fancy-index +=.
    counts = np.zeros((num_candidates, num_candidates))
    np.add.at(counts, (ballots[:, :-1], ballots[:, 1:]), 1)

    # calculate_mentioned_together indexes with ``code - 1`` (1-based codes),
    # whereas LabelEncoder produces 0-based codes.  Shift by one so that its
    # rows/columns line up with ``counts``; passing the raw codes would send
    # candidate 0 to index -1 and rotate the whole matrix.
    mentioned_together = calculate_mentioned_together(ballots + 1, num_candidates, num_ballots, num_ranks)

    # Normalize to frequencies relative to votes cast for the two candidates.
    # Pairs never seen together give 0/0; those are handled below.
    with np.errstate(divide='ignore', invalid='ignore'):
        frequencies = counts / mentioned_together

    # Combine frequencies in either direction to create a symmetric matrix
    # with a zero diagonal
    freq_symmetric = (frequencies + frequencies.T) / 2
    np.fill_diagonal(freq_symmetric, 0)

    # Compute the distance metric
    positive_freqs = freq_symmetric[freq_symmetric > 0]
    if positive_freqs.size == 0:
        raise ValueError(f"No consecutive candidate pairs found in {csv_file}")
    min_freq = positive_freqs.min()
    with np.errstate(divide='ignore', invalid='ignore'):
        distance = 1 / np.sqrt(freq_symmetric)
    # Pairs with zero or undefined frequency get a fixed large dissimilarity
    distance[~np.isfinite(distance)] = 2 / min_freq
    np.fill_diagonal(distance, 0)

    # Perform MDS (metric or non-metric, per `metric`) for 1 dimension
    mds_1d = MDS(n_components=1, metric=metric, dissimilarity='precomputed', random_state=0)
    mds_1d_values = mds_1d.fit_transform(distance)

    # Perform MDS (metric or non-metric, per `metric`) for 2 dimensions
    mds_2d = MDS(n_components=2, metric=metric, dissimilarity='precomputed', random_state=0)
    mds_2d_values = mds_2d.fit_transform(distance)

    # Return the results
    return {
        "election": csv_file,
        "names": candidate_names,
        "1d_mds": mds_1d_values,
        "2d_mds": mds_2d_values,
        "ballots_cast": num_ballots
    }

In [None]:
# Run the RCV analysis for every metadata row whose CSV file is available
def iterate_over_metadata(metadata_df):
    """Analyse each election listed in ``metadata_df`` and collect the results.

    Rows whose ``file_exists`` entry is falsy are skipped.  For the others the
    CSV path is assembled from ``sub_dir_path`` and ``filename`` below the
    ``rcv_elections_database`` directory and handed to
    ``perform_rcv_analysis``; the row's ``race_id`` is attached to the
    returned dict.  A failure for one file is printed and does not stop the
    loop.

    Returns the list of result dicts, in row order.
    """
    base_dir = 'rcv_elections_database'
    analyses = []
    for _, record in metadata_df.iterrows():
        if not record['file_exists']:
            continue
        csv_file = f"{base_dir}/{record['sub_dir_path']}/{record['filename']}"
        try:
            analysis = perform_rcv_analysis(csv_file)
            analysis['race_id'] = record['race_id']
        except Exception as exc:
            print(f"Error processing file {csv_file}: {exc}")
        else:
            analyses.append(analysis)
    return analyses

In [None]:
# Read the election metadata from MatchedElections.csv
metadata_df = pd.read_csv('rcv_elections_database/MatchedElections.csv')

# Directories that are searched for election CSV files
dir_paths = ['rcv_elections_database/proportional',
             'rcv_elections_database/single',
             'rcv_elections_database/sequential']

# Collect the names of all files found in those directories
# (os is imported in the notebook's import cell)
file_names = []
for dir_path in dir_paths:
    file_names.extend(os.listdir(dir_path))

# Flag the metadata rows whose file name was found on disk.
# NOTE(review): only the bare file name is compared, not the directory it was
# found in -- confirm that file names are unique across the three directories.
metadata_df['file_exists'] = metadata_df['filename'].isin(file_names)

# Display the first few rows of the DataFrame
metadata_df.head()

In [None]:
# Count the number of existing and non-existing files
# (value_counts tallies the distinct values of the 'file_exists' flag)
file_exists_counts = metadata_df['file_exists'].value_counts()

# Bare expression so the notebook renders the tally as the cell output
file_exists_counts

In [None]:
# Run the function on the metadata DataFrame; `results` is the list of
# per-election result dicts returned by iterate_over_metadata
results = iterate_over_metadata(metadata_df)

In [None]:
# Print the 2-D MDS coordinates stored under '2d_mds' for every result
for result in results:
    print(result['2d_mds'])

In [None]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean

# Split the candidates of every election into two KMeans clusters, once on
# the 1-D embedding and once on the 2-D embedding, and record the labels and
# the distance between the two cluster centres.
# NOTE(review): KMeans is created without random_state, so labels may differ
# between runs.
for result in results:
    fitted = {}
    for n_dims, key in ((1, '1d_mds'), (2, '2d_mds')):
        # reshape(-1, n_dims) gives KMeans one row per candidate
        points = result[key].reshape(-1, n_dims)
        fitted[n_dims] = KMeans(n_clusters=2).fit(points)

    result['kmeans_1d_labels'] = fitted[1].labels_
    result['kmeans_2d_labels'] = fitted[2].labels_

    # cluster_centers_ holds exactly two rows; unpack them as the two points
    result['inter_centroid_distance_1d'] = euclidean(*fitted[1].cluster_centers_)
    result['inter_centroid_distance_2d'] = euclidean(*fitted[2].cluster_centers_)

# Order the results in place by increasing 1-D inter-centroid distance
results.sort(key=lambda entry: entry['inter_centroid_distance_1d'])

In [None]:
# Plot the 1-D and 2-D MDS embeddings of every election, largest election
# (by ballots cast) first.  matplotlib.pyplot is imported as `plt` in the
# notebook's import cell.
for result in sorted(results, key=lambda x: -x['ballots_cast']):
    title = result['election']
    names = result['names']
    Ncand = len(names)

    # Only elections with more than 2 candidates are plotted
    if Ncand <= 2:
        continue

    # Flatten the 1-D embedding so each coordinate is a plain scalar; passing
    # 1-element arrays as text coordinates relies on deprecated
    # array-to-scalar conversion.
    YY = np.ravel(result['1d_mds'])
    Y = result['2d_mds']
    kmeans_1d_labels = result['kmeans_1d_labels']
    kmeans_2d_labels = result['kmeans_2d_labels']

    print(f'1D Inter-centroid distance for {title}:', result['inter_centroid_distance_1d'])

    # Plot 1D MDS with KMeans clustering: points on the line x = 0, names to
    # their right
    plt.figure(figsize=(10, 6))
    plt.title(title + ' (1D MDS)')
    plt.scatter(np.zeros_like(YY), YY, c=kmeans_1d_labels)
    for i in range(Ncand):
        plt.text(0.2, YY[i], names[i])
    plt.axis([-1, 1.5, YY.min()*1.2, YY.max()*1.2])
    plt.show()

    print(f'2D Inter-centroid distance for {title}:', result['inter_centroid_distance_2d'])

    # Plot 2D MDS with KMeans clustering
    plt.figure(figsize=(10, 6))
    plt.title(title + ' (2D MDS)')
    plt.scatter(Y[:, 0], Y[:, 1], c=kmeans_2d_labels)
    for i in range(Ncand):
        plt.text(Y[i, 0], Y[i, 1], names[i])
    plt.grid(True)
    plt.show()

In [None]:
import matplotlib.pyplot as plt

# Draw the 1-D and 2-D MDS maps of each election, ordered by increasing 1-D
# inter-centroid distance
for result in sorted(results, key=lambda entry: entry['inter_centroid_distance_1d']):
    title = result['election']
    names = result['names']
    Ncand = len(names)

    # Elections with two candidates or fewer are not plotted
    if Ncand <= 2:
        continue

    YY, Y = result['1d_mds'], result['2d_mds']
    kmeans_1d_labels = result['kmeans_1d_labels']
    kmeans_2d_labels = result['kmeans_2d_labels']

    print(f'1D Inter-centroid distance for {title}:', result['inter_centroid_distance_1d'])

    # 1-D map: all points share x = 0, candidate names are written beside them
    plt.figure(figsize=(10, 6))
    plt.title(title + ' (1D MDS)')
    plt.scatter(np.zeros_like(YY), YY, c=kmeans_1d_labels)
    for position, name in zip(YY, names):
        plt.text(0.2, position, name)
    lower, upper = YY.min() * 1.2, YY.max() * 1.2
    plt.axis([-1, 1.5, lower, upper])
    plt.show()

    print(f'2D Inter-centroid distance for {title}:', result['inter_centroid_distance_2d'])

    # 2-D map coloured by the 2-D cluster labels
    plt.figure(figsize=(10, 6))
    plt.title(title + ' (2D MDS)')
    plt.scatter(Y[:, 0], Y[:, 1], c=kmeans_2d_labels)
    for (x_pos, y_pos), name in zip(Y, names):
        plt.text(x_pos, y_pos, name)
    plt.grid(True)
    plt.show()

In [None]:
# Function to iterate over all rows in the metadata DataFrame, using
# non-metric MDS by default (replaces the earlier definition of the same name)
def iterate_over_metadata(metadata_df, metric=False):
    """Analyse each election listed in ``metadata_df`` and collect the results.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Must provide the columns ``file_exists``, ``sub_dir_path``,
        ``filename`` and ``race_id``.
    metric : bool, default False
        Forwarded to ``perform_rcv_analysis``; ``False`` requests non-metric
        MDS.

    Returns
    -------
    list of dict
        One ``perform_rcv_analysis`` result per processed row, each extended
        with the row's ``race_id``.  Rows with a falsy ``file_exists`` are
        skipped; a failure for one file is printed and the loop continues.
    """
    results = []
    base_dir = 'rcv_elections_database'
    for idx, row in metadata_df.iterrows():
        if row['file_exists']:
            csv_file = f"{base_dir}/{row['sub_dir_path']}/{row['filename']}"
            try:
                # The analysis function is perform_rcv_analysis; the previous
                # call to the undefined name `rcv_analysis` raised NameError,
                # which the handler below swallowed for every file.
                result = perform_rcv_analysis(csv_file, metric=metric)
                result['race_id'] = row['race_id']
                results.append(result)
            except Exception as e:
                print(f"Error processing file {csv_file}: {e}")
    return results

In [None]:
# Run the function on the metadata DataFrame; `results_nm` receives the list
# returned by the iterate_over_metadata definition in the preceding cell,
# which requests metric=False
results_nm = iterate_over_metadata(metadata_df)

In [None]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean

# Perform KMeans clustering on each non-metric result.  This cell follows the
# computation of `results_nm`, so it must process that list; looping over
# `results` only re-clustered the earlier metric embeddings.
for result in results_nm:
    YY = result['1d_mds']
    Y = result['2d_mds']

    # Perform KMeans clustering on 1D MDS data (one row per candidate)
    kmeans_1d = KMeans(n_clusters=2)
    kmeans_1d.fit(YY.reshape(-1, 1))
    centroid1_1d, centroid2_1d = kmeans_1d.cluster_centers_
    inter_centroid_distance_1d = euclidean(centroid1_1d, centroid2_1d)
    result['kmeans_1d_labels'] = kmeans_1d.labels_

    # Perform KMeans clustering on 2D MDS data
    kmeans_2d = KMeans(n_clusters=2)
    kmeans_2d.fit(Y)
    centroid1_2d, centroid2_2d = kmeans_2d.cluster_centers_
    inter_centroid_distance_2d = euclidean(centroid1_2d, centroid2_2d)
    result['kmeans_2d_labels'] = kmeans_2d.labels_

    # Store the inter-centroid distances in the result dictionary
    result['inter_centroid_distance_1d'] = inter_centroid_distance_1d
    result['inter_centroid_distance_2d'] = inter_centroid_distance_2d

# Sort the non-metric results by the 1D inter-centroid distance
results_nm.sort(key=lambda x: x['inter_centroid_distance_1d'])

In [None]:
# Plot the non-metric MDS embeddings (`results_nm`), largest election first.
# Looping over `results` here would only repeat the metric plots drawn
# earlier in the notebook.
for result in sorted(results_nm, key=lambda x: -x['ballots_cast']):
    title = result['election']
    names = result['names']
    Ncand = len(names)

    # Check if there are more than 2 candidates
    if Ncand > 2:
        YY = result['1d_mds']
        Y = result['2d_mds']
        # Cluster labels and distances are optional: when a result has not
        # been clustered, .get yields None and scatter falls back to its
        # default colour.
        kmeans_1d_labels = result.get('kmeans_1d_labels')
        kmeans_2d_labels = result.get('kmeans_2d_labels')

        print(f'1D Inter-centroid distance for {title}:', result.get('inter_centroid_distance_1d'))

        # Plot 1D MDS with KMeans clustering
        plt.figure(figsize=(10, 6))
        plt.title(title + ' (1D MDS)')
        plt.scatter(np.zeros_like(YY), YY, c=kmeans_1d_labels)
        for i in range(Ncand):
            plt.text(0.2, YY[i], names[i])
        plt.axis([-1, 1.5, YY.min()*1.2, YY.max()*1.2])
        plt.show()

        print(f'2D Inter-centroid distance for {title}:', result.get('inter_centroid_distance_2d'))

        # Plot 2D MDS with KMeans clustering
        plt.figure(figsize=(10, 6))
        plt.title(title + ' (2D MDS)')
        plt.scatter(Y[:, 0], Y[:, 1], c=kmeans_2d_labels)
        for i in range(Ncand):
            plt.text(Y[i, 0], Y[i, 1], names[i])
        plt.grid(True)
        plt.show()

In [None]:
from sklearn.manifold import MDS
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
import matplotlib.pyplot as plt
import numpy as np

# Restrict the cast-vote data to a single election.
# NOTE(review): `cast_vote_data` is not defined in any cell visible here --
# confirm it is loaded earlier.
election_file = 'Burlington_03072023_CityCouncilCentralDistrict.csv'
election_df = cast_vote_data.loc[cast_vote_data['source_file'] == election_file].copy()

# Integer-encode every rank column; empty ranks become the label 'Missing'.
# NOTE(review): the encoder is refit for each column, so a given integer need
# not denote the same candidate in different rank columns.
le = LabelEncoder()
rank_columns = [name for name in election_df.columns if name.startswith('rank')]
for col in rank_columns:
    election_df[col] = le.fit_transform(election_df[col].fillna('Missing'))

# Non-metric MDS of the encoded ballots down to two dimensions
ballot_features = election_df.drop(columns=['source_file', 'type'])
mds = MDS(n_components=2, metric=False, dissimilarity='euclidean')
mds_coordinates = mds.fit_transform(ballot_features)

# Two-cluster KMeans on the embedded ballots
kmeans = KMeans(n_clusters=2).fit(mds_coordinates)

# Distance between the two cluster centres
centroid1, centroid2 = kmeans.cluster_centers_
inter_centroid_distance = euclidean(centroid1, centroid2)

# Scatter plot of the embedding, coloured by cluster label
x_coords, y_coords = mds_coordinates[:, 0], mds_coordinates[:, 1]
plt.scatter(x_coords, y_coords, c=kmeans.labels_)
plt.title(f'MDS plot for election {election_file}')
plt.show()

print('Inter-centroid distance:', inter_centroid_distance)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
import matplotlib.pyplot as plt
import numpy as np

# Restrict the cast-vote data to a single election.
# NOTE(review): `cast_vote_data` is not defined in any cell visible here --
# confirm it is loaded earlier.
election_file = 'Burlington_03072023_CityCouncilCentralDistrict.csv'
election_df = cast_vote_data.loc[cast_vote_data['source_file'] == election_file].copy()

# Integer-encode every rank column; empty ranks become the label 'Missing'.
# NOTE(review): the encoder is refit for each column, so a given integer need
# not denote the same candidate in different rank columns.
le = LabelEncoder()
rank_columns = [name for name in election_df.columns if name.startswith('rank')]
for col in rank_columns:
    election_df[col] = le.fit_transform(election_df[col].fillna('Missing'))

# Project the encoded ballots onto their first two principal components
ballot_features = election_df.drop(columns=['source_file', 'type'])
pca = PCA(n_components=2)
pca_coordinates = pca.fit_transform(ballot_features)

# Two-cluster KMeans on the projected ballots
kmeans = KMeans(n_clusters=2).fit(pca_coordinates)

# Distance between the two cluster centres
centroid1, centroid2 = kmeans.cluster_centers_
inter_centroid_distance = euclidean(centroid1, centroid2)

# Scatter plot of the projection, coloured by cluster label
x_coords, y_coords = pca_coordinates[:, 0], pca_coordinates[:, 1]
plt.scatter(x_coords, y_coords, c=kmeans.labels_)
plt.title(f'PCA plot for election {election_file}')
plt.show()

print('Inter-centroid distance:', inter_centroid_distance)

In [None]:
from sklearn.manifold import MDS
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial import distance
import numpy as np

inter_centroid_distances = []

# Number of elections to sample, capped at the number available because
# sampling without replacement cannot return more items than exist
all_elections = cast_vote_data['source_file'].unique()
n_samples = min(50, len(all_elections))

# Randomly sample a subset of the elections
elections_sample = np.random.choice(all_elections, size=n_samples, replace=False)

# Iterate over the sampled elections
for election in elections_sample:
    # Filter DataFrame to include only the current election
    election_df = cast_vote_data[cast_vote_data['source_file'] == election].copy()

    # Convert candidate names to numeric IDs for MDS
    le = LabelEncoder()
    for col in election_df.columns:
        if col.startswith('rank'):
            # Handle missing values by replacing them with a placeholder
            election_df[col] = election_df[col].fillna('Missing')
            election_df[col] = le.fit_transform(election_df[col])

    # Perform MDS
    mds = MDS(n_components=2)
    mds_coordinates = mds.fit_transform(election_df.drop(columns=['source_file', 'type']))

    # Perform KMeans clustering on the MDS embedding of THIS election.
    # Fitting on `pca_coordinates` used a stale array left over from another
    # cell, so the labels did not belong to the points plotted below.
    kmeans = KMeans(n_clusters=2)  # assuming two clusters; adjust as needed
    kmeans.fit(mds_coordinates)

    # Compute the inter-centroid distance and store it
    centroid1, centroid2 = kmeans.cluster_centers_
    inter_centroid_distance = distance.euclidean(centroid1, centroid2)
    inter_centroid_distances.append(inter_centroid_distance)

    # Plot the MDS coordinates and color code them by cluster label
    plt.scatter(mds_coordinates[:, 0], mds_coordinates[:, 1], c=kmeans.labels_)
    plt.title(f'MDS plot for election {election}')
    plt.show()

# List of inter-centroid distances for the sampled elections, convert it to a pandas Series for easier manipulation
inter_centroid_distances = pd.Series(inter_centroid_distances, index=elections_sample)

# Order the elections by number of ballots cast (ascending).  The order is
# computed here so the cell does not depend on a variable that is only
# defined further down the notebook; elections outside the sample get NaN.
ballot_order = cast_vote_data['source_file'].value_counts().sort_values().index
inter_centroid_distances = inter_centroid_distances.reindex(ballot_order)

# Plot the inter-centroid distances
inter_centroid_distances.plot(kind='barh', figsize=(10, 15))
plt.xlabel('Inter-centroid distance')
plt.ylabel('Election')
plt.title('Inter-centroid distances for each election')
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial import distance
import numpy as np

inter_centroid_distances = []

# One PCA + KMeans pass per election found in the cast-vote data
for election in cast_vote_data['source_file'].unique():
    # Ballots of the current election only
    election_df = cast_vote_data.loc[cast_vote_data['source_file'] == election].copy()

    # Integer-encode every rank column; empty ranks become the label 'Missing'
    le = LabelEncoder()
    for col in [name for name in election_df.columns if name.startswith('rank')]:
        election_df[col] = le.fit_transform(election_df[col].fillna('Missing'))

    # Project onto the first two principal components
    ballot_features = election_df.drop(columns=['source_file', 'type'])
    pca = PCA(n_components=2)
    pca_coordinates = pca.fit_transform(ballot_features)

    # Two-cluster KMeans on the projection
    kmeans = KMeans(n_clusters=2).fit(pca_coordinates)

    # Record the distance between the two cluster centres for this election
    centroid1, centroid2 = kmeans.cluster_centers_
    inter_centroid_distance = distance.euclidean(centroid1, centroid2)
    inter_centroid_distances.append((election, inter_centroid_distance))

# Table of inter-centroid distances, indexed by election
inter_centroid_df = pd.DataFrame(
    inter_centroid_distances,
    columns=['Election', 'Inter-centroid Distance'],
).set_index('Election')

# Largest distance first
sorted_inter_centroid_df = inter_centroid_df.sort_values(
    by='Inter-centroid Distance', ascending=False
)

print(sorted_inter_centroid_df)

# Horizontal bar chart of the sorted distances
sorted_inter_centroid_df.plot(kind='barh', figsize=(10, 15), legend=False)
plt.xlabel('Inter-centroid Distance')
plt.ylabel('Election')
plt.title('Inter-centroid Distances for Each Election')
plt.show()

In [None]:
# Print each row in the sorted DataFrame one by one
# (the index value is the election, the single column holds the distance)
for idx, row in sorted_inter_centroid_df.iterrows():
    print(f"Election: {idx}, Inter-centroid Distance: {row['Inter-centroid Distance']}")

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
import matplotlib.pyplot as plt
import numpy as np

# Restrict the cast-vote data to a single election.
# NOTE(review): `cast_vote_data` is not defined in any cell visible here --
# confirm it is loaded earlier.
election_file = 'TakomaPark_11082022_Mayor.csv'
election_df = cast_vote_data.loc[cast_vote_data['source_file'] == election_file].copy()

# Integer-encode every rank column; empty ranks become the label 'Missing'.
# NOTE(review): the encoder is refit for each column, so a given integer need
# not denote the same candidate in different rank columns.
le = LabelEncoder()
rank_columns = [name for name in election_df.columns if name.startswith('rank')]
for col in rank_columns:
    election_df[col] = le.fit_transform(election_df[col].fillna('Missing'))

# Project the encoded ballots onto their first two principal components
ballot_features = election_df.drop(columns=['source_file', 'type'])
pca = PCA(n_components=2)
pca_coordinates = pca.fit_transform(ballot_features)

# Two-cluster KMeans on the projected ballots
kmeans = KMeans(n_clusters=2).fit(pca_coordinates)

# Distance between the two cluster centres
centroid1, centroid2 = kmeans.cluster_centers_
inter_centroid_distance = euclidean(centroid1, centroid2)

# Scatter plot of the projection, coloured by cluster label
x_coords, y_coords = pca_coordinates[:, 0], pca_coordinates[:, 1]
plt.scatter(x_coords, y_coords, c=kmeans.labels_)
plt.title(f'PCA plot for election {election_file}')
plt.show()

print('Inter-centroid distance:', inter_centroid_distance)

In [None]:
from sklearn.manifold import MDS
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
import matplotlib.pyplot as plt
import numpy as np

# Restrict the cast-vote data to a single election.
# NOTE(review): `cast_vote_data` is not defined in any cell visible here --
# confirm it is loaded earlier.
election_file = 'TakomaPark_11082022_Mayor.csv'
election_df = cast_vote_data.loc[cast_vote_data['source_file'] == election_file].copy()

# Integer-encode every rank column for MDS; empty ranks become 'Missing'.
# NOTE(review): the encoder is refit for each column, so a given integer need
# not denote the same candidate in different rank columns.
le = LabelEncoder()
rank_columns = [name for name in election_df.columns if name.startswith('rank')]
for col in rank_columns:
    election_df[col] = le.fit_transform(election_df[col].fillna('Missing'))

# MDS (scikit-learn defaults) of the encoded ballots down to two dimensions
ballot_features = election_df.drop(columns=['source_file', 'type'])
mds = MDS(n_components=2)
mds_coordinates = mds.fit_transform(ballot_features)

# Two-cluster KMeans on the embedded ballots
kmeans = KMeans(n_clusters=2).fit(mds_coordinates)

# Distance between the two cluster centres
centroid1, centroid2 = kmeans.cluster_centers_
inter_centroid_distance = euclidean(centroid1, centroid2)

# Scatter plot of the embedding, coloured by cluster label
x_coords, y_coords = mds_coordinates[:, 0], mds_coordinates[:, 1]
plt.scatter(x_coords, y_coords, c=kmeans.labels_)
plt.title(f'MDS plot for election {election_file}')
plt.show()

print('Inter-centroid distance:', inter_centroid_distance)

In [None]:
import matplotlib.pyplot as plt

# Ballots cast per election (rows per 'source_file'), smallest first
ballots_per_election = cast_vote_data['source_file'].value_counts().sort_values()

# Horizontal bar chart of the counts on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 15))
ballots_per_election.plot(kind='barh', ax=ax)
ax.set_xlabel('Number of ballots cast')
ax.set_ylabel('Election')
ax.set_title('Number of ballots cast in each election')
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial import distance
import numpy as np

inter_centroid_distances = []

# Number of elections to sample, capped at the number available because
# sampling without replacement cannot return more items than exist
all_elections = cast_vote_data['source_file'].unique()
n_samples = min(50, len(all_elections))

# Randomly sample a subset of the elections
elections_sample = np.random.choice(all_elections, size=n_samples, replace=False)

# Iterate over the sampled elections
for election in elections_sample:
    # Filter DataFrame to include only the current election
    election_df = cast_vote_data[cast_vote_data['source_file'] == election].copy()

    # Convert candidate names to numeric IDs for PCA
    le = LabelEncoder()
    for col in election_df.columns:
        if col.startswith('rank'):
            # Handle missing values by replacing them with a placeholder
            election_df[col] = election_df[col].fillna('Missing')
            election_df[col] = le.fit_transform(election_df[col])

    # Perform PCA
    pca = PCA(n_components=2)
    pca_coordinates = pca.fit_transform(election_df.drop(columns=['source_file', 'type']))

    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=2)  # assuming two clusters; adjust as needed
    kmeans.fit(pca_coordinates)

    # Compute the inter-centroid distance and store it
    centroid1, centroid2 = kmeans.cluster_centers_
    inter_centroid_distance = distance.euclidean(centroid1, centroid2)
    inter_centroid_distances.append(inter_centroid_distance)

    # Plot the PCA coordinates and color code them by cluster label
    plt.scatter(pca_coordinates[:, 0], pca_coordinates[:, 1], c=kmeans.labels_)
    plt.title(f'PCA plot for election {election}')
    plt.show()

# List of inter-centroid distances for the sampled elections, convert it to a pandas Series for easier manipulation
inter_centroid_distances = pd.Series(inter_centroid_distances, index=elections_sample)

# Reindex the inter_centroid_distances Series to match the order of the ballots_per_election Series
# (elections that were not sampled end up as NaN)
inter_centroid_distances = inter_centroid_distances.reindex(ballots_per_election.index)

# Plot the inter-centroid distances
inter_centroid_distances.plot(kind='barh', figsize=(10, 15))
plt.xlabel('Inter-centroid distance')
plt.ylabel('Election')
plt.title('Inter-centroid distances for each election')
plt.show()