In [3]:
import os
import pandas as pd

# Root directory of the ballot database and the election types it contains.
# Each subdirectory name doubles as the value of the `type` column.
main_dir_path = "rcv_elections_database/"

subdirectories = [
    "proportional",
    "sequential",
    "single"
]

# One DataFrame per CSV file; concatenated once at the end.
dfs = []

for subdir in subdirectories:
    subdir_path = os.path.join(main_dir_path, subdir)

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the combined frame the same from run to run.
    for filename in sorted(os.listdir(subdir_path)):
        if filename.endswith(".csv"):
            # low_memory=False makes pandas infer each column's dtype from the whole file.
            df = pd.read_csv(os.path.join(subdir_path, filename), low_memory=False)

            # Tag every ballot with its origin. os.path.splitext removes exactly
            # the ".csv" extension; str.strip('.csv') would instead strip any of
            # the characters '.', 'c', 's', 'v' from BOTH ends of the name
            # (e.g. "..._Commissioners.csv" would lose its final "s").
            df['source_file'] = os.path.splitext(filename)[0]
            df['type'] = subdir  # This will indicate the type of the election

            # Keep only columns that start with 'rank' or are 'source_file' or 'type'
            df = df[[col for col in df.columns if col.startswith('rank') or col in ['source_file', 'type']]]

            dfs.append(df)

# Concatenate all dataframes in the list into a single DataFrame
cast_vote_data = pd.concat(dfs, ignore_index=True)

# Reorder the columns so the identifying columns come first and the rank columns follow
cols_to_order = ['source_file', 'type']
new_columns = cols_to_order + (cast_vote_data.columns.drop(cols_to_order).tolist())
cast_vote_data = cast_vote_data[new_columns]

In [None]:
# Show the top five rows of the combined ballot table
print(cast_vote_data.head(n=5))

In [None]:
import pandas as pd
from fuzzywuzzy import fuzz, process

def match_file_to_metadata(filename, metadata, confirmed_matches, top_n=5):
    """Find the metadata row that best matches an election file name.

    The lookup key is the first two ``_``-separated tokens of ``filename``.
    Rows whose ``RaceID`` starts with that key are preferred; when several
    rows qualify, or none does, fuzzy string matching picks the best one.

    Parameters
    ----------
    filename : str
        Election file name (as stored in the ``source_file`` column).
    metadata : pandas.DataFrame
        Metadata table; must contain a ``RaceID`` column. It is not modified.
    confirmed_matches : dict
        Cache mapping lookup key -> matched metadata row. Updated in place.
    top_n : int, default 5
        Maximum number of fuzzy candidates requested from ``process.extract``.

    Returns
    -------
    pandas.Series or None
        The matched metadata row (or whatever is cached under the key), or
        ``None`` when fuzzy matching returns no candidate at all.

    Side effects
    ------------
    Every newly resolved key rewrites ``{main_dir_path}/MatchedMetadata.csv``
    with columns ``source_file`` (the lookup key) and ``matched_metadata``.
    NOTE(review): ``matched_metadata`` holds the *text rendering* of each row,
    so the file is not a lossless serialization of the matches.
    """
    # Split the filename at the underscores and take the first two elements
    key_to_match = '_'.join(filename.split('_')[:2])

    # Already resolved: reuse the cached row (no printing, no file write)
    if key_to_match in confirmed_matches:
        return confirmed_matches[key_to_match]

    # NaN-free view of the IDs for string matching. A local Series is used so
    # the caller's DataFrame is left untouched (the chained in-place fillna
    # mutated it and is deprecated in recent pandas).
    race_ids = metadata['RaceID'].fillna('')

    # "Exact" here means: RaceID starts with the lookup key
    is_prefix_match = race_ids.str.startswith(key_to_match)
    exact_match = metadata[is_prefix_match]
    exact_ids = race_ids[is_prefix_match]

    if not exact_match.empty:
        print(f'Exact match found for {filename}.')
        if len(exact_match) > 1:
            print(f'Multiple exact matches found for {filename}, applying fuzzy matching.')
            # Rank the prefix matches against the full file name
            top_matches = process.extract(filename, exact_ids, scorer=fuzz.token_sort_ratio, limit=top_n)

            for match in top_matches:
                print(f'Fuzzy match found for {filename} with score {match[1]}')

            if not top_matches:
                print(f'No fuzzy candidates returned for {filename}.')
                return None

            # Candidates come back best-first, so keep the FIRST one. Assigning
            # inside the loop above would leave the lowest-scoring candidate.
            best_race_id = top_matches[0][0]
            confirmed_matches[key_to_match] = exact_match[(exact_ids == best_race_id).to_numpy()].iloc[0]
        else:
            # If there's only one exact match, select it
            print(f'Single exact match found for {filename}')
            confirmed_matches[key_to_match] = exact_match.iloc[0]
    else:
        # If no exact match, use fuzzy matching against every RaceID
        print(f'No exact match found for {filename}, applying fuzzy matching.')
        top_matches = process.extract(key_to_match, race_ids, scorer=fuzz.token_sort_ratio, limit=top_n)

        if not top_matches:
            # Nothing to choose from (e.g. an empty metadata table)
            print(f'No fuzzy candidates returned for {filename}.')
            return None

        # Just pick the top match without user confirmation
        print(f'Top fuzzy match selected for {filename} with score {top_matches[0][1]}')
        confirmed_matches[key_to_match] = metadata[(race_ids == top_matches[0][0]).to_numpy()].iloc[0]

    # Save confirmed matches to CSV. The rows are converted to text explicitly,
    # which is what writing the row objects to CSV amounts to anyway.
    confirmed_matches_df = pd.DataFrame({
        'source_file': list(confirmed_matches.keys()),
        'matched_metadata': [str(matched) for matched in confirmed_matches.values()],
    })
    confirmed_matches_df.to_csv(f'{main_dir_path}/MatchedMetadata.csv', index=False)

    print(f'Match for {filename} saved to CSV.')
    return confirmed_matches[key_to_match]

# Separate metadata frames by type
metadata_single = pd.read_csv("rcv_elections_database/SingleWinnerRCV.csv")
metadata_sequential = pd.read_csv("rcv_elections_database/SequentialRCV.csv")
metadata_proportional = pd.read_csv("rcv_elections_database/ProportionalRCV.csv")

# The `type` column selects which metadata table an election is matched against
metadata_by_type = {
    "single": metadata_single,
    "sequential": metadata_sequential,
    "proportional": metadata_proportional,
}

# Create a dictionary to store confirmed matches
confirmed_matches = {}

# Matching depends only on (source_file, type), and match_file_to_metadata
# caches by file-name key, so each election is resolved once instead of once
# per ballot. drop_duplicates keeps first-appearance order, i.e. the same call
# order a row-by-row pass would produce.
elections = cast_vote_data[['source_file', 'type']].drop_duplicates()

matched_rows = []
for source_file, election_type in elections.itertuples(index=False, name=None):
    metadata_type = metadata_by_type.get(election_type)
    if metadata_type is None:
        continue  # Skip if type is not recognized

    matched_metadata = match_file_to_metadata(source_file, metadata_type, confirmed_matches)

    if matched_metadata is not None:
        matched_rows.append({
            'source_file': source_file,
            'type': election_type,
            'State': matched_metadata['State'],
            'Location': matched_metadata['Jurisdiction'],
            'Year': matched_metadata['Year'],
        })

# Explicit columns guarantee State / Location / Year exist even with zero matches,
# so the reordering below cannot fail on missing columns.
cols_to_order = ['source_file', 'type', 'State', 'Location', 'Year']
election_metadata = pd.DataFrame(matched_rows, columns=cols_to_order)

# Broadcast the per-election values onto every ballot. Dropping stale copies of
# the three columns first keeps the cell safe to re-run; a left merge preserves
# the ballot order.
cast_vote_data = (
    cast_vote_data
    .drop(columns=['State', 'Location', 'Year'], errors='ignore')
    .merge(election_metadata, on=['source_file', 'type'], how='left')
)

# Reordering the columns: identifying columns first, rank columns after
new_columns = cols_to_order + [col for col in cast_vote_data.columns if col not in cols_to_order]
cast_vote_data = cast_vote_data[new_columns]

In [None]:
# Show the top five rows now that State / Location / Year have been attached
print(cast_vote_data.head(n=5))

In [None]:
# Read the saved match file back from disk and show its top five rows
confirmed_matches_csv = f'{main_dir_path}/MatchedMetadata.csv'
confirmed_matches_df = pd.read_csv(filepath_or_buffer=confirmed_matches_csv)
print(confirmed_matches_df.head(n=5))

In [None]:
import pandas as pd

def use_saved_matches(filename, confirmed_matches_csv=f'{main_dir_path}/MatchedMetadata.csv'):
    """Return the saved ``matched_metadata`` entry for ``filename``.

    The match file is read on every call. Its ``source_file`` column is used
    as the lookup key and its ``matched_metadata`` column as the value; the
    key for ``filename`` is its first two ``_``-separated tokens joined by
    ``_``. Returns ``None`` when the key is not present in the file.
    """
    saved = pd.read_csv(confirmed_matches_csv)

    # Later rows win when a key appears more than once
    lookup = dict(zip(saved['source_file'], saved['matched_metadata']))

    leading_tokens = filename.split('_')[:2]
    return lookup.get('_'.join(leading_tokens))

def consolidate_data_using_saved_matches(cast_vote_data, confirmed_matches_csv=f'{main_dir_path}/MatchedMetadata.csv'):
    """Attach State / Location / Year to the ballots from a saved match file.

    Parameters
    ----------
    cast_vote_data : pandas.DataFrame
        Ballot table; must contain ``source_file`` and ``type`` columns.
        It is not modified.
    confirmed_matches_csv : str
        CSV with a ``source_file`` column (lookup key) and a
        ``matched_metadata`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame whose first columns are ``source_file``, ``type``,
        ``State``, ``Location``, ``Year``, followed by the remaining columns.
        Ballots without a saved match keep any existing values in the three
        metadata columns, or get missing values if the columns are new.

    Notes
    -----
    A ballot's lookup key is the first two ``_``-separated tokens of its
    ``source_file``. Values read back from the CSV are plain text, so each
    ``matched_metadata`` cell is parsed line by line as ``<label> <value>``.
    NOTE(review): this presumes the cell is the text rendering of a metadata
    row with single-word labels, as written by ``match_file_to_metadata`` -
    confirm if the file is produced any other way. ``Year`` is converted to a
    number; ``State`` and ``Location`` stay text.
    """

    def _parse_saved_metadata(raw):
        """Recover a {label: value} dict from one stored ``matched_metadata`` cell."""
        if not isinstance(raw, str):
            return {}  # empty cell (read back as NaN)
        fields = {}
        for line in raw.splitlines():
            parts = line.split(None, 1)
            # Skip blank/one-token lines and the trailing "Name: ..., dtype: ..."
            # style summary line, whose first token ends with a colon.
            if len(parts) != 2 or parts[0].endswith(':'):
                continue
            value = parts[1].strip()
            if value != 'NaN':  # textual rendering of a missing value
                fields[parts[0]] = value
        return fields

    # Load the confirmed matches from the CSV file
    confirmed_matches_df = pd.read_csv(confirmed_matches_csv)
    metadata_by_key = {
        key: _parse_saved_metadata(raw)
        for key, raw in zip(confirmed_matches_df['source_file'], confirmed_matches_df['matched_metadata'])
    }

    # Derive the lookup key once per distinct file, then broadcast it to the
    # ballots; this replaces a Python-level loop over every ballot.
    key_by_file = {
        name: '_'.join(name.split('_')[:2])
        for name in cast_vote_data['source_file'].unique()
    }
    ballot_keys = cast_vote_data['source_file'].map(key_by_file)

    metadata_columns = {}
    for column, field in (('State', 'State'), ('Location', 'Jurisdiction'), ('Year', 'Year')):
        values = ballot_keys.map({key: fields.get(field) for key, fields in metadata_by_key.items()})
        if column == 'Year':
            values = pd.to_numeric(values, errors='coerce')
        if column in cast_vote_data.columns:
            # Keep what is already there for ballots without a saved match
            values = values.where(values.notna(), cast_vote_data[column])
        metadata_columns[column] = values

    # assign() always creates the three columns, so the reordering below cannot
    # fail on missing columns when nothing matched.
    consolidated = cast_vote_data.assign(**metadata_columns)

    # Reordering the columns
    cols_to_order = ['source_file', 'type', 'State', 'Location', 'Year']
    remaining_columns = [col for col in consolidated.columns if col not in cols_to_order]
    return consolidated[cols_to_order + remaining_columns]

# Rebuild the State / Location / Year columns from the saved match file
cast_vote_data = consolidate_data_using_saved_matches(
    cast_vote_data,
    confirmed_matches_csv=f'{main_dir_path}/MatchedMetadata.csv',
)

In [None]:
# Show the top five rows of the table returned by consolidate_data_using_saved_matches
print(cast_vote_data.head(n=5))

In [None]:
from sklearn.manifold import MDS
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
import matplotlib.pyplot as plt
import numpy as np

# Select the ballots of a single election. `source_file` holds the file name
# with the ".csv" extension removed, so the name is spelled without it here;
# comparing against "....csv" selects no rows at all.
election_name = 'Burlington_03072023_CityCouncilCentralDistrict'
election_df = cast_vote_data[cast_vote_data['source_file'] == election_name].copy()
if election_df.empty:
    raise ValueError(f'No ballots found for election {election_name!r}')

# Only the rank* columns describe a ballot. Selecting them explicitly keeps
# text columns added by the metadata step (State / Location / Year) out of the
# numeric input.
rank_cols = [col for col in election_df.columns if col.startswith('rank')]

# Convert candidate names to numeric IDs for MDS.
# NOTE(review): the encoder is re-fit for every column, so the same candidate
# can receive different IDs in different rank columns - confirm this is intended.
le = LabelEncoder()
for col in rank_cols:
    # Handle missing values by replacing them with a placeholder
    election_df[col] = election_df[col].fillna('Missing')
    election_df[col] = le.fit_transform(election_df[col])

# Perform (non-metric) MDS; fixed seed so the embedding is reproducible
mds = MDS(n_components=2, metric=False, dissimilarity='euclidean', random_state=42)
mds_coordinates = mds.fit_transform(election_df[rank_cols])

# Perform KMeans clustering
kmeans = KMeans(n_clusters=2, random_state=42)  # assuming two clusters; adjust as needed
kmeans.fit(mds_coordinates)

# Compute the inter-centroid distance
centroid1, centroid2 = kmeans.cluster_centers_
inter_centroid_distance = euclidean(centroid1, centroid2)

# Plot the MDS coordinates and color code them by cluster label
plt.scatter(mds_coordinates[:, 0], mds_coordinates[:, 1], c=kmeans.labels_)
plt.title(f'MDS plot for election {election_name}')
plt.show()

print('Inter-centroid distance:', inter_centroid_distance)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
import matplotlib.pyplot as plt
import numpy as np

# Select the ballots of a single election. `source_file` holds the file name
# with the ".csv" extension removed, so the name is spelled without it here;
# comparing against "....csv" selects no rows at all.
election_name = 'Burlington_03072023_CityCouncilCentralDistrict'
election_df = cast_vote_data[cast_vote_data['source_file'] == election_name].copy()
if election_df.empty:
    raise ValueError(f'No ballots found for election {election_name!r}')

# Only the rank* columns describe a ballot. Selecting them explicitly keeps
# text columns added by the metadata step (State / Location / Year) out of the
# numeric input.
rank_cols = [col for col in election_df.columns if col.startswith('rank')]

# Convert candidate names to numeric IDs for PCA.
# NOTE(review): the encoder is re-fit for every column, so the same candidate
# can receive different IDs in different rank columns - confirm this is intended.
le = LabelEncoder()
for col in rank_cols:
    # Handle missing values by replacing them with a placeholder
    election_df[col] = election_df[col].fillna('Missing')
    election_df[col] = le.fit_transform(election_df[col])

# Perform PCA; the seed only matters if a randomized solver is selected
pca = PCA(n_components=2, random_state=42)
pca_coordinates = pca.fit_transform(election_df[rank_cols])

# Perform KMeans clustering (seeded so cluster labels are reproducible)
kmeans = KMeans(n_clusters=2, random_state=42)  # assuming two clusters; adjust as needed
kmeans.fit(pca_coordinates)

# Compute the inter-centroid distance
centroid1, centroid2 = kmeans.cluster_centers_
inter_centroid_distance = euclidean(centroid1, centroid2)

# Plot the PCA coordinates and color code them by cluster label
plt.scatter(pca_coordinates[:, 0], pca_coordinates[:, 1], c=kmeans.labels_)
plt.title(f'PCA plot for election {election_name}')
plt.show()

print('Inter-centroid distance:', inter_centroid_distance)

In [None]:
from sklearn.manifold import MDS
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial import distance
import numpy as np

inter_centroid_distances = []

# Number of elections to sample, capped at the number available: sampling
# without replacement fails when asked for more items than exist.
all_elections = cast_vote_data['source_file'].unique()
n_samples = min(50, len(all_elections))

# Randomly sample a subset of the elections (seeded so the subset is reproducible)
rng = np.random.default_rng(42)
elections_sample = rng.choice(all_elections, size=n_samples, replace=False)

# Only the rank* columns describe a ballot; text columns added by the metadata
# step (State / Location / Year) must not reach MDS.
rank_cols = [col for col in cast_vote_data.columns if col.startswith('rank')]

# Iterate over the sampled elections
for election in elections_sample:
    # Ballots of the current election, rank columns only
    election_df = cast_vote_data.loc[cast_vote_data['source_file'] == election, rank_cols].copy()

    # Convert candidate names to numeric IDs for MDS.
    # NOTE(review): the encoder is re-fit per column, so IDs are not shared
    # across rank columns - confirm this is intended.
    le = LabelEncoder()
    for col in rank_cols:
        # Handle missing values by replacing them with a placeholder
        election_df[col] = election_df[col].fillna('Missing')
        election_df[col] = le.fit_transform(election_df[col])

    # Perform MDS
    mds = MDS(n_components=2, random_state=42)
    mds_coordinates = mds.fit_transform(election_df)

    # Perform KMeans clustering on THIS election's MDS embedding (not on
    # `pca_coordinates`, which is a leftover from a different cell).
    kmeans = KMeans(n_clusters=2, random_state=42)  # assuming two clusters; adjust as needed
    kmeans.fit(mds_coordinates)

    # Compute the inter-centroid distance and store it
    centroid1, centroid2 = kmeans.cluster_centers_
    inter_centroid_distance = distance.euclidean(centroid1, centroid2)
    inter_centroid_distances.append(inter_centroid_distance)

    # Plot the MDS coordinates and color code them by cluster label
    plt.scatter(mds_coordinates[:, 0], mds_coordinates[:, 1], c=kmeans.labels_)
    plt.title(f'MDS plot for election {election}')
    plt.show()

# List of inter-centroid distances for the sampled elections, convert it to a pandas Series for easier manipulation
inter_centroid_distances = pd.Series(inter_centroid_distances, index=elections_sample, dtype=float)

# Order the elections by number of ballots cast (fewest first). The order is
# computed here so this cell does not depend on a cell that appears later in
# the notebook. Elections outside the sample show up as missing values.
election_order = cast_vote_data['source_file'].value_counts().sort_values().index
inter_centroid_distances = inter_centroid_distances.reindex(election_order)

# Plot the inter-centroid distances
inter_centroid_distances.plot(kind='barh', figsize=(10, 15))
plt.xlabel('Inter-centroid distance')
plt.ylabel('Election')
plt.title('Inter-centroid distances for each election')
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial import distance
import numpy as np

inter_centroid_distances = []

# Only the rank* columns describe a ballot; text columns added by the metadata
# step (State / Location / Year) must not reach PCA.
rank_cols = [col for col in cast_vote_data.columns if col.startswith('rank')]

# Iterate over all elections. groupby splits the table in one pass instead of
# scanning every ballot once per election; sort=False keeps first-appearance order.
for election, election_ballots in cast_vote_data.groupby('source_file', sort=False)[rank_cols]:
    election_df = election_ballots.copy()

    # Convert candidate names to numeric IDs for PCA.
    # NOTE(review): the encoder is re-fit per column, so IDs are not shared
    # across rank columns - confirm this is intended.
    le = LabelEncoder()
    for col in rank_cols:
        # Handle missing values by replacing them with a placeholder
        election_df[col] = election_df[col].fillna('Missing')
        election_df[col] = le.fit_transform(election_df[col])

    # Perform PCA; the seed only matters if a randomized solver is selected
    pca = PCA(n_components=2, random_state=42)
    pca_coordinates = pca.fit_transform(election_df)

    # Perform KMeans clustering (seeded so results are reproducible)
    kmeans = KMeans(n_clusters=2, random_state=42)  # assuming two clusters; adjust as needed
    kmeans.fit(pca_coordinates)

    # Compute the inter-centroid distance and store it
    centroid1, centroid2 = kmeans.cluster_centers_
    inter_centroid_distance = distance.euclidean(centroid1, centroid2)
    inter_centroid_distances.append((election, inter_centroid_distance))

# Create a DataFrame with inter-centroid distances for all elections
inter_centroid_df = pd.DataFrame(
    inter_centroid_distances, columns=['Election', 'Inter-centroid Distance']
).set_index('Election')

# Sort the DataFrame by inter-centroid distance in descending order
sorted_inter_centroid_df = inter_centroid_df.sort_values(by='Inter-centroid Distance', ascending=False)

# Print the DataFrame
print(sorted_inter_centroid_df)

# Plot the inter-centroid distances
sorted_inter_centroid_df.plot(kind='barh', figsize=(10, 15), legend=False)
plt.xlabel('Inter-centroid Distance')
plt.ylabel('Election')
plt.title('Inter-centroid Distances for Each Election')
plt.show()

In [None]:
# Write one line per election, in the order of the sorted table
for election_name, centroid_gap in sorted_inter_centroid_df['Inter-centroid Distance'].items():
    print(f"Election: {election_name}, Inter-centroid Distance: {centroid_gap}")

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
import matplotlib.pyplot as plt
import numpy as np

# Select the ballots of a single election. `source_file` holds the file name
# with the ".csv" extension removed, so the name is spelled without it here;
# comparing against "....csv" selects no rows at all.
election_name = 'TakomaPark_11082022_Mayor'
election_df = cast_vote_data[cast_vote_data['source_file'] == election_name].copy()
if election_df.empty:
    raise ValueError(f'No ballots found for election {election_name!r}')

# Only the rank* columns describe a ballot. Selecting them explicitly keeps
# text columns added by the metadata step (State / Location / Year) out of the
# numeric input.
rank_cols = [col for col in election_df.columns if col.startswith('rank')]

# Convert candidate names to numeric IDs for PCA.
# NOTE(review): the encoder is re-fit for every column, so the same candidate
# can receive different IDs in different rank columns - confirm this is intended.
le = LabelEncoder()
for col in rank_cols:
    # Handle missing values by replacing them with a placeholder
    election_df[col] = election_df[col].fillna('Missing')
    election_df[col] = le.fit_transform(election_df[col])

# Perform PCA; the seed only matters if a randomized solver is selected
pca = PCA(n_components=2, random_state=42)
pca_coordinates = pca.fit_transform(election_df[rank_cols])

# Perform KMeans clustering (seeded so cluster labels are reproducible)
kmeans = KMeans(n_clusters=2, random_state=42)  # assuming two clusters; adjust as needed
kmeans.fit(pca_coordinates)

# Compute the inter-centroid distance
centroid1, centroid2 = kmeans.cluster_centers_
inter_centroid_distance = euclidean(centroid1, centroid2)

# Plot the PCA coordinates and color code them by cluster label
plt.scatter(pca_coordinates[:, 0], pca_coordinates[:, 1], c=kmeans.labels_)
plt.title(f'PCA plot for election {election_name}')
plt.show()

print('Inter-centroid distance:', inter_centroid_distance)

In [None]:
from sklearn.manifold import MDS
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
import matplotlib.pyplot as plt
import numpy as np

# Select the ballots of a single election. `source_file` holds the file name
# with the ".csv" extension removed, so the name is spelled without it here;
# comparing against "....csv" selects no rows at all.
election_name = 'TakomaPark_11082022_Mayor'
election_df = cast_vote_data[cast_vote_data['source_file'] == election_name].copy()
if election_df.empty:
    raise ValueError(f'No ballots found for election {election_name!r}')

# Only the rank* columns describe a ballot. Selecting them explicitly keeps
# text columns added by the metadata step (State / Location / Year) out of the
# numeric input.
rank_cols = [col for col in election_df.columns if col.startswith('rank')]

# Convert candidate names to numeric IDs for MDS.
# NOTE(review): the encoder is re-fit for every column, so the same candidate
# can receive different IDs in different rank columns - confirm this is intended.
le = LabelEncoder()
for col in rank_cols:
    # Handle missing values by replacing them with a placeholder
    election_df[col] = election_df[col].fillna('Missing')
    election_df[col] = le.fit_transform(election_df[col])

# Perform MDS; fixed seed so the embedding is reproducible
mds = MDS(n_components=2, random_state=42)
mds_coordinates = mds.fit_transform(election_df[rank_cols])

# Perform KMeans clustering
kmeans = KMeans(n_clusters=2, random_state=42)  # assuming two clusters; adjust as needed
kmeans.fit(mds_coordinates)

# Compute the inter-centroid distance
centroid1, centroid2 = kmeans.cluster_centers_
inter_centroid_distance = euclidean(centroid1, centroid2)

# Plot the MDS coordinates and color code them by cluster label
plt.scatter(mds_coordinates[:, 0], mds_coordinates[:, 1], c=kmeans.labels_)
plt.title(f'MDS plot for election {election_name}')
plt.show()

print('Inter-centroid distance:', inter_centroid_distance)

In [None]:
import matplotlib.pyplot as plt

# Tally the ballots of every election and order the tally from fewest to most
ballots_per_election = cast_vote_data['source_file'].value_counts().sort_values(ascending=True)

# Horizontal bar chart of the tally, one bar per election
plt.figure(figsize=(10, 15))
ballots_per_election.plot.barh()
plt.xlabel('Number of ballots cast')
plt.ylabel('Election')
plt.title('Number of ballots cast in each election')
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.spatial import distance
import numpy as np

inter_centroid_distances = []

# Number of elections to sample, capped at the number available: sampling
# without replacement fails when asked for more items than exist.
all_elections = cast_vote_data['source_file'].unique()
n_samples = min(50, len(all_elections))

# Randomly sample a subset of the elections (seeded so the subset is reproducible)
rng = np.random.default_rng(42)
elections_sample = rng.choice(all_elections, size=n_samples, replace=False)

# Only the rank* columns describe a ballot; text columns added by the metadata
# step (State / Location / Year) must not reach PCA.
rank_cols = [col for col in cast_vote_data.columns if col.startswith('rank')]

# Iterate over the sampled elections
for election in elections_sample:
    # Ballots of the current election, rank columns only
    election_df = cast_vote_data.loc[cast_vote_data['source_file'] == election, rank_cols].copy()

    # Convert candidate names to numeric IDs for PCA.
    # NOTE(review): the encoder is re-fit per column, so IDs are not shared
    # across rank columns - confirm this is intended.
    le = LabelEncoder()
    for col in rank_cols:
        # Handle missing values by replacing them with a placeholder
        election_df[col] = election_df[col].fillna('Missing')
        election_df[col] = le.fit_transform(election_df[col])

    # Perform PCA; the seed only matters if a randomized solver is selected
    pca = PCA(n_components=2, random_state=42)
    pca_coordinates = pca.fit_transform(election_df)

    # Perform KMeans clustering (seeded so results are reproducible)
    kmeans = KMeans(n_clusters=2, random_state=42)  # assuming two clusters; adjust as needed
    kmeans.fit(pca_coordinates)

    # Compute the inter-centroid distance and store it
    centroid1, centroid2 = kmeans.cluster_centers_
    inter_centroid_distance = distance.euclidean(centroid1, centroid2)
    inter_centroid_distances.append(inter_centroid_distance)

    # Plot the PCA coordinates and color code them by cluster label
    plt.scatter(pca_coordinates[:, 0], pca_coordinates[:, 1], c=kmeans.labels_)
    plt.title(f'PCA plot for election {election}')
    plt.show()

# List of inter-centroid distances for the sampled elections, convert it to a pandas Series for easier manipulation
inter_centroid_distances = pd.Series(inter_centroid_distances, index=elections_sample, dtype=float)

# Reindex the inter_centroid_distances Series to match the order of the ballots_per_election Series.
# Elections outside the sample show up as missing values.
inter_centroid_distances = inter_centroid_distances.reindex(ballots_per_election.index)

# Plot the inter-centroid distances
inter_centroid_distances.plot(kind='barh', figsize=(10, 15))
plt.xlabel('Inter-centroid distance')
plt.ylabel('Election')
plt.title('Inter-centroid distances for each election')
plt.show()