In [4]:
import sys
import os

# Make the repository root (the parent of the notebook's working directory)
# importable so that the project packages resolve.
# The path is normalised and only appended when missing, so re-running this
# cell does not keep stacking duplicate entries onto sys.path.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [10]:
import csv
import glob

import numpy as np
import pandas as pd
import seaborn as sns

from scipy.signal import argrelextrema
# gaussian_kde is taken from the public scipy.stats namespace: the
# scipy.stats.kde module path is deprecated and emits a DeprecationWarning.
from scipy.stats import gaussian_kde, kurtosis, rankdata, skew

# NOTE(review): the star imports hide which names come from which project
# module (get_consistency_points is used further down); consider importing
# the needed names explicitly once their home module is confirmed.
from rcv_learning.rcv_distribution import *
from rcv_learning.rcv_dimensionality import *

  from scipy.stats.kde import gaussian_kde


In [12]:
# Check the edge case for Sarle's coefficient against Ashman's D.

# Compare likelihoods for the unimodal vs. multimodal, bimodal ratio (make a pretty histogram).

In [13]:
def compute_ecdf(data):
    """Return the sorted sample and the matching ECDF heights.

    Parameters
    ----------
    data : array-like
        Observations; they are sorted with ``np.sort`` before being returned.

    Returns
    -------
    tuple
        ``(ordered, heights)`` where ``ordered`` is the sorted sample and
        ``heights[i]`` is ``(i + 1) / len(data)``.
    """
    size = len(data)
    ordered = np.sort(data)
    # The i-th smallest observation (1-based) sits at cumulative height i / size.
    heights = np.arange(1, size + 1) / size
    return ordered, heights

def dip_statistic(data):
    """Return a value labelled as Hartigan's dip statistic for ``data``.

    NOTE(review): as written, the result does not depend on the values in
    ``data``, only on its length ``n``. ``U`` is the same array that
    ``compute_ecdf`` returns as the ECDF, and ``D`` is the constant ``1/n``,
    so ``up_dip`` is all zeros, ``low_dip`` is ``(i - 1) / n`` and the
    function returns ``(n - 1) / n`` (up to floating-point rounding).
    It therefore cannot separate unimodal from multimodal samples and should
    be replaced by a real dip implementation before the stored values are
    interpreted. TODO confirm and replace.

    Parameters
    ----------
    data : array-like
        Sample values; sorted internally.

    Returns
    -------
    float
        ``np.max`` over the two per-point difference columns (see note above).

    Raises
    ------
    ValueError
        Presumably for empty input, where ``np.max`` is applied to a
        zero-size array -- TODO confirm.
    """
    data = np.sort(data)
    n = len(data)
    
    # Empirical CDF heights i/n for i = 1..n; the sorted values ``x`` are not
    # used again below.
    x, ecdf = compute_ecdf(data)
    
    # U is i/n (identical to ``ecdf``); D is i/n - (i-1)/n, i.e. the constant
    # step height 1/n for every element.
    U = np.arange(1, n+1) / n
    D = U - np.arange(0, n) / n
    
    # low_dip is ecdf minus one step, i.e. (i-1)/n; up_dip is U - ecdf, which
    # is zero everywhere because the two arrays are the same.
    # NOTE(review): no unimodal reference distribution is fitted here.
    low_dip = (ecdf - D).clip(min=0)
    up_dip = (U - ecdf).clip(min=0)
    both_dips = np.column_stack((low_dip, up_dip))
    
    # Largest entry across both columns; with the arrays above this is the
    # last element of low_dip, (n-1)/n.
    dip = np.max(both_dips)
    
    return dip

In [14]:
# Compute bimodality statistics (Ashman's D, Sarle's coefficient, dip value)
# for every election CSV that has a matching row in election_table.csv, and
# write the augmented table to election_table_2.csv.

# Result columns written for each matched election.
STAT_COLUMNS = ('Ashman_D', 'Sarle_coefficient', 'Hartigan_Dip')
# Number of evaluation points used when scanning the KDE for local maxima.
KDE_GRID_SIZE = 1000


def _expand_points(points):
    """Expand a ``{value: count}`` mapping into a flat array of samples.

    Each key is repeated ``count`` times, mirroring a weighted sample.
    """
    samples = []
    for value, count in points.items():
        samples.extend([value] * count)
    return np.array(samples)


def _dominant_modes(samples, grid_size=KDE_GRID_SIZE):
    """Return the positions of the two tallest KDE peaks, in ascending order.

    Returns ``None`` when the density has fewer than two interior local
    maxima, or when the sample has fewer than two distinct values (a KDE
    cannot be fitted to a zero-variance sample).
    """
    if np.unique(samples).size < 2:
        return None

    density = gaussian_kde(samples)
    grid = np.linspace(samples.min(), samples.max(), grid_size)
    heights = density(grid)

    peak_idx = argrelextrema(heights, np.greater)[0]
    if peak_idx.size < 2:
        return None

    # Use the two highest peaks rather than the two left-most ones, so small
    # ripples at the low end of the range do not displace a dominant mode.
    tallest = peak_idx[np.argsort(heights[peak_idx])[-2:]]
    lower, upper = np.sort(grid[tallest])
    return lower, upper


def _ashman_d(samples, mode1, mode2):
    """Ashman's D for ``samples`` split by proximity to ``mode1``/``mode2``.

    Points exactly equidistant from both modes are assigned to the first
    group so that every observation contributes to the statistic.
    """
    nearer_first = np.abs(samples - mode1) <= np.abs(samples - mode2)
    group1 = samples[nearer_first]
    group2 = samples[~nearer_first]

    mu1, sigma1_sq = np.mean(group1), np.var(group1)
    mu2, sigma2_sq = np.mean(group2), np.var(group2)
    return abs(mu1 - mu2) / np.sqrt((sigma1_sq + sigma2_sq) / 2)


def _sarle_coefficient(samples):
    """Sarle's finite-sample bimodality coefficient (bimodal if b > 5/9).

    Returns NaN for fewer than four observations, where the finite-sample
    correction term divides by zero.
    """
    n = len(samples)
    if n < 4:
        return np.nan

    # The finite-sample formula is defined on the bias-corrected sample
    # skewness and sample excess kurtosis, hence bias=False.
    g = skew(samples, bias=False)
    k = kurtosis(samples, bias=False)
    return (g**2 + 1) / (k + 3 * (n - 1)**2 / ((n - 2) * (n - 3)))


# Load the existing CSV into a DataFrame
df = pd.read_csv('election_table.csv')

# Make sure the result columns exist even if no file ends up being processed.
for column in STAT_COLUMNS:
    if column not in df.columns:
        df[column] = np.nan

# Search for CSV files in the directory and its subdirectories
csv_files = glob.glob("../rcv_elections_database/**/*.csv", recursive=True)

for file_path in csv_files:

    # Get the filename only (portable across path separators)
    filename = os.path.basename(file_path)
    print(file_path)

    # Locate the row for the current file
    row_indices = df.index[df['filename'] == filename]
    if len(row_indices) == 0:
        print(f"No matching row found for file: {filename}")
        continue  # Skip to the next iteration

    row_index = row_indices[0]
    print(row_index)

    # Best-effort per file: a failure in one election is reported and the
    # loop moves on to the next file.
    try:
        # Get the consistency points for the election and expand them into
        # one observation per counted point.
        points = get_consistency_points(file_path)
        data_points = _expand_points(points)

        if data_points.size == 0:
            print(f"No consistency points found for file: {filename}")
            continue

        # Ashman's D (D > 2 suggests clean separation) needs two modes; it is
        # recorded as NaN for unimodal samples instead of aborting the file,
        # so the remaining statistics are still stored.
        modes = _dominant_modes(data_points)
        if modes is None:
            ashmans_D = np.nan
        else:
            ashmans_D = _ashman_d(data_points, *modes)

        # Sarle's bimodality coefficient b (b > 5/9)
        sarle = _sarle_coefficient(data_points)

        # Dip value from dip_statistic, defined in an earlier cell
        hartigan_dip = dip_statistic(data_points)

        # Store the statistics on the row for the current file
        df.at[row_index, 'Ashman_D'] = ashmans_D
        df.at[row_index, 'Sarle_coefficient'] = sarle
        df.at[row_index, 'Hartigan_Dip'] = hartigan_dip

    except Exception as e:
        print("Error in file: {}".format(file_path))
        print(e)

# Save the updated DataFrame back to CSV
df.to_csv('election_table_2.csv', index=False)

../rcv_elections_database/CandidateDetails.csv
No matching row found for file: CandidateDetails.csv
../rcv_elections_database/SingleWinnerRCV.csv
No matching row found for file: SingleWinnerRCV.csv
../rcv_elections_database/MatchedElections.csv
No matching row found for file: MatchedElections.csv
../rcv_elections_database/SequentialRCV.csv
No matching row found for file: SequentialRCV.csv
../rcv_elections_database/ProportionalRCV.csv
No matching row found for file: ProportionalRCV.csv
../rcv_elections_database/proportional/Minneapolis_11072017_BoardofEstimateandTaxation.csv
No matching row found for file: Minneapolis_11072017_BoardofEstimateandTaxation.csv
../rcv_elections_database/proportional/Minneapolis_11072017_ParkBoardAtLarge.csv
No matching row found for file: Minneapolis_11072017_ParkBoardAtLarge.csv
../rcv_elections_database/proportional/Cambridge_11082011_CityCouncil.csv
No matching row found for file: Cambridge_11082011_CityCouncil.csv
../rcv_elections_database/proportional/

  distance = 1 / np.sqrt(freq_upper_triangle)


../rcv_elections_database/proportional/Cambridge_11052013_CityCouncil.csv
No matching row found for file: Cambridge_11052013_CityCouncil.csv
../rcv_elections_database/proportional/Minneapolis_11062009_MinneapolisParkRecBoard.csv
No matching row found for file: Minneapolis_11062009_MinneapolisParkRecBoard.csv
../rcv_elections_database/proportional/Cambridge_11042003_SchoolCommittee.csv
No matching row found for file: Cambridge_11042003_SchoolCommittee.csv
../rcv_elections_database/proportional/Cambridge_11072017_SchoolCommittee.csv
No matching row found for file: Cambridge_11072017_SchoolCommittee.csv
../rcv_elections_database/proportional/Cambridge_11032015_CityCouncil.csv
No matching row found for file: Cambridge_11032015_CityCouncil.csv
../rcv_elections_database/proportional/Cambridge_11082005_CityCouncil.csv
No matching row found for file: Cambridge_11082005_CityCouncil.csv
../rcv_elections_database/proportional/Minneapolis 2013-board of estimation and taxation cvr.csv
No matching r

  distance = 1 / np.sqrt(freq_upper_triangle)


../rcv_elections_database/classic/Alaska_08162022_HouseofRepresentativesSpecial.csv
1


  distance = 1 / np.sqrt(freq_upper_triangle)


KeyboardInterrupt: 