In [2]:
# gubernatorial
# senate


# correlation table on the bimodality measures

In [3]:
# bimodality & gamma and serve the brutons!@#$%^&*()_+ 
#(new york election, maybe use a plotting method to inspect individual points more easily [seaborn?])
# code "problem" elections, maybe sort across and see if there are any patterns
# get a rule that culls elections and only plot problem elections
# 

In [4]:
import sys
import os

# Add the parent directory to the sys.path
sys.path.append(os.path.join(os.getcwd(), '..'))

In [5]:
import seaborn as sns
import numpy as np

from scipy.stats import kurtosis, skew, gaussian_kde
from scipy.signal import argrelextrema

from rcv_learning.rcv_distribution import *
from rcv_learning.rcv_dimensionality import *

In [6]:
def compute_ecdf(data):
    """Return the empirical cumulative distribution function (ECDF) of ``data``.

    Returns a pair ``(values, probabilities)``: the observations in ascending
    order and, aligned with them, the cumulative proportions
    ``1/n, 2/n, ..., 1`` where ``n = len(data)``.
    """
    ordered = np.sort(data)
    count = len(data)
    # The i-th smallest observation (1-based) has cumulative proportion i / count.
    ranks = np.arange(1, count + 1)
    return ordered, ranks / count

def dip_statistic(data):
    """Compute Hartigan's dip statistic for a one-dimensional sample.

    The dip is the smallest achievable maximum distance between the empirical
    distribution function of ``data`` and any unimodal distribution function.
    It is found by iterating over the greatest convex minorant (GCM) and least
    concave majorant (LCM) of the empirical CDF, shrinking the candidate modal
    interval ``[low, high]`` until it stops changing (Hartigan & Hartigan, 1985).

    Parameters
    ----------
    data : array-like
        Observations; flattened and sorted internally. Ties are allowed.

    Returns
    -------
    float
        The dip statistic. It is never smaller than ``1 / (2 * n)``; a single
        observation or an all-identical sample returns exactly that value.
        Only the statistic is returned, no p-value.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    x = np.sort(np.asarray(data, dtype=float).ravel())
    n = x.size
    if n == 0:
        raise ValueError("dip_statistic requires at least one observation")
    # Degenerate samples are trivially unimodal: return the minimum dip value.
    if n < 2 or x[0] == x[-1]:
        return 1.0 / (2 * n)

    # 1-based copy (index 0 is padding) so the index arithmetic below matches
    # the classic formulation of the algorithm.
    xs = np.concatenate(([x[0]], x))

    # mn[j]: index of the previous vertex of the convex minorant that ends at j.
    mn = [0] * (n + 1)
    mn[1] = 1
    for j in range(2, n + 1):
        mn[j] = j - 1
        while True:
            mnj = mn[j]
            mnmnj = mn[mnj]
            if mnj == 1 or (xs[j] - xs[mnj]) * (mnj - mnmnj) < (xs[mnj] - xs[mnmnj]) * (j - mnj):
                break
            mn[j] = mnmnj

    # mj[k]: index of the next vertex of the concave majorant that starts at k.
    mj = [0] * (n + 1)
    mj[n] = n
    for k in range(n - 1, 0, -1):
        mj[k] = k + 1
        while True:
            mjk = mj[k]
            mjmjk = mj[mjk]
            if mjk == n or (xs[k] - xs[mjk]) * (mjk - mjmjk) < (xs[mjk] - xs[mjmjk]) * (k - mjk):
                break
            mj[k] = mjmjk

    low, high = 1, n
    # Work in units of 1 / (2n) and divide once at the end; 1.0 is the minimum.
    dip = 1.0
    gcm = [0] * (n + 2)
    lcm = [0] * (n + 2)

    # With tied observations a 0/0 can occur below; it yields NaN, which fails
    # the ``dx >= d`` comparison and is therefore skipped, so silence the warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        while True:
            # GCM change points, walking from `high` down to `low`.
            gcm[1] = high
            i = 1
            while gcm[i] > low:
                gcm[i + 1] = mn[gcm[i]]
                i += 1
            ig = l_gcm = i
            ix = ig - 1

            # LCM change points, walking from `low` up to `high`.
            lcm[1] = low
            i = 1
            while lcm[i] < high:
                lcm[i + 1] = mj[lcm[i]]
                i += 1
            ih = l_lcm = i
            iv = 2

            # Largest vertical gap between GCM and LCM at any change point.
            if l_gcm != 2 or l_lcm != 2:
                d = 0.0
                while True:
                    gcmix, lcmiv = gcm[ix], lcm[iv]
                    if gcmix > lcmiv:
                        # Next change point belongs to the LCM.
                        gcmi1 = gcm[ix + 1]
                        dx = (lcmiv - gcmi1 + 1) - (
                            (xs[lcmiv] - xs[gcmi1]) * (gcmix - gcmi1) / (xs[gcmix] - xs[gcmi1])
                        )
                        iv += 1
                        if dx >= d:
                            d = dx
                            ig = ix + 1
                            ih = iv - 1
                    else:
                        # Next change point belongs to the GCM.
                        lcmiv1 = lcm[iv - 1]
                        dx = (
                            (xs[gcmix] - xs[lcmiv1]) * (lcmiv - lcmiv1) / (xs[lcmiv] - xs[lcmiv1])
                        ) - (gcmix - lcmiv1 - 1)
                        ix -= 1
                        if dx >= d:
                            d = dx
                            ig = ix + 1
                            ih = iv
                    ix = max(ix, 1)
                    iv = min(iv, l_lcm)
                    if gcm[ix] == lcm[iv]:
                        break
            else:
                d = 1.0

            if d < dip:
                break

            # Largest deviation of the ECDF from the GCM left of the modal interval.
            dip_l = 0.0
            for j in range(ig, l_gcm):
                max_t = 1.0
                jb, je = gcm[j + 1], gcm[j]
                if je - jb > 1 and xs[je] != xs[jb]:
                    slope = (je - jb) / (xs[je] - xs[jb])
                    steps = np.arange(jb, je + 1)
                    t = (steps - jb + 1) - (xs[jb:je + 1] - xs[jb]) * slope
                    max_t = max(max_t, float(t.max()))
                dip_l = max(dip_l, max_t)

            # Largest deviation of the ECDF from the LCM right of the modal interval.
            dip_u = 0.0
            for j in range(ih, l_lcm):
                max_t = 1.0
                jb, je = lcm[j], lcm[j + 1]
                if je - jb > 1 and xs[je] != xs[jb]:
                    slope = (je - jb) / (xs[je] - xs[jb])
                    steps = np.arange(jb, je + 1)
                    t = (xs[jb:je + 1] - xs[jb]) * slope - (steps - jb - 1)
                    max_t = max(max_t, float(t.max()))
                dip_u = max(dip_u, max_t)

            dip = max(dip, dip_l, dip_u)

            new_low, new_high = gcm[ig], lcm[ih]
            # Stop when the modal interval no longer shrinks; the second test is
            # a defensive guard against a collapsed interval.
            if (new_low == low and new_high == high) or new_low >= new_high:
                break
            low, high = new_low, new_high

    return float(dip) / (2 * n)

In [53]:
# Election file to analyse
csv = "../rcv_elections_database/single/Wyoming_04172020_PRESIDENTOFTHEUNITEDSTATES.csv"

# Figures are written under plots/, named after the CSV file, when `save` is True
save = True
filename = csv.rsplit("/", 1)[-1]
filename_prefix = f"plots/{filename}"

# Run the RCV analysis; the result is a 5-tuple that is unpacked below
test = perform_rcv_analysis(csv, n_runs=1000)
(
    mds_1d_coordinates,
    mds_2d_coordinates,
    most_common_order,
    order_frequencies,
    candidate_names,
) = test

# Report the normalized distances between candidates, then draw the MDS plots
normalized_distances = get_distances_normalized(
    most_common_order,
    mds_1d_coordinates,
    candidate_names,
)
print("Normalized distances:", normalized_distances)
plot_rcv_analysis(
    mds_1d_coordinates,
    mds_2d_coordinates,
    most_common_order,
    order_frequencies,
    candidate_names,
    save=save,
    filename=filename_prefix,
)

# Consistency points feed the bimodality analysis that follows
points = get_consistency_points(csv)
print("Consistency points:", points)

# Expand the {value: count} mapping in `points` into one array entry per count
data_points = []
for key, value in points.items():
    data_points.extend([key] * value)

# Convert to numpy array
data_points = np.array(data_points)

# Skewness and kurtosis (scipy defaults)
g = skew(data_points)
k = kurtosis(data_points)

# Evaluate a Gaussian KDE on a 1000-point grid spanning the data (no plotting here)
density = gaussian_kde(data_points)
x_vals = np.linspace(min(data_points), max(data_points), 1000)
y_vals = density(x_vals)

# Strict local maxima of the KDE curve and their x-positions
maxima_indices = argrelextrema(y_vals, np.greater)
modes = x_vals[maxima_indices]

# Placeholder stored in the mode-based measures when the KDE has fewer than two
# local maxima. It stays numeric because the plot text boxes format these
# values with ".2f" / ".10f".
# NOTE(review): 404 can be mistaken for a real measurement on the figure; NaN
# would be clearer, but later cells may rely on this value - confirm before changing.
NO_SECOND_MODE = 404

if len(modes) >= 2:
    # NOTE(review): this takes the first two maxima along the grid (lowest x),
    # not necessarily the two tallest peaks - confirm that is intended.
    mode1, mode2 = modes[:2]

    # KDE heights at the two modes
    amp1 = density(mode1)[0]
    amp2 = density(mode2)[0]

    # Amplitude ratio with the higher amplitude as denominator
    if amp1 > amp2:
        amplitude_ratio = amp2 / amp1
    else:
        amplitude_ratio = amp1 / amp2

    # x-axis distance between the two modes
    mode_distance = abs(mode2 - mode1)

    # Split the data by which mode each point is closer to
    # (points exactly equidistant from both modes fall in neither group)
    data_group1 = [point for point in data_points if abs(point - mode1) < abs(point - mode2)]
    data_group2 = [point for point in data_points if abs(point - mode1) > abs(point - mode2)]

    # Mean and variance of each group
    mu1, sigma1_sq = np.mean(data_group1), np.var(data_group1)
    mu2, sigma2_sq = np.mean(data_group2), np.var(data_group2)

    # Ashman's D statistic (D > 2)
    ashmans_D = abs(mu1 - mu2) / np.sqrt((sigma1_sq + sigma2_sq) / 2)
else:
    # Fewer than two modes: the mode-based measures are undefined. Handle this
    # explicitly instead of relying on an exception from arithmetic on None.
    mode1, mode2 = None, None
    mode_distance = NO_SECOND_MODE
    amplitude_ratio = NO_SECOND_MODE
    ashmans_D = NO_SECOND_MODE

# Sarle's bimodality coefficient b (b > 5/9)
n = len(data_points)
sarle = (g**2 + 1) / (k + 3 * (n-1)**2 / ((n-2) * (n-3)))

# Hartigan's dip statistic (statistic only; no p-value is computed here)
hartigan_dip = dip_statistic(data_points)

# Flatten the {value: count} mapping into one list entry per count for plotting
data_list = [position for position, copies in points.items() for _ in range(copies)]

# Names and normalized positions from `normalized_distances`, used as x-axis ticks
normalized_names = []
normalized_points = []
for name in normalized_distances:
    normalized_names += [name]
    normalized_points += [normalized_distances[name]]


# NOTE(review): `plt` is not imported in the visible cells; presumably it comes
# from the star imports of rcv_learning - confirm.
def _decorate_axes(title):
    """Apply the shared title, tick labels, axis labels and grid to the current figure."""
    plt.title(title)
    plt.xticks(normalized_points, normalized_names, rotation=45)
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.grid(True)


def _add_text_box(text):
    """Draw `text` in a rounded, semi-transparent white box at the top-left of the current axes."""
    plt.text(
        0.05,
        0.95,
        text,
        transform=plt.gca().transAxes,
        fontsize=10,
        verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='white', alpha=0.5),
    )


def _save_or_show(path):
    """Save the current figure to `path` and close it when `save` is truthy; otherwise show it."""
    if save:
        plt.savefig(path, bbox_inches='tight')
        plt.close()
    else:
        plt.show()


# Histogram of the data, annotated with the shape measures
plt.figure(figsize=(10, 6))
plt.hist(data_list, bins=50, density=True, alpha=0.7)
_decorate_axes('Histogram of Data')
text_str = f"Skewness: {g:.2f}\nKurtosis: {k:.2f}\nMode Distance: {mode_distance:.2f}"
_add_text_box(text_str)
_save_or_show(f"{filename_prefix}_hist.png")

# Kernel density estimate of the data, annotated with the bimodality measures
plt.figure(figsize=(10, 6))
sns.kdeplot(data_list, fill=True)
_decorate_axes('Kernel Density Estimation of Data')
text_str = f"Ashman's D: {ashmans_D:.10f}\nSarle's Coefficient: {sarle:.10f}\nHartigan's Dip: {hartigan_dip:.10f}\nAmplitude Ratio: {amplitude_ratio:.10f}"
_add_text_box(text_str)
_save_or_show(f"{filename_prefix}_kde.png")

  distance = 1 / np.sqrt(freq_upper_triangle)


Normalized distances: {'Tom Steyer': 0.0, 'Pete Buttigieg': 1.6495163108636355, 'Michael R. Bloomberg': 2.8359621738191514, 'Amy Klobuchar': 3.1870559040940774, 'Elizabeth Warren': 3.594452193656948, 'Bernie Sanders': 4.067825929677619, 'Joseph R. Biden': 4.138892437653406, '(undeclared)': 6.057519997888765, 'Tulsi Gabbard': 8.0}


  distance = 1 / np.sqrt(freq_upper_triangle)


Consistency points: {4.260463276280078: 636, 4.024807137358229: 2402, 3.9462550910509457: 3270, 3.682402918005447: 46, 3.980831775183979: 205, 3.76095496431273: 25, 6.057519997888763: 107, 3.1135354031270994: 10, 8.0: 32, 1.6495163108636353: 8, 5.195347457210058: 60, 3.92990723898862: 58, 3.8860205227042544: 75, 3.907963880846437: 2, 4.210219144714901: 3, 3.7216789411590883: 53, 4.082049035490957: 9, 3.631478381810088: 6, 3.9553695070862993: 15, 4.959691318288209: 26, 2.128801107540854: 11, 3.1920874494343825: 6, 3.8338110144117175: 9, 0.9865637727627364: 4, 3.673104074474034: 3, 4.7097274566822485: 56, 2.223701005910463: 21, 2.835962173819151: 13, 3.3768557008332944: 8, 2.2629770290641043: 8, 3.904671927542749: 2, 4.8689790865330025: 4, 2.076630874307236: 1, 2.179725643736213: 7, 4.4740713177604: 58, 3.052121619312311: 1, 3.1846632516794893: 5, 3.8118676562695346: 4, 2.8806638920474357: 1, 2.168077130694496: 3, 2.12645143803055: 1, 3.612324828136743: 5, 4.017974579381618: 4, 3.5944521