## Plot 1 - Sum of Std Dev vs. Centroid Distance

In [1]:
# NOTE(review): presumably the directory holding the raw election files; it is
# not referenced by any of the visible cells.
ELECTION_DIR = 'scot-elex-main'
# Path of the pickle that is read into the `clusterings` DataFrame.
CLUSTERINGS = '2_and_3_clusterings.pkl'

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

from Clustering_Functions import Borda_vector, HH_proxy

In [2]:
# Clusterings for all 7-candidate elections.
# NOTE(review): unpickling can execute arbitrary code, so CLUSTERINGS must come
# from a trusted source.
clusterings = pd.read_pickle(CLUSTERINGS)

In [6]:
# Keep only the rows with num_clusters == 2; the plotting cells index clusters
# and centroids with the labels 0 and 1.
two_clusterings = clusterings[clusterings.num_clusters == 2]
# Values of the `method` column that the plotting loops iterate over.
methods = ['continuous', 'continuous_rest', 'discrete']
# Values of the `proxy_type` column; the same codes are accepted by
# clustering_to_proxy.
proxies = ['BP', 'BA', 'HH']

In [9]:
def clustering_to_proxy(clustering, proxy, num_cands):
    """Re-express a clustering of ballots as a clustering of proxy vectors.

    Args:
        clustering: mapping of cluster label -> {ballot: weight}.
        proxy: proxy code, one of 'BP', 'BA' or 'HH'.
        num_cands: number of candidates, forwarded to the proxy function.

    Returns:
        A dict mapping each cluster label to {proxy tuple: total weight}.
        Ballots of one cluster that share a proxy have their weights summed.

    Raises:
        ValueError: if `proxy` is not a known code. The check happens when a
            ballot is converted, so a clustering without ballots never raises.
    """
    def as_proxy(ballot):
        # 'BP' and 'BA' differ only in the borda_style handed to Borda_vector.
        if proxy == 'BP':
            return Borda_vector(ballot, num_cands=num_cands, borda_style='pes')
        if proxy == 'BA':
            return Borda_vector(ballot, num_cands=num_cands, borda_style='avg')
        if proxy == 'HH':
            return HH_proxy(ballot, num_cands=num_cands)
        raise ValueError("Invalid proxy type")

    converted = {}
    for label, ballots in clustering.items():
        merged = {}
        for ballot, weight in ballots.items():
            # Tuples are hashable, so the proxy itself can serve as the key.
            key = tuple(as_proxy(ballot))
            merged[key] = merged.get(key, 0) + weight
        converted[label] = merged
    return converted

def cluster_stdev(centroid, points):
    """Return the weighted standard deviation of L1 distances to a centroid.

    Args:
        centroid: coordinates of the cluster centre (any array-like).
        points: mapping of point coordinates -> weight.

    Returns:
        The square root of the weighted variance of the L1 distances between
        each point and `centroid`.

    Raises:
        ValueError: if the weights sum to zero (including when `points` is
            empty).
    """
    center = np.array(centroid)
    entries = list(points.items())

    # One L1 distance per point, kept in the same order as the weights.
    l1_dists = np.array([np.sum(np.abs(np.array(pt) - center)) for pt, _ in entries])
    masses = np.array([mass for _, mass in entries])

    total_mass = np.sum(masses)
    if total_mass == 0:
        raise ValueError("Sum of weights cannot be zero.")

    mean_dist = np.sum(l1_dists * masses) / total_mass
    variance = np.sum(masses * (l1_dists - mean_dist) ** 2) / total_mass
    return np.sqrt(variance)



In [None]:
# Plot 1: for every (method, proxy) pair, scatter the sum of the two clusters'
# standard deviations against the L1 distance between the two centroids, and
# record the same numbers per election in plot_1_data.
PLOT_1_SAVE = '7CandidateElectionPlots/stdev_v_distance'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(PLOT_1_SAVE, exist_ok=True)
plot_1_data = pd.DataFrame(columns=['filename', 'method', 'proxy_type', 'stdev', 'distance'])
for method in methods:
    for proxy in proxies:
        elections = two_clusterings[(two_clusterings.method == method) & (two_clusterings.proxy_type == proxy)]
        stdevs = []
        distances = []

        for i, election in elections.iterrows():
            clustering = election.clustering
            centroids = election.proxies_of_centers
            num_cands = election.num_cands
            proxy_clustering = clustering_to_proxy(clustering, proxy, num_cands=num_cands)

            # y value: spread of cluster 0 plus spread of cluster 1.
            stdev = cluster_stdev(centroids[0], proxy_clustering[0]) + cluster_stdev(centroids[1], proxy_clustering[1])
            # x value: L1 distance between the two centroids.
            distance = np.sum(np.abs(np.array(centroids[0]) - np.array(centroids[1])))
            stdevs.append(stdev)
            distances.append(distance)

            # Rows are keyed by the election's index label in `two_clusterings`.
            plot_1_data.loc[i] = {
                'filename': election['filename'],
                'method': method,
                'proxy_type': proxy,
                'stdev': stdev,
                'distance': distance
            }

        plt.scatter(distances, stdevs)
        plt.title(f"Standard Deviation vs Distance for {method} {proxy}")
        plt.xlabel("Distance between centroids")
        plt.ylabel("Sum of standard deviations")
        plt.savefig(f"{PLOT_1_SAVE}/{method}_{proxy}.png")
        plt.close()


In [18]:
# Show the Plot 1 rows whose stdev and distance fall inside a narrow window.
plot_1_data.query("4.0 < stdev < 4.4 and 20 < distance < 24 ")

Unnamed: 0,filename,method,proxy_type,stdev,distance
1215,edinburgh_2017_ward6.csv,discrete,HH,4.385784,23.0
1216,edinburgh_2022_ward10.csv,discrete,HH,4.081137,23.0


In [22]:
# Show every Plot 1 row recorded for these two election files.
plot_1_data.query("filename == 'edinburgh_2022_ward10.csv' or filename == 'edinburgh_2017_ward6.csv'")

Unnamed: 0,filename,method,proxy_type,stdev,distance
87,edinburgh_2017_ward6.csv,continuous,BP,8.249481,20.0
88,edinburgh_2022_ward10.csv,continuous,BP,7.229138,17.0
369,edinburgh_2017_ward6.csv,continuous_rest,BP,8.21418,21.0
370,edinburgh_2022_ward10.csv,continuous_rest,BP,7.78605,19.0
933,edinburgh_2017_ward6.csv,discrete,BP,8.21418,21.0
934,edinburgh_2022_ward10.csv,discrete,BP,7.78605,19.0
651,edinburgh_2017_ward6.csv,discrete,BA,6.397317,14.0
652,edinburgh_2022_ward10.csv,discrete,BA,6.247131,16.0
1215,edinburgh_2017_ward6.csv,discrete,HH,4.385784,23.0
1216,edinburgh_2022_ward10.csv,discrete,HH,4.081137,23.0


In [6]:
# Per-cluster variant of Plot 1: each election contributes two points, one per
# cluster, pairing that cluster's standard deviation with the centroid distance
# prorated by the cluster's share of the total weight.
PLOT_1_SAVE = 'plots/stdev_v_distance_by_cluster'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(PLOT_1_SAVE, exist_ok=True)
for method in methods:
    for proxy in proxies:
        elections = two_clusterings[(two_clusterings.method == method) & (two_clusterings.proxy_type == proxy)]
        stdevs = []
        distances = []

        for _idx, election in elections.iterrows():
            clustering = election.clustering
            centroids = election.proxies_of_centers
            num_cands = election.num_cands
            proxy_clustering = clustering_to_proxy(clustering, proxy, num_cands=num_cands)

            # Share of the total ballot weight that sits in cluster 0.
            weight_0 = sum(clustering[0].values())
            weight_1 = sum(clustering[1].values())
            weights_percent_0 = weight_0 / (weight_0 + weight_1)

            # L1 distance between the centroids, computed once and then split
            # between the two clusters in proportion to their weight.
            centroid_distance = np.sum(np.abs(np.array(centroids[0]) - np.array(centroids[1])))

            stdevs.append(cluster_stdev(centroids[0], proxy_clustering[0]))
            stdevs.append(cluster_stdev(centroids[1], proxy_clustering[1]))
            distances.append(weights_percent_0 * centroid_distance)
            distances.append((1 - weights_percent_0) * centroid_distance)

        plt.scatter(distances, stdevs)
        plt.title(f"Standard Deviation vs Distance for {method} {proxy}")
        plt.xlabel("Distance between centroids (prorated by cluster weight)")
        plt.ylabel("Standard deviation")
        plt.savefig(f"{PLOT_1_SAVE}/{method}_{proxy}.png")
        plt.close()


In [19]:
# Scatter the (scaled) standard deviation of cluster 0 against that of
# cluster 1, one point per election, for every (method, proxy) pair.
PLOT_1_SAVE = 'plots/stdev_v_stdev'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(PLOT_1_SAVE, exist_ok=True)
for method in methods:
    for proxy in proxies:
        elections = two_clusterings[(two_clusterings.method == method) & (two_clusterings.proxy_type == proxy)]
        stdev_1 = []
        stdev_2 = []

        for _idx, election in elections.iterrows():
            clustering = election.clustering
            centroids = election.proxies_of_centers
            num_cands = election.num_cands
            proxy_clustering = clustering_to_proxy(clustering, proxy, num_cands=num_cands)
            # NOTE(review): both values are halved, presumably as the
            # equal-weight counterpart of the size-weighted plot; the axis
            # labels do not mention the 0.5 factor - confirm this is intended.
            stdev_1.append(0.5*cluster_stdev(centroids[0], proxy_clustering[0]))
            stdev_2.append(0.5*cluster_stdev(centroids[1], proxy_clustering[1]))

        plt.scatter(stdev_1, stdev_2)
        plt.title(f"StDev vs StDev of each Cluster (unweighted) for {method} {proxy}")
        plt.xlabel("Standard Deviation (Cluster 1)")
        plt.ylabel("Standard Deviation (Cluster 2)")
        plt.savefig(f"{PLOT_1_SAVE}/{method}_{proxy}.png")
        plt.close()


In [18]:
# Scatter the standard deviation of cluster 0 against that of cluster 1, each
# multiplied by its cluster's share of the total weight, one point per election.
PLOT_1_SAVE = 'plots/stdev_v_stdev_weighted'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(PLOT_1_SAVE, exist_ok=True)
for method in methods:
    for proxy in proxies:
        elections = two_clusterings[(two_clusterings.method == method) & (two_clusterings.proxy_type == proxy)]
        stdev_1 = []
        stdev_2 = []

        for _idx, election in elections.iterrows():
            clustering = election.clustering
            centroids = election.proxies_of_centers
            num_cands = election.num_cands
            proxy_clustering = clustering_to_proxy(clustering, proxy, num_cands=num_cands)

            # Share of the total ballot weight that sits in cluster 0.
            weight_0 = sum(clustering[0].values())
            weight_1 = sum(clustering[1].values())
            weights_percent_0 = weight_0 / (weight_0 + weight_1)

            stdev_1.append(weights_percent_0*cluster_stdev(centroids[0], proxy_clustering[0]))
            stdev_2.append((1-weights_percent_0)*cluster_stdev(centroids[1], proxy_clustering[1]))

        plt.scatter(stdev_1, stdev_2)
        plt.title(f"StDev vs StDev of each Cluster (weighted) for {method} {proxy}")
        plt.xlabel("Standard deviation weighted by cluster size (Cluster 1)")
        plt.ylabel("Standard deviation weighted by cluster size (Cluster 2)")
        plt.savefig(f"{PLOT_1_SAVE}/{method}_{proxy}.png")
        plt.close()


In [8]:
import numpy as np

def _weighted_l1_stdev(center, pts_dict):
    """Return the weighted std dev of the L1 distances from points to `center`."""
    dists = np.array([np.sum(np.abs(np.array(pt) - center)) for pt in pts_dict])
    ws = np.array(list(pts_dict.values()))
    total_w = np.sum(ws)
    if total_w == 0:
        raise ValueError("Total weight for a cluster cannot be zero.")
    weighted_mean = np.sum(dists * ws) / total_w
    variance = np.sum(ws * (dists - weighted_mean) ** 2) / total_w
    return np.sqrt(variance)


def weighted_unique_coverage(centroids, points_list, alphas):
    """
    Measure how much weight lies close to exactly one cluster centroid.

    Parameters:
      centroids   : iterable of centroid coordinates (each a list, tuple, or np.array)
      points_list : iterable of dictionaries. Each dictionary maps a point (list, tuple, etc.)
                    to its weight. The i-th dictionary corresponds to the i-th centroid.
      alphas      : iterable of multipliers (one per centroid) for the standard deviation

    All three arguments may be any iterable (lists, dict views, generators);
    they are materialised once on entry.

    Returns:
      (fraction_1x, fraction_alpha)
      where:
        - fraction_1x    is the weighted fraction of global points that lie within exactly
                         one cluster's 1x standard deviation threshold.
        - fraction_alpha is the weighted fraction of global points that lie within exactly
                         one cluster's alpha x standard deviation threshold.

    Raises:
      ValueError : if the three inputs differ in length, if a cluster's weights
                   sum to zero, or if the combined weights sum to zero.
    """
    # Materialise the inputs so that len() works and one-shot iterables can be
    # traversed more than once. The centroid arrays are built a single time
    # here instead of once per point.
    centers = [np.array(centroid) for centroid in centroids]
    points_list = list(points_list)
    alphas = list(alphas)

    n_clusters = len(centers)
    if not (len(points_list) == len(alphas) == n_clusters):
        raise ValueError("Arrays centroids, points_list, and alphas must all be the same length.")

    # Step 1: each cluster's threshold is the weighted standard deviation of its
    # own points' L1 distances to its centroid (and alpha times that value).
    thresholds = [
        _weighted_l1_stdev(center, pts_dict)
        for center, pts_dict in zip(centers, points_list)
    ]
    alpha_thresholds = [alpha * stdev for alpha, stdev in zip(alphas, thresholds)]

    # Step 2: pool the points of all clusters; a point that appears in several
    # clusters has its weights summed.
    global_points = {}
    for pts_dict in points_list:
        for pt, w in pts_dict.items():
            # Tuples are hashable, so list-like points can be used as keys.
            pt_key = tuple(pt)
            global_points[pt_key] = global_points.get(pt_key, 0) + w

    total_global_weight = sum(global_points.values())
    if total_global_weight == 0:
        raise ValueError("Global sum of weights cannot be zero.")

    # Step 3: a point contributes its weight only when it falls inside the
    # threshold of exactly one cluster.
    numerator_1x = 0.0
    numerator_alpha = 0.0
    for pt, weight in global_points.items():
        pt_arr = np.array(pt)
        count_1x = 0
        count_alpha = 0
        for center, thresh, alphathresh in zip(centers, thresholds, alpha_thresholds):
            dist = np.sum(np.abs(pt_arr - center))
            if dist <= thresh:
                count_1x += 1
            if dist <= alphathresh:
                count_alpha += 1
        if count_1x == 1:
            numerator_1x += weight
        if count_alpha == 1:
            numerator_alpha += weight

    fraction_1x = numerator_1x / total_global_weight
    fraction_alpha = numerator_alpha / total_global_weight

    return fraction_1x, fraction_alpha


In [13]:
# For every (method, proxy) pair, scatter the two fractions returned by
# weighted_unique_coverage against each other (one point per election) and
# record them per election in plot_5_data.
PLOT_SAVE = 'plots/stdev_coverage'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(PLOT_SAVE, exist_ok=True)
plot_5_data = pd.DataFrame(columns=['filename', 'method', 'proxy_type', 'unweighted', 'weighted'])
for method in methods:
    for proxy in proxies:
        elections = two_clusterings[(two_clusterings.method == method) & (two_clusterings.proxy_type == proxy)]
        pct_covered = []
        weighted_pct_covered = []

        for i, election in elections.iterrows():
            clustering = election.clustering
            centroids = election.proxies_of_centers
            num_cands = election.num_cands
            proxy_clustering = clustering_to_proxy(clustering, proxy, num_cands=num_cands)

            # Share of the total ballot weight that sits in cluster 0.
            weight_0 = sum(clustering[0].values())
            weight_1 = sum(clustering[1].values())
            weights_percent_0 = weight_0 / (weight_0 + weight_1)

            # Pass centroids and clusters explicitly in label order (0, 1) so
            # they line up with the alphas regardless of dict insertion order.
            fraction, fraction_weighted = weighted_unique_coverage(
                [centroids[0], centroids[1]],
                [proxy_clustering[0], proxy_clustering[1]],
                [weights_percent_0, 1 - weights_percent_0],
            )
            pct_covered.append(fraction)
            weighted_pct_covered.append(fraction_weighted)

            # Rows are keyed by the election's index label in `two_clusterings`.
            plot_5_data.loc[i] = {
                'filename': election['filename'],
                'method': method,
                'proxy_type': proxy,
                'unweighted': fraction,
                'weighted': fraction_weighted
            }

        plt.scatter(pct_covered, weighted_pct_covered)
        plt.title(f"Fraction within one stdev of unique cluster ({method} {proxy})", pad=20)
        plt.xlabel("Fraction within one stdev of unique cluster (unweighted)")
        plt.ylabel("Fraction within one stdev of unique cluster (weighted)")
        plt.savefig(f"{PLOT_SAVE}/{method}_{proxy}.png")
        plt.close()
