This notebook deals with converting solution files into a consolidated dataframe of all the model results. The first part of the notebook generates a dataframe from the all_models directory (the initial cluster run that contains some errors). The second part of the notebook generates a dataframe for the clean feb 25 runs.

In [1]:
import importlib
import analysis

importlib.reload(analysis)
from analysis import get_centroids
import Clustering_Functions
importlib.reload(Clustering_Functions)
from Clustering_Functions import HH_proxy, Borda_vector, csv_parse


import numpy as np
import os
import zipfile
import re
from pathlib import Path
import pandas as pd


In [2]:
def parse_filename(filename: str, base_directory: str = "scot-elex-main") -> dict:
    """
    Parse a solution filename into its components

    Args:
    filename: str
        The filename to parse
    base_directory: str
        The directory containing the election data csv files
    """
    parts = filename.replace('.sol', '').split('_')
    
    # Handle the special case of 'continuous_rest'
    if parts[0] == 'continuous' and parts[1] == 'rest':
        method = 'continuous_rest'
        proxy = parts[2].lower()
        election_parts = parts[3:-2]
    else:
        method = parts[0]
        proxy = parts[1].lower()
        election_parts = parts[2:-2]
    
    num_clusters = int(parts[-1])
    num_candidates = int(parts[-2])
    election = '_'.join(election_parts)

    # Get raw ballots from csv in base_directory
    csv_path = os.path.join(base_directory, f"{election}.csv")
    _, election_data, _, _ = csv_parse(csv_path)
    # Process ballots based on proxy type
    processed_ballots = {}
    
    if proxy == 'hh':
        for ballot, count in election_data.items():
            processed_ballot = 2*HH_proxy(ballot, num_candidates)
            processed_ballots[ballot] = tuple(processed_ballot)
            
    elif proxy.startswith('borda'):
        borda_style = 'avg' if proxy == 'bordaa' else 'pes'
        
        for ballot, count in election_data.items():
            processed_ballot = Borda_vector(ballot, num_candidates, borda_style=borda_style)
            processed_ballots[ballot] = tuple(processed_ballot)
    return {
        'method': method,
        'proxy': proxy,
        'election': election,
        'num_candidates': int(num_candidates),
        'num_clusters': int(num_clusters),
        'original_ballots': election_data,
        'proxy_conversion': processed_ballots
    }

In [3]:
def assign_ballots(centroids: dict, original_ballots: dict, proxy_conversion: dict) -> dict:
    clusters = {i: {} for i in centroids}
    cost = 0
    for (original_ballot, count) in original_ballots.items():
       proxy_ballot = tuple(proxy_conversion[original_ballot])
       original_ballot = tuple(original_ballot)
       distances = {c: sum(abs(b - p) for b, p in zip(proxy_ballot, centroids[c])) for c in centroids}
       min_dist = min(distances.values())
       closest = [c for c, d in distances.items() if d == min_dist]
       split_count = count / len(closest)
       cost += min_dist * count
       for c in closest:
           clusters[c][original_ballot] = clusters[c].get(original_ballot, 0) + split_count

    # Assert that sum of weights in clusters is same as sum of original weights in both proxy and original
    assert np.abs(sum(sum(c.values()) for c in clusters.values()) - sum(original_ballots.values())) < 1e-6
    return clusters, cost

In [221]:
results_data = []
directory = "all_models"
zip_pattern = re.compile(r'\d+_cand_results?\.zip')
# zip_pattern = re.compile(r'03_cand_results?\.zip')

# process each zip file
for filename in os.listdir(directory):
    if not zip_pattern.match(filename):
        continue
        
    zip_path = os.path.join(directory, filename)
    print(f"Processing {filename}...")
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # process each solution file
        for sol_file in zip_ref.namelist():
            if not sol_file.endswith('.sol'):
                continue
            
            # parse filename components
            components = parse_filename(sol_file)
            
            
            # extract and process the file
            zip_ref.extract(sol_file, 'temp_extract')
            temp_path = os.path.join('temp_extract', sol_file)
            
            try:
                # get centroids

                if components['method'] == 'discrete':
                    # order of ballots here matters
                    centroids = get_centroids(temp_path, True, components['proxy'], list(components['proxy_conversion'].values()))
                else:
                    centroids = get_centroids(temp_path, False, components['proxy'], None)
                


                # add to results
                result_entry = {
                    'election': components['election'],
                    'method': components['method'],
                    'proxy': components['proxy'],
                    'num_candidates': components['num_candidates'],
                    'num_clusters': components['num_clusters'],
                    'centroids': centroids,
                    'ballots': assign_ballots(centroids, components['original_ballots'], components['proxy_conversion'])
                }
                results_data.append(result_entry)
                
            finally:
                # clean up
                os.remove(temp_path)
                
# clean up temp directory
if os.path.exists('temp_extract'):
    os.rmdir('temp_extract')

results_df = pd.DataFrame(results_data)

Processing 05_cand_result.zip...
Processing 04_cand_result.zip...
Processing 03_cand_result.zip...
Processing 06_cand_result.zip...
Processing 09_cand_result.zip...
Processing 07_cand_result.zip...
Processing 08_cand_result.zip...


In [204]:
results_df.query('election == "edinburgh_2017_ward2" and method == "continuous_rest" and proxy == "bordaa"')['centroids'].values

array([{0: (6.0, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5), 1: (2.5, 2.5, 6.0, 2.5, 2.5, 2.5, 2.5)}],
      dtype=object)

In [222]:
results_df

Unnamed: 0,election,method,proxy,num_candidates,num_clusters,centroids,ballots
0,aberdeen_2022_ward11,continuous,bordaa,5,2,"{0: (4.0, 0.5, 1.5, 2.0, 1.5), 1: (1.0, 3.0, 2...","{0: {(3, 1, 5): 22.0, (4, 3): 1.0, (1, 2, 3, 5..."
1,aberdeenshire_2012_ward10,continuous,bordaa,5,2,"{0: (1.5, 4.0, 1.5, 1.5, 1.5), 1: (3.0, 1.0, 3...","{0: {(1, 2): 8.0, (1, 2, 4): 4.0, (1, 2, 4, 3,..."
2,aberdeenshire_2012_ward1,continuous,bordaa,5,2,"{0: (1.0, 1.0, 3.0, 1.5, 4.0), 1: (3.0, 3.0, 1...","{0: {(1, 3, 5): 7.0, (1, 3, 5, 2): 1.0, (1, 3,..."
3,aberdeenshire_2012_ward13,continuous,bordaa,5,2,"{0: (3.0, 3.0, 1.0, 1.0, 1.0), 1: (1.0, 1.0, 3...","{0: {(1,): 10.0, (1, 2): 273.0, (1, 2, 3): 40...."
4,aberdeenshire_2012_ward15,continuous,bordaa,5,2,"{0: (1.5, 4.0, 1.5, 1.5, 1.5), 1: (3.0, 0.5, 3...","{0: {(1, 2): 13.0, (1, 2, 3): 16.0, (1, 2, 3, ..."
...,...,...,...,...,...,...,...
8390,west_lothian_2017_ward4,discrete,hh,8,2,"{0: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, ...","{0: {(1,): 138.0, (1, 2): 19.0, (1, 2, 3): 6.0..."
8391,west_lothian_2017_ward5,discrete,hh,8,2,"{0: (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, -...","{0: {(1,): 55.0, (1, 2): 1.0, (1, 2, 3): 2.0, ..."
8392,west_lothian_2022_ward3,discrete,hh,8,2,"{0: (1.0, 1.0, 1.0, -1.0, 1.0, -1.0, 1.0, 0.0,...","{0: {(2, 7, 1): 1.0, (5, 7, 3, 6, 2, 8, 4, 1):..."
8393,west_lothian_2022_ward6,discrete,hh,8,2,"{0: (-1.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 1.0,...","{0: {(2, 7, 1): 4.0, (2, 4, 1, 5, 7, 6, 8): 1...."


In [None]:
results_df.to_pickle('outdated_2_cluster_results.pkl')

### Feb 25 Runs

In [4]:
def parse_filename_feb25(filename: str, base_directory: str = "scot-elex-main") -> dict:
    """
    Parse a solution filename into its components

    Args:
    filename: str
        The filename to parse
    base_directory: str
        The directory containing the election data csv files
    """
    parts = filename.replace('.sol', '').split('_')
    
    # Handle the special case of 'continuous_rest'
    if parts[0] == 'continuous' and parts[1] == 'rest':
        method = 'continuous_rest'
        proxy = parts[2].lower()
        election_parts = parts[3:-2]
    else:
        method = parts[0]
        proxy = parts[1].lower()
        election_parts = parts[2:-2]
    
    num_clusters = int(parts[-1])
    num_candidates = 7
    election = '_'.join(election_parts)

    # Get raw ballots from csv in base_directory
    csv_path = os.path.join(base_directory, f"{election}.csv")
    _, election_data, _, _ = csv_parse(csv_path)
    # Process ballots based on proxy type
    processed_ballots = {}
    
    if proxy == 'hh':
        for ballot, count in election_data.items():
            processed_ballot = 2*HH_proxy(ballot, num_candidates)
            processed_ballots[ballot] = tuple(processed_ballot)
            
    elif proxy.startswith('borda'):
        borda_style = 'avg' if proxy == 'bordaa' else 'pes'
        
        for ballot, count in election_data.items():
            processed_ballot = Borda_vector(ballot, num_candidates, borda_style=borda_style)
            processed_ballots[ballot] = tuple(processed_ballot)
    return {
        'method': method,
        'proxy': proxy,
        'election': election,
        'num_candidates': int(num_candidates),
        'num_clusters': int(num_clusters),
        'original_ballots': election_data,
        'proxy_conversion': processed_ballots
    }

In [7]:
results_data = []
directory = "feb_25_runs/3_cluster_solutions"

for filename in os.listdir(directory):
    if not filename.endswith('.sol'):
        continue
    components = parse_filename_feb25(filename)

    path = os.path.join(directory, filename)
    
    if components['method'] == 'discrete':
        # order of ballots here matters
        centroids = get_centroids(path, True, components['proxy'], list(components['proxy_conversion'].values()))
    else:
        centroids = get_centroids(path, False, components['proxy'], None)
    
    print(components['election'])
    # add to results
    result_entry = {
        'election': components['election'],
        'method': components['method'],
        'proxy': components['proxy'],
        'num_candidates': components['num_candidates'],
        'num_clusters': components['num_clusters'],
        'centroids': centroids,
        'ballots': assign_ballots(centroids, components['original_ballots'], components['proxy_conversion'])
    }
    results_data.append(result_entry)


results_df = pd.DataFrame(results_data)

moray_2017_ward5
highland_2012_ward5
highland_2012_ward2
east_lothian_2017_ward5
dumgal_2017_ward6
inverclyde_2012_ward5
glasgow_2017_ward23
south_lanarkshire_2022_ward5
renfrewshire_2017_ward10
renfrewshire_2022_ward7
dundee_2017_ward5
west_dunbartonshire_2017_ward1
highland_2022_aird_and_loch_ness
aberdeen_2012_ward9
glasgow_2017_ward12
inverclyde_2022_ward5
argyll_bute_2012_ward1
inverclyde_2017_ward5
south_lanarkshire_2022_ward10
highland_2017_ward11
argyll_bute_2017_ward8
fife_2017_ward4
eilean_siar_2022_ward10
argyll_bute_2012_ward2
eilean_siar_2012_ward5
south_ayrshire_2012_ward4
highland_2012_ward16
highland_2012_ward11
midlothian_2017_ward2
north_ayrshire_2017_ward9
argyll_bute_2017_ward9
north_lanarkshire_2017_ward6
highland_2012_ward6
aberdeenshire_2017_ward8
north_lanarkshire_2017_ward17
east_ayrshire_2022_ward3
highland_2022_inverness_south
argyll_bute_2012_ward9
midlothian_2012_ward5
renfrewshire_2012_ward3


In [8]:
results_df.to_pickle('3_cluster_results_feb25.pkl')

This last part is collecting all the edin_17_2 results into one csv.

In [5]:
old_2_cluster_results = pd.read_pickle('2_cluster_results.pkl')

In [11]:
edin_17_2 = old_2_cluster_results[old_2_cluster_results['election'] == 'edinburgh_2017_ward2']

In [None]:
edin_17_2 = edin_17_2[edin_17_2['proxy'] != 'bordaa']

In [56]:
ELECTION_DIR = 'scot-elex-main'
election_data = csv_parse(f'{ELECTION_DIR}/edinburgh_2017_ward2.csv')[1]

import ip_models

cont_hh = ip_models.continuous_rest_hh(election_data=election_data, num_clusters=2)

In [10]:
cont_hh.optimize()

Gurobi Optimizer version 11.0.2 build v11.0.2rc0 (mac64[arm] - Darwin 24.1.0 24B5035e)

CPU model: Apple M2 Max
Thread count: 12 physical cores, 12 logical processors, using up to 12 threads

Optimize a model with 2499 rows, 2868 columns and 60392 nonzeros
Model fingerprint: 0x38e53538
Variable types: 42 continuous, 2826 integer (2700 binary)
Coefficient statistics:
  Matrix range     [1e+00, 2e+04]
  Objective range  [1e+00, 1e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 2e+04]
Presolve removed 1469 rows and 1389 columns
Presolve time: 0.10s
Presolved: 1030 rows, 1479 columns, 30193 nonzeros
Variable types: 0 continuous, 1479 integer (1370 binary)
Found heuristic solution: objective 162177.00000

Root relaxation: objective 0.000000e+00, 249 iterations, 0.01 seconds (0.01 work units)

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  Depth IntInf | Incumbent    BestBd   Gap | It/Node Time

     0     0    0.00000    0  10

In [15]:
import math
num_cands = 7
dim = math.comb(num_cands, 2)
z = analysis.extract_z_variables(cont_hh, range(dim), range(2), [-1, 0, 1])
centroids = analysis.extract_centroids(cont_hh, z, range(2), range(dim), [-1, 0, 1])

{'z[0,0,-1]': (0, 0, -1), 'z[0,0,0]': (0, 0, 0), 'z[0,0,1]': (0, 0, 1), 'z[0,1,-1]': (0, 1, -1), 'z[0,1,0]': (0, 1, 0), 'z[0,1,1]': (0, 1, 1), 'z[1,0,-1]': (1, 0, -1), 'z[1,0,0]': (1, 0, 0), 'z[1,0,1]': (1, 0, 1), 'z[1,1,-1]': (1, 1, -1), 'z[1,1,0]': (1, 1, 0), 'z[1,1,1]': (1, 1, 1), 'z[2,0,-1]': (2, 0, -1), 'z[2,0,0]': (2, 0, 0), 'z[2,0,1]': (2, 0, 1), 'z[2,1,-1]': (2, 1, -1), 'z[2,1,0]': (2, 1, 0), 'z[2,1,1]': (2, 1, 1), 'z[3,0,-1]': (3, 0, -1), 'z[3,0,0]': (3, 0, 0), 'z[3,0,1]': (3, 0, 1), 'z[3,1,-1]': (3, 1, -1), 'z[3,1,0]': (3, 1, 0), 'z[3,1,1]': (3, 1, 1), 'z[4,0,-1]': (4, 0, -1), 'z[4,0,0]': (4, 0, 0), 'z[4,0,1]': (4, 0, 1), 'z[4,1,-1]': (4, 1, -1), 'z[4,1,0]': (4, 1, 0), 'z[4,1,1]': (4, 1, 1), 'z[5,0,-1]': (5, 0, -1), 'z[5,0,0]': (5, 0, 0), 'z[5,0,1]': (5, 0, 1), 'z[5,1,-1]': (5, 1, -1), 'z[5,1,0]': (5, 1, 0), 'z[5,1,1]': (5, 1, 1), 'z[6,0,-1]': (6, 0, -1), 'z[6,0,0]': (6, 0, 0), 'z[6,0,1]': (6, 0, 1), 'z[6,1,-1]': (6, 1, -1), 'z[6,1,0]': (6, 1, 0), 'z[6,1,1]': (6, 1, 1), 'z[7,

In [None]:
cont_hh_centroids = {i: tuple(centroids[i]) for i in range(2)}

In [29]:
processed_ballots = {}
num_candidates = 7
for ballot, count in election_data.items():
            processed_ballot = 2*HH_proxy(ballot, num_candidates)
            processed_ballots[ballot] = tuple(processed_ballot)

In [58]:
cont_hh_clusters = assign_ballots(cont_hh_centroids,  election_data, processed_ballots)

In [None]:

full_cont_hh = get_centroids('edin_17_2_full_continuous.sol', False, 'hh')

In [34]:
full_cont_hh_clsuters = assign_ballots(full_cont_hh,  election_data, processed_ballots)

In [48]:
edin_17_2.at[4742, 'centroids'] = full_cont_hh
edin_17_2.at[4742, 'ballots'] = full_cont_hh_clsuters
edin_17_2.at[5585, 'centroids'] = cont_hh_centroids
edin_17_2.at[5585, 'ballots'] = cont_hh_clusters

In [51]:
edin_17_2.reset_index(inplace=True, drop=True)

In [53]:
edin_17_2.drop(6, inplace=True)

In [None]:
edin_17_2.to_csv('edin_17_2_solutions.csv')