## Example: Pre-Compute Gower Weights

This notebook shows how to create the pairwise co-sponsorship count variable, which is then used as the target for computing Gower weights.


In this example, the counts are computed per legislative period. Depending on the downstream modelling task, splits other than period-wise ones may be required.

For example, when running feed-forward CV, simply using period-wise weights would lead to data leakage.

Once the target variable is created, the Gower weights can be computed. Here, this is done only for the 49th legislative period.

### Import Modules

In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))
from data_loading import load_data
from helpers import get_gower_weights

import numpy as np
import pandas as pd
from itertools import combinations

### Compute Period-Wise Cosponsorship Counts

In [2]:
# Collects one long-form co-sponsorship frame per legislative period.
all_cosponsorships = []

# Legislative periods to process; each one is loaded and counted on its own.
periods = [49, 50, 51, 52]

# Export location of the combined period-wise counts.
cospon_counts_path = '../data/gower_weights/cospon_count_by_period.csv'

for period in periods:
    # Load the votes, affairs and councillors tables for this period only.
    votes, affairs, councillors = load_data([period])

    # ---- Pairs that are treated as impossible combinations ----------------
    # Councillors that appear at least once in this period's votes table.
    councillors_unique = sorted(votes['elanId'].dropna().unique())

    # Every unordered pair of those councillors, stored as (smaller, larger).
    all_possible_pairs = set(combinations(councillors_unique, 2))

    # Pairs that appear together in at least one group of the votes table
    # (grouped by its "id" column).
    co_voted_pairs = {
        combo
        for _, group in votes.groupby("id")
        for combo in combinations(sorted(group['elanId'].dropna().unique()), 2)
    }

    # Pairs that never voted together; these are dropped from the result below.
    never_voted_together = all_possible_pairs - co_voted_pairs

    # ---- Co-sponsorship count matrix --------------------------------------
    # Map every councillor id to its row/column index in the count matrix.
    ordered_c_ids = sorted(set(councillors['elanId']))
    c_id2idx = {id_: i for i, id_ in enumerate(ordered_c_ids)}
    n_councillors = len(c_id2idx)

    # C[i, j] = number of affairs on which councillors i and j both appear.
    # Counts are whole numbers, so an integer matrix is used directly.
    C = np.zeros((n_councillors, n_councillors), dtype=np.int64)

    # Filter out affairs not authored by councillors (these are []).
    councillor_affairs = affairs[affairs['cosign_author_elanId'].apply(lambda x: len(x) > 0)]

    for id_list in councillor_affairs['cosign_author_elanId']:
        # De-duplicate the ids of one affair and make sure they are ints.
        cosponsors = {int(x) for x in id_list}

        # Keep only ids present in the councillors table. `c_id2idx` has exactly
        # those ids as keys, so the dict lookup replaces a linear scan over
        # councillors['elanId'].values for every single id.
        indices = [c_id2idx[x] for x in cosponsors if x in c_id2idx]

        # Fewer than two known councillors would only touch the diagonal,
        # which is zeroed afterwards anyway.
        if len(indices) < 2:
            continue

        # Increment every (i, j) combination of this affair at once. The
        # indices are distinct, so each cell is incremented exactly once.
        C[np.ix_(indices, indices)] += 1

    # A councillor does not co-sponsor with themselves.
    np.fill_diagonal(C, 0)

    # ---- Long-form table ---------------------------------------------------
    # Strict upper triangle in row-major order: one row per unordered pair,
    # INCLUDING pairs with a count of zero.
    row_idx, col_idx = np.triu_indices(n_councillors, k=1)
    id_array = np.asarray(ordered_c_ids)
    cosponsorship_df = pd.DataFrame({
        'elanId_1': id_array[row_idx],
        'elanId_2': id_array[col_idx],
        'count': C[row_idx, col_idx],
    })

    # Remove impossible combinations. Both pair sets are ordered
    # (smaller id, larger id), so a plain tuple lookup is sufficient.
    possible_mask = np.fromiter(
        (
            pair not in never_voted_together
            for pair in zip(cosponsorship_df['elanId_1'], cosponsorship_df['elanId_2'])
        ),
        dtype=bool,
        count=len(cosponsorship_df),
    )
    cosponsorship_df = cosponsorship_df[possible_mask].reset_index(drop=True)

    # Add period & append.
    cosponsorship_df['legislative_period'] = period
    all_cosponsorships.append(cosponsorship_df)

# Combine all periods into a single frame.
final_df = pd.concat(all_cosponsorships, ignore_index=True)

# Export; create the target directory first so a fresh checkout does not fail.
os.makedirs(os.path.dirname(cospon_counts_path), exist_ok=True)
final_df.to_csv(cospon_counts_path, index=False)

### Compute Gower Weights for the 49th Legislative Period

In [None]:

# Reload the main data for the period of interest only, so that the tables
# passed on below are restricted to that period.
period = [49]

votes, affairs, councillors = load_data(period)

# Load the period-wise counts from the location the counting step exports
# them to (previously this pointed at '../data/clean/', where the freshly
# created file is never written).
target = pd.read_csv('../data/gower_weights/cospon_count_by_period.csv')

# Keep only the rows of the selected period.
target = target[target['legislative_period'] == period[0]]

# Councillor attributes for which Gower weights are computed.
features = [
    'degree_class',
    'profession_class',
    'gender',
    'average_age',
    'lang_region',
    'military_rank_ordinal',
    'faction_ordinal'
]

# Get Gower weights: the last argument controls the size of the random share of
# the data used to compute SHAP values (the higher, the longer it takes).
result = get_gower_weights(councillors, features, target, 'randomForestReg', 0.25)

# Save the Gower weights of this period.
result.to_csv(f'../data/gower_weights/gower_weights_{period[0]}.csv')