In [None]:
#default_exp similarity

# Member similarity
> Computing the similarity of members of the Bundestag.

In [None]:
%load_ext autoreload
%autoreload 2

## Setup

In [1]:
#export
import pandas as pd
import typing
import tqdm
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import pickle

## Computing similarities between members of parliament based on their votes

complicating factors:
- not every parlamentarian voted for all the available issues
- the union of issues voted on between parliamentarians may vary between all pairs of parliamentarian
- similarity metric: cosine, agreement (# of same votes for all shared issues)

In [2]:
%%time
df = pd.read_parquet('../votes.parquet')

CPU times: user 125 ms, sys: 188 ms, total: 312 ms
Wall time: 489 ms


In [4]:
df.head().T

Unnamed: 0,0,1,2,3,4
Wahlperiode,18,18,18,18,18
Sitzungnr,115,115,115,115,115
Abstimmnr,6,6,6,6,6
Fraktion/Gruppe,CDU/CSU,CDU/CSU,CDU/CSU,CDU/CSU,CDU/CSU
Name,Albani,Albsteiger,Altmaier,Auernhammer,Bär
Vorname,Stephan,Katrin,Peter,Artur,Dorothee
Titel,,,,,
ja,1,1,0,1,1
nein,0,0,0,0,0
Enthaltung,0,0,0,0,0


In [25]:
#export
VOTE_COLS = ['ja', 'nein', 'Enthaltung', 'ungültig', 'nichtabgegeben']
def get_squished_dataframe(df:pd.DataFrame, id_col:str='Bezeichnung',
                           feature_cols:typing.List[str]=VOTE_COLS,
                           topic_cols:typing.List=['date', 'title'],
                           other_cols:typing.List=None):
    
    other_cols = [] if other_cols is None else other_cols
    tmp = df.loc[:, [id_col] + feature_cols + other_cols]
    tmp['issue'] = df['date'].dt.date.apply(str) + ' ' + df['title']

    tmp = tmp.set_index([id_col, 'issue'] + other_cols)
    return (tmp[tmp == 1].stack()
            .reset_index()
            .drop(0,1)
            .rename(columns={f'level_{2+len(other_cols)}':'vote'}))

In [26]:
%%time
df_squished = get_squished_dataframe(df)

CPU times: user 406 ms, sys: 31.2 ms, total: 438 ms
Wall time: 410 ms


In [27]:
df_squished.head()

Unnamed: 0,Bezeichnung,issue,vote
0,Stephan Albani,2015-07-02 Subventionen für britisches Atomwe...,ja
1,Katrin Albsteiger,2015-07-02 Subventionen für britisches Atomwe...,ja
2,Peter Altmaier,2015-07-02 Subventionen für britisches Atomwe...,nichtabgegeben
3,Artur Auernhammer,2015-07-02 Subventionen für britisches Atomwe...,ja
4,Dorothee Bär,2015-07-02 Subventionen für britisches Atomwe...,ja


In [28]:
#hide
assert all([v in df_squished.columns for v in ['Bezeichnung', 'issue', 'vote']])

In [32]:
%%time
other_cols = ['date', 'Fraktion/Gruppe']
df_squished_extended = get_squished_dataframe(df, other_cols=other_cols)

CPU times: user 438 ms, sys: 0 ns, total: 438 ms
Wall time: 455 ms


In [33]:
df_squished_extended.head()

Unnamed: 0,Bezeichnung,issue,date,Fraktion/Gruppe,vote
0,Stephan Albani,2015-07-02 Subventionen für britisches Atomwe...,2015-07-02,CDU/CSU,ja
1,Katrin Albsteiger,2015-07-02 Subventionen für britisches Atomwe...,2015-07-02,CDU/CSU,ja
2,Peter Altmaier,2015-07-02 Subventionen für britisches Atomwe...,2015-07-02,CDU/CSU,nichtabgegeben
3,Artur Auernhammer,2015-07-02 Subventionen für britisches Atomwe...,2015-07-02,CDU/CSU,ja
4,Dorothee Bär,2015-07-02 Subventionen für britisches Atomwe...,2015-07-02,CDU/CSU,ja


In [34]:
#hide
assert len(df_squished) == len(df_squished_extended)
assert all([v in df_squished_extended.columns for v in ['Bezeichnung', 'issue', 'vote']])

In [35]:
#export
def get_agreements_painfully_slow(df:pd.DataFrame, 
                                  member0:str, member1:str, 
                                  verbose:bool=False,
                                  id_col:str='Bezeichnung'):
    #TODO: prettify & speed test the calculation
    members = df[id_col].unique()
    assert member0 in members, f'{member0} not found'
    assert member1 in members, f'{member1} not found'
    res = {}
    
    member0_mask = df[id_col] == member0
    member1_mask = df[id_col] == member1
    
    common_issues = set(df.loc[member0_mask,'issue'].values).intersection(df.loc[member1_mask,'issue'].values)

    common_issue_mask = df['issue'].isin(common_issues)
    votes0 = df.loc[member0_mask & common_issue_mask].sort_values('issue')
    votes1 = df.loc[member1_mask & common_issue_mask].sort_values('issue')
    n_issues = df.loc[common_issue_mask,'issue'].nunique()
    
    if n_issues == 0:
        return res
    
    agreement_frac = (votes0['vote'].values == votes1['vote'].values).sum() / n_issues
    if verbose: print(f'overall agreement {agreement_frac*100:.2f} %')
    
    res['overall_frac'] = agreement_frac
    res['overall_total'] = n_issues
    res['member0'] = member0
    res['member1'] = member1
    
    for outcome in df.loc[common_issue_mask,'vote'].unique():

        n_issues = df.loc[common_issue_mask & (df['vote']==outcome), 'issue'].nunique()
        issues0 = votes0.loc[votes0['vote']==outcome, 'issue'].unique()
        issues1 = votes1.loc[votes1['vote']==outcome, 'issue'].unique()
        n_agree = len(set(issues0).intersection(issues1))
        agreement_frac = n_agree / n_issues
        if verbose: print(f'"{outcome}" agreement {agreement_frac*100:.2f} %')
        res[f'{outcome}_frac'] = agreement_frac
        res[f'{outcome}_total'] = n_issues
    
    return res

In [36]:
members = df['Bezeichnung'].unique()
num_members = len(members)
members

array(['Stephan Albani', 'Katrin Albsteiger', 'Peter Altmaier', ...,
       'Dr. Hermann Ott', 'Dr. Dr. h.c. Bernd Fabritius',
       'Susanne Kickbusch'], dtype=object)

In [37]:
member0 = 'Peter Altmaier'
#member1 = 'Hubertus Heil (Peine)'
member1 = 'Dr. Angela Merkel'
assert member0 in members, f'{member0} not found'
assert member1 in members, f'{member1} not found'

In [38]:
%%time
get_agreements_painfully_slow(df_squished, member0, member1, verbose=True)

overall agreement 68.00 %
"ja" agreement 14.35 %
"nichtabgegeben" agreement 48.24 %
"nein" agreement 5.56 %
"Enthaltung" agreement 0.00 %
CPU times: user 203 ms, sys: 15.6 ms, total: 219 ms
Wall time: 203 ms


{'overall_frac': 0.68,
 'overall_total': 425,
 'member0': 'Peter Altmaier',
 'member1': 'Dr. Angela Merkel',
 'ja_frac': 0.14352941176470588,
 'ja_total': 425,
 'nichtabgegeben_frac': 0.4823529411764706,
 'nichtabgegeben_total': 425,
 'nein_frac': 0.05555555555555555,
 'nein_total': 414,
 'Enthaltung_frac': 0.0,
 'Enthaltung_total': 341}

In [39]:
df_squished.head()

Unnamed: 0,Bezeichnung,issue,vote
0,Stephan Albani,2015-07-02 Subventionen für britisches Atomwe...,ja
1,Katrin Albsteiger,2015-07-02 Subventionen für britisches Atomwe...,ja
2,Peter Altmaier,2015-07-02 Subventionen für britisches Atomwe...,nichtabgegeben
3,Artur Auernhammer,2015-07-02 Subventionen für britisches Atomwe...,ja
4,Dorothee Bär,2015-07-02 Subventionen für britisches Atomwe...,ja


General agreement
TODO: figure out how to do the relative ranking. one would need to count all the decisions which were the same as well as how many were different. the first part is a normal matrix product. the second part would be a matrix product with and "or" instead of the "and" condition of the normal matrix product. not sure how to do this yet

In [None]:
#export
def get_dummy(df:pd.DataFrame, mask:pd.Series):
    return (df.loc[mask]
            .assign(dummy=True)
            .pivot_table(index='Bezeichnung', columns='issue', values='dummy', fill_value=False)
            .astype(bool))

def scan_all_agreements(df:pd.DataFrame):
    outcomes = df['vote'].unique()
    agreements = {}
    for outcome in tqdm.tqdm(outcomes, desc='Outcome', total=len(outcomes)):
        mask = df['vote']==outcome
        tmp = get_dummy(df, mask=mask)
        members = tmp.index.values
        similarity = 1 - pairwise_distances(tmp.values, metric='jaccard')
        similarity = 100 * similarity
        agreements[outcome] = pd.DataFrame(similarity, columns=members, index=members)
    
    tmp = get_dummy(df, df['vote'].notna()).astype(float)
    members = tmp.index.values
    tmp = np.dot(tmp.values, tmp.values.T)
    agreements['total_shared_votes'] = pd.DataFrame(tmp, 
                                                    columns=members, 
                                                    index=members)
    
    return agreements

In [None]:
%%time
agreements = scan_all_agreements(df_squished)

In [None]:
assert all([v in agreements for v in df_squished['vote'].unique()])
assert 'total_shared_votes' in agreements
assert all([isinstance(v, pd.DataFrame) for v in agreements.values()])

In [None]:
agreements

Saving results

In [None]:
with open('../similarities.pkl', 'wb') as f:
    pickle.dump(agreements, f)