In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import gzip

# Loading voting data

In [2]:
# Folder the raw input files are read from (relative path).
DATA_FOLDER = "../data/"
# Folder the cleaned/derived files are written to and later re-read from.
GENERATED_FOLDER = "../generated/"

In [3]:
# Target dtype of every column of the raw vote files; applied with `.astype`
# once all yearly files have been concatenated.
# The `np.int` / `np.str` aliases were removed in NumPy 1.24 and `np.uint0`
# in NumPy 2.0, so explicit types are used instead. The per-councillor
# columns hold 0/1 flags, for which `np.uint8` is sufficient.
SOURCE_VOTE_DATA_DTYPES = {
    'AffairShortId': np.int64,
    'AffairTitle': str,
    'VoteRegistrationNumber': np.int64,
    'VoteDate': str,  # parsed into timestamps in a later step
    'VoteMeaningYes': str,
    'VoteMeaningNo': str,
    'DivisionText': str,
    'VoteSubmissionText': str,
    'VoteFilteredYes': np.uint8,
    'VoteFilteredNo': np.uint8,
    'VoteFilteredAbstain': np.uint8,
    'VoteFilteredNotParticipated': np.uint8,
    'VoteFilteredExcused': np.uint8,
    'VoteFilteredPresident': np.uint8,
    'CouncillorId': np.int64,
    'CouncillorName': str,
    'CouncillorYes': np.uint8,
    'CouncillorNo': np.uint8,
    'CouncillorAbstain': np.uint8,
    'CouncillorNotParticipated': np.uint8,
    'CouncillorExcused': np.uint8,
    'CouncillorPresident': np.uint8,
}

We need some custom functions because the source file is not properly formatted.

In [4]:
def read_file(name):
    """Yield the rows of ``DATA_FOLDER + name + ".csv.gz"`` as lists of strings.

    The file is not handed to a CSV parser: every line is split on the
    literal ``","`` separator, and surrounding quotes, the trailing newline
    and a byte-order mark are stripped from each field. The first yielded row
    is the file's first line (callers use it as the header).

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.
    """
    path = DATA_FOLDER + name + ".csv.gz"
    with gzip.open(path, 'rt', encoding='utf-8') as file:
        for line in file:
            fields = line.split('","')
            yield [field.strip(u'"\n\ufeff') for field in fields]

            
def load(name):
    """Load one raw vote file into a DataFrame whose cells are all strings.

    The first row produced by ``read_file`` becomes the column names, the
    remaining rows become the data.

    The rows are passed to pandas as plain lists. Going through ``np.array``
    first would allocate a fixed-width unicode matrix sized by the longest
    field of the whole file and does not cope with rows of unequal length.
    """
    rows = list(read_file(name))
    return pd.DataFrame(rows[1:], columns=rows[0])


def load_all(first_year=2007, last_year=2019):
    """Load and concatenate the yearly vote files into one typed DataFrame.

    Parameters
    ----------
    first_year, last_year : int
        Inclusive range of years; one file named after each year is loaded
        via ``load``. The defaults reproduce the original fixed range.

    Returns
    -------
    pandas.DataFrame
        All rows with a fresh 0..n-1 index and the column types given by
        ``SOURCE_VOTE_DATA_DTYPES``.
    """
    yearly_frames = [load(str(year)) for year in range(first_year, last_year + 1)]
    combined = pd.concat(yearly_frames, ignore_index=True, copy=False)
    return combined.astype(SOURCE_VOTE_DATA_DTYPES)

In [5]:
# Read every yearly raw file and combine them into one dataframe.
vote_data = load_all()

## Formatting Dates

In [6]:
# strptime pattern for the raw 'VoteDate' strings: abbreviated weekday,
# abbreviated month, day, year, time of day and a time zone name.
SOURCE_VOTE_DATA_DATE_FORMAT = '%a %b %d %Y %H:%M:%S %Z'

In [7]:
# Only the first 28 characters of each raw string are parsed.
# NOTE(review): whatever follows them (presumably a UTC offset and zone description) is discarded — confirm the resulting timestamps are not shifted.
vote_data['VoteDate'] = pd.to_datetime(vote_data['VoteDate'].str[:28], format=SOURCE_VOTE_DATA_DATE_FORMAT)

## Saving Dataframe

In [8]:
# Persist the typed vote data (compressed, without the row index).
vote_data.to_csv(GENERATED_FOLDER + 'vote_data.csv.gz', index=False)

## Reading Dataframe

In [9]:
# Column dtypes used when re-reading the generated vote file. 'VoteDate' is
# absent on purpose: it is listed in VOTE_DATA_DATE_COLUMNS and parsed as a date.
# The `np.int` / `np.str` aliases were removed in NumPy 1.24 and `np.uint0`
# in NumPy 2.0, so explicit types are used instead. The per-councillor
# columns hold 0/1 flags, for which `np.uint8` is sufficient.
VOTE_DATA_DTYPES = {
    'AffairShortId': np.int64,
    'AffairTitle': str,
    'VoteRegistrationNumber': np.int64,
    'VoteMeaningYes': str,
    'VoteMeaningNo': str,
    'DivisionText': str,
    'VoteSubmissionText': str,
    'VoteFilteredYes': np.uint8,
    'VoteFilteredNo': np.uint8,
    'VoteFilteredAbstain': np.uint8,
    'VoteFilteredNotParticipated': np.uint8,
    'VoteFilteredExcused': np.uint8,
    'VoteFilteredPresident': np.uint8,
    'CouncillorId': np.int64,
    'CouncillorName': str,
    'CouncillorYes': np.uint8,
    'CouncillorNo': np.uint8,
    'CouncillorAbstain': np.uint8,
    'CouncillorNotParticipated': np.uint8,
    'CouncillorExcused': np.uint8,
    'CouncillorPresident': np.uint8,
}

# Columns handed to `parse_dates` when reading the generated vote file.
VOTE_DATA_DATE_COLUMNS = ['VoteDate']

In [150]:
# Re-read the generated vote file. The timestamps were written in ISO-8601
# form by `to_csv`, which pandas' built-in date parsing handles directly; the
# per-value `date_parser` callback is deprecated since pandas 2.0 and is not
# needed (this also matches how the full-votes file is read).
vote_data = pd.read_csv(
    GENERATED_FOLDER + 'vote_data.csv.gz',
    dtype=VOTE_DATA_DTYPES,
    parse_dates=VOTE_DATA_DATE_COLUMNS,
)
vote_data.head()

Unnamed: 0,AffairShortId,AffairTitle,VoteRegistrationNumber,VoteDate,VoteMeaningYes,VoteMeaningNo,DivisionText,VoteSubmissionText,VoteFilteredYes,VoteFilteredNo,...,VoteFilteredExcused,VoteFilteredPresident,CouncillorId,CouncillorName,CouncillorYes,CouncillorNo,CouncillorAbstain,CouncillorNotParticipated,CouncillorExcused,CouncillorPresident
0,20070464,Prorogation de la loi fédérale sur l'adaptatio...,248,2007-12-21 09:50:38+00:00,,,Ja,Vote final,196,1,...,0,1,3923,Marra Ada,1,0,0,0,0,0
1,20070464,Prorogation de la loi fédérale sur l'adaptatio...,248,2007-12-21 09:50:38+00:00,,,Ja,Vote final,196,1,...,0,1,3883,Glauser-Zufferey Alice,1,0,0,0,0,0
2,20070464,Prorogation de la loi fédérale sur l'adaptatio...,248,2007-12-21 09:50:38+00:00,,,Ja,Vote final,196,1,...,0,1,3907,Thorens Goumaz Adèle,1,0,0,0,0,0
3,20070464,Prorogation de la loi fédérale sur l'adaptatio...,248,2007-12-21 09:50:38+00:00,,,Ja,Vote final,196,1,...,0,1,3913,Wyss Brigit,1,0,0,0,0,0
4,20070464,Prorogation de la loi fédérale sur l'adaptatio...,248,2007-12-21 09:50:38+00:00,,,Ja,Vote final,196,1,...,0,1,3878,Flückiger-Bäni Sylvia,1,0,0,0,0,0


# Loading Members Data

In [11]:
# Column dtypes requested when reading the members workbook.
# Builtin `bool` / `str` replace the `np.bool` / `np.str` aliases, which were
# removed in NumPy 1.24.
SOURCE_MEMBERS_DTYPES = {
    'Active': bool,
    'FirstName': str,
    'LastName': str,
    'GenderAsString': str,
    'CantonName': str,
    'CantonAbbreviation': str,
    'CouncilName': str,
    'ParlGroupName': str,
    'ParlGroupAbbreviation': str,
    'PartyName': str,
    'PartyAbbreviation': str,
    'MaritalStatusText': str,
    'BirthPlace_City': str,
    'BirthPlace_Canton': str,
    'Mandates': str,
    'Citizenship': str,
    'CouncillorName': str,
}

# Columns parsed as dates when reading the members workbook.
SOURCE_MEMBERS_DATE_COLUMNS = ['DateJoining', 'DateLeaving', 'DateOfBirth', 'DateOfDeath']
# `%m` is the month directive; the previous `%M` means *minutes* and would
# mis-parse any date given as text.
SOURCE_MEMBERS_DATE_FORMAT = "%m/%d/%Y"

In [12]:
# Read the members workbook with the requested dtypes; the listed date columns are parsed as dates.
source_members = pd.read_excel(DATA_FOLDER + 'Ratsmitglieder_1848_EN.xlsx', dtype=SOURCE_MEMBERS_DTYPES, parse_dates=SOURCE_MEMBERS_DATE_COLUMNS, date_format=SOURCE_MEMBERS_DATE_FORMAT)
# Reduce every date column to plain calendar dates (the time of day is dropped).
for column in SOURCE_MEMBERS_DATE_COLUMNS:
    source_members[column] = source_members[column].dt.date
source_members.head()

Unnamed: 0,Active,FirstName,LastName,GenderAsString,CantonName,CantonAbbreviation,CouncilName,ParlGroupName,ParlGroupAbbreviation,PartyName,PartyAbbreviation,MaritalStatusText,BirthPlace_City,BirthPlace_Canton,Mandates,DateJoining,DateLeaving,Citizenship,DateOfBirth,DateOfDeath
0,False,Giuseppe,a Marca,m,Grisons,GR,Conseil des Etats,Centre,MC,Conservateurs,Cons*,,,,,1849-12-01,1851-07-01,Soazza (GR),1799-07-29,1866-07-16
1,False,Alois,Ab Yberg,m,Schwyz,SZ,Conseil national,Groupe radical-démocratique,R,Parti radical-démocratique suisse,PRD,,,,,1928-12-03,1935-12-01,Schwyz (SZ),1878-10-06,1959-10-17
2,False,Fabio,Abate,m,Tessin,TI,Conseil national,Groupe radical-libéral,RL,Parti radical-démocratique suisse,PRD,,Locarno,Tessin,Incarichi esecutivi presso il comune di Locarn...,2007-12-03,2011-12-04,Cabbio (TI),1966-01-04,NaT
3,False,Fabio,Abate,m,Tessin,TI,Conseil national,Groupe radical-libéral,R,Parti radical-démocratique suisse,PRD,,Locarno,Tessin,Incarichi esecutivi presso il comune di Locarn...,2000-09-25,2003-11-30,Cabbio (TI),1966-01-04,NaT
4,False,Fabio,Abate,m,Tessin,TI,Conseil des Etats,Groupe libéral-radical,RL,PLR.Les Libéraux-Radicaux,PLR,,Locarno,Tessin,Incarichi esecutivi presso il comune di Locarn...,2011-12-05,2015-11-29,Cabbio (TI),1966-01-04,NaT


## Removing Entries

We want to remove all councillors that did not participate in any votes in the votes dataframe. We only want councillors in the national council and members that were active during the period in question. First we check whether there is any data missing:

In [13]:
# Per column: does it contain at least one missing value?
source_members.isna().any()

Active                   False
FirstName                False
LastName                 False
GenderAsString           False
CantonName                True
CantonAbbreviation        True
CouncilName               True
ParlGroupName             True
ParlGroupAbbreviation     True
PartyName                 True
PartyAbbreviation         True
MaritalStatusText         True
BirthPlace_City           True
BirthPlace_Canton         True
Mandates                  True
DateJoining              False
DateLeaving               True
Citizenship               True
DateOfBirth              False
DateOfDeath               True
dtype: bool

### Removing By Date

We have all the data for when the people joined but some data is missing in the 'DateLeaving' column. It turns out that all entries with missing 'DateLeaving' values have a 'DateJoining' value after 2007. So we want to keep them. See next cell for analysis:

In [14]:
# Earliest 'DateJoining' among the entries that have no 'DateLeaving'.
source_members.loc[source_members['DateLeaving'].isna(), 'DateJoining'].min()

datetime.date(2015, 11, 30)

We can now discard all entries for councillors which left before the first vote in the other dataset:

In [15]:
# Calendar date of the earliest vote in the vote data.
first_vote_date = vote_data['VoteDate'].dt.date.min()
leaving_dates = source_members['DateLeaving']

# Entries without a leaving date, and entries whose leaving date is later
# than the first vote.
did_not_leave = leaving_dates.isna()
left_after_first_vote = leaving_dates > first_vote_date

In [16]:
# Keep entries that either have no leaving date or ended after the first vote; copy so later edits do not touch `source_members`.
members = source_members[did_not_leave | left_after_first_vote].copy()

### Checking Whether Data Is Missing In Members

In [17]:
def full_names(df):
    """Return a Series of "<LastName> <FirstName>" strings, one per row of `df`.

    `df` must provide the columns 'LastName' and 'FirstName'; the result
    keeps the index of `df`.
    """
    last = df['LastName']
    first = df['FirstName']
    return last + " " + first

In [18]:
# Names that occur in the vote data but have no "LastName FirstName" match in `members`.
missing_names = set(vote_data['CouncillorName']) - set(full_names(members))
missing_names

{'Bernasconi Maria', 'Bignasca Giuliano', 'Imfeld Adriano'}

Three people are missing in the members dataset that are present in the votes dataframe.

- The first one ('Bernasconi Maria') is listed in the members dataset under a double family name ('Roth-Bernasconi'), see https://de.wikipedia.org/wiki/Maria_Roth-Bernasconi
- The third one is called 'Adrian' instead of 'Adriano' in the members dataset.
- The second one ('Bignasca Giuliano') participated in votes in 2007 but was only in the national council until 2003 (according to the members dataframe and Wikipedia). It is not clear whether the error is in the members dataframe or the votes one. We decided to add another entry for him in the members dataframe that covers the period between his votes in 2007.


### Renaming Bernasconi Maria and Imfeld Adriano

In [19]:
# Align the vote data with the members table: use the double family name there ...
vote_data.loc[vote_data['CouncillorName'] == 'Bernasconi Maria', 'CouncillorName'] = 'Roth-Bernasconi Maria'
# ... and align the members table with the vote data for the first name 'Adriano'.
members.loc[full_names(members) == 'Imfeld Adrian', 'FirstName'] = 'Adriano'

Only 'Bignasca Giuliano' should be missing now:

In [20]:
# Recompute the unmatched names after the two renames.
missing_names = set(vote_data['CouncillorName']) - set(full_names(members))
missing_names

{'Bignasca Giuliano'}

### Adding Bignasca Giuliano

Sanity check that there are no additional/missing entries for the votes that Bignasca Giuliano took part in. For each affair we count how many entries there are in the dataframe. With one entry per councillor the count should be divisible by 200; the cell below only verifies the weaker condition that every count is even.

In [21]:
# Affair ids of all rows recorded for Bignasca Giuliano.
votes = vote_data.loc[vote_data['CouncillorName'] == 'Bignasca Giuliano', 'AffairShortId']
# Number of rows (non-null 'VoteDate' values) per one of those affairs.
vote_counts = vote_data.loc[vote_data['AffairShortId'].isin(votes.unique()), ['AffairShortId', 'VoteDate']].groupby('AffairShortId').count()
# True if any of those affairs has an odd number of rows.
# NOTE(review): this only tests that each count is even, which is weaker than divisibility by 200 — confirm this is intended.
(vote_counts % 2 != 0).any().values[0]

False

Adding information about Bignasca Giuliano to the members dataframe for the votes he participated in:

In [22]:
# One row per affair Bignasca voted on; only its date range is used below.
bignasca_vote_data = vote_data[vote_data['CouncillorName'] == 'Bignasca Giuliano'].drop_duplicates('AffairShortId')
# Template row: his entry with the latest 'DateLeaving' in the unfiltered members table.
bignasca = source_members[(source_members['FirstName'] == 'Giuliano') & (source_members['LastName'] == 'Bignasca')].sort_values('DateLeaving', ascending=False).iloc[0].copy()
# Make the synthetic membership period span exactly his recorded votes.
bignasca['DateJoining'] = bignasca_vote_data['VoteDate'].dt.date.min()
bignasca['DateLeaving'] = bignasca_vote_data['VoteDate'].dt.date.max()
# `DataFrame.append` was removed in pandas 2.0. Appending a Series is
# equivalent to concatenating it as a one-row frame; `ignore_index=True`
# renumbers the rows like the former `reset_index(drop=True)`.
members = pd.concat([members, bignasca.to_frame().T.infer_objects()], ignore_index=True)

### Removing Members From Other Councils

In [23]:
# Restrict to entries whose council is the national council ('Conseil national') ...
national_council_members = members[members['CouncilName'] == 'Conseil national']
# ... and list the voters that no longer have a matching entry.
missing = set(vote_data['CouncillorName']) - set(full_names(national_council_members))
missing

{'Bruderer Wyss Pascale', 'Diener Lenz Verena'}

Ok, some more people went missing.

- 'Bruderer Wyss Pascale' was called 'Bruderer Pascale' while in the national council but is called 'Bruderer Wyss Pascale' in the council of states.
- 'Diener Lenz Verena' was called 'Diener Verena' while in the national council but is called 'Diener Lenz Verena' in the council of states.

## Fixing Bruderer Wyss Pascale and Diener Lenz Verena

Just update the family names

In [24]:
# Use the longer family names in every entry of these two councillors so that they match the vote data.
members.loc[(members['FirstName'] == 'Pascale') & (members['LastName'] == 'Bruderer'), 'LastName'] = 'Bruderer Wyss'
members.loc[(members['FirstName'] == 'Verena') & (members['LastName'] == 'Diener'), 'LastName'] = 'Diener Lenz'

There should be no missing entries anymore

In [25]:
# Rebuild the national-council subset from the corrected members table and drop fully identical rows.
national_council_members = members[members['CouncilName'] == 'Conseil national'].reset_index(drop=True).drop_duplicates()
# Voters without a matching national-council entry.
missing = set(vote_data['CouncillorName']) - set(full_names(national_council_members))
missing

set()

## Saving Members

In [26]:
# Persist the filtered and corrected members table (without the row index).
members.to_csv(GENERATED_FOLDER + 'members.csv', index=False)

## Saving National Council Members

In [27]:
# Persist the national-council subset (without the row index).
national_council_members.to_csv(GENERATED_FOLDER + 'national_council_members.csv', index=False)

## Reading Members

In [28]:
# Column dtypes for the generated members files.
# Builtin `bool` / `str` replace the `np.bool` / `np.str` aliases, which were
# removed in NumPy 1.24.
MEMBERS_DTYPES = {
    'Active': bool,
    'FirstName': str,
    'LastName': str,
    'GenderAsString': str,
    'CantonName': str,
    'CantonAbbreviation': str,
    'CouncilName': str,
    'ParlGroupName': str,
    'ParlGroupAbbreviation': str,
    'PartyName': str,
    'PartyAbbreviation': str,
    'MaritalStatusText': str,
    'BirthPlace_City': str,
    'BirthPlace_Canton': str,
    'Mandates': str,
    'Citizenship': str,
    'CouncillorName': str,
}

# Columns parsed as dates when reading the generated members files.
MEMBERS_DATE_COLUMNS = ['DateJoining', 'DateLeaving', 'DateOfBirth', 'DateOfDeath']

In [29]:
def read_members_like(name):
    """Read a generated members-style CSV file from ``GENERATED_FOLDER``.

    Parameters
    ----------
    name : str
        File name inside ``GENERATED_FOLDER`` (e.g. 'members.csv').

    Returns
    -------
    pandas.DataFrame
        The file's content with the columns typed according to
        ``MEMBERS_DTYPES`` and every column of ``MEMBERS_DATE_COLUMNS``
        reduced to plain calendar dates.
    """
    # MEMBERS_DTYPES was previously defined but never applied, so text columns
    # were left to type inference; pass it like the other readers do.
    df = pd.read_csv(GENERATED_FOLDER + name, dtype=MEMBERS_DTYPES, parse_dates=MEMBERS_DATE_COLUMNS)
    for column in MEMBERS_DATE_COLUMNS:
        df[column] = df[column].dt.date
    return df

In [30]:
# Reload the members table from the generated file.
members = read_members_like('members.csv')
members.head()

Unnamed: 0,Active,FirstName,LastName,GenderAsString,CantonName,CantonAbbreviation,CouncilName,ParlGroupName,ParlGroupAbbreviation,PartyName,PartyAbbreviation,MaritalStatusText,BirthPlace_City,BirthPlace_Canton,Mandates,DateJoining,DateLeaving,Citizenship,DateOfBirth,DateOfDeath
0,False,Fabio,Abate,m,Tessin,TI,Conseil national,Groupe radical-libéral,RL,Parti radical-démocratique suisse,PRD,,Locarno,Tessin,Incarichi esecutivi presso il comune di Locarn...,2007-12-03,2011-12-04,Cabbio (TI),1966-01-04,NaT
1,False,Fabio,Abate,m,Tessin,TI,Conseil des Etats,Groupe libéral-radical,RL,PLR.Les Libéraux-Radicaux,PLR,,Locarno,Tessin,Incarichi esecutivi presso il comune di Locarn...,2011-12-05,2015-11-29,Cabbio (TI),1966-01-04,NaT
2,True,Fabio,Abate,m,Tessin,TI,Conseil des Etats,Groupe libéral-radical,RL,PLR.Les Libéraux-Radicaux,PLR,,Locarno,Tessin,Incarichi esecutivi presso il comune di Locarn...,2015-11-30,2019-12-01,Cabbio (TI),1966-01-04,NaT
3,False,Fabio,Abate,m,Tessin,TI,Conseil national,Groupe radical-libéral,R,Parti radical-démocratique suisse,PRD,,Locarno,Tessin,Incarichi esecutivi presso il comune di Locarn...,2003-12-01,2007-12-02,Cabbio (TI),1966-01-04,NaT
4,True,Jean-Luc,Addor,m,Valais,VS,Conseil national,Groupe de l'Union démocratique du Centre,V,Union Démocratique du Centre,UDC,,Lausanne,Vaud,Député au Grand Conseil: 2005-2015; Conseiller...,2015-11-30,2019-12-01,"Ste-Croix (VD),Savièse (VS)",1964-04-22,NaT


## Reading National Council Members

In [31]:
# Reload the national-council subset from the generated file.
national_council_members = read_members_like('national_council_members.csv')
national_council_members.head()

Unnamed: 0,Active,FirstName,LastName,GenderAsString,CantonName,CantonAbbreviation,CouncilName,ParlGroupName,ParlGroupAbbreviation,PartyName,PartyAbbreviation,MaritalStatusText,BirthPlace_City,BirthPlace_Canton,Mandates,DateJoining,DateLeaving,Citizenship,DateOfBirth,DateOfDeath
0,False,Fabio,Abate,m,Tessin,TI,Conseil national,Groupe radical-libéral,RL,Parti radical-démocratique suisse,PRD,,Locarno,Tessin,Incarichi esecutivi presso il comune di Locarn...,2007-12-03,2011-12-04,Cabbio (TI),1966-01-04,NaT
1,False,Fabio,Abate,m,Tessin,TI,Conseil national,Groupe radical-libéral,R,Parti radical-démocratique suisse,PRD,,Locarno,Tessin,Incarichi esecutivi presso il comune di Locarn...,2003-12-01,2007-12-02,Cabbio (TI),1966-01-04,NaT
2,True,Jean-Luc,Addor,m,Valais,VS,Conseil national,Groupe de l'Union démocratique du Centre,V,Union Démocratique du Centre,UDC,,Lausanne,Vaud,Député au Grand Conseil: 2005-2015; Conseiller...,2015-11-30,2019-12-01,"Ste-Croix (VD),Savièse (VS)",1964-04-22,NaT
3,False,Jean-Luc,Addor,m,Valais,VS,Conseil national,,,Union Démocratique du Centre,UDC,,Lausanne,Vaud,Député au Grand Conseil: 2005-2015; Conseiller...,2019-12-02,NaT,"Ste-Croix (VD),Savièse (VS)",1964-04-22,NaT
4,False,Andreas,Aebi,m,Berne,BE,Conseil national,Groupe de l'Union démocratique du Centre,V,Union Démocratique du Centre,UDC,,Burgdorf,Berne,Legislative der Gemeinde: seit Januar 2000; Ge...,2007-12-03,2011-12-04,Wynigen (BE),1958-11-26,NaT


# Joining The Datasets

In [32]:
# Attach the member attributes to every vote row by matching 'CouncillorName' against "LastName FirstName".
# A councillor with several member entries yields one joined row per entry (all sharing the vote row's index label).
full_votes = vote_data.join(national_council_members.set_index(full_names(national_council_members)), on='CouncillorName')
# Keep only the joined rows whose vote date lies within that entry's joining/leaving dates.
# NOTE(review): entries with a missing 'DateLeaving' presumably fail the `<=` test and are dropped here — confirm this is intended.
full_votes = full_votes[(full_votes['VoteDate'] >= full_votes['DateJoining']) & (full_votes['VoteDate'] <= full_votes['DateLeaving'])]

Check if we found information for all votes

In [33]:
# True when both frames contain exactly the same index labels.
# `Index ^ Index` as a set operation is deprecated (and element-wise XOR in
# pandas 2.x); `symmetric_difference` is the explicit equivalent.
len(full_votes.index.symmetric_difference(vote_data.index)) == 0

False

Some votes did not match any active period of the respective council members. To fix this we just copy the information from a different active period of the councillors in question. First we check whether the information of each member stays the same across all their entries in the members dataframe:

In [34]:
# find missing council members: index labels present in only one of the two frames
# (`Index ^ Index` as a set operation is deprecated; `symmetric_difference` is the explicit equivalent)
unmatched_index = vote_data.index.symmetric_difference(full_votes.index)
missing_names = vote_data.loc[unmatched_index]['CouncillorName'].values

# only select columns that we care about (the period-specific ones are excluded)
columns = national_council_members.columns.drop(['DateJoining', 'DateLeaving', 'Active'])

# select all members and group their respective entries for each member
grouped_missing = national_council_members.loc[full_names(national_council_members).isin(missing_names), columns].groupby(['LastName', 'FirstName'])

# check if all entries are the same for each member: no column has more than one distinct value within a group
grouped_missing.nunique().max().max() == 1

True

Luckily this is true. Because all entries are the same (per member) we can just use the first entry as a template and add duplicates of it to the dataframe:

In [35]:
# get one entry per missing member (first non-null value of every column)
new_entries = national_council_members[full_names(national_council_members).isin(missing_names)].groupby(['FirstName', 'LastName']).first().reset_index()

# join with the vote_data dataframe, restricted to the vote rows that found no matching period
# (`Index ^ Index` as a set operation is deprecated; `symmetric_difference` is the explicit equivalent)
unmatched_index = vote_data.index.symmetric_difference(full_votes.index)
new_entries = vote_data.loc[unmatched_index].join(new_entries.set_index(full_names(new_entries)), on='CouncillorName')

# create fake 'DateJoining' and 'DateLeaving' values (both set to the day of the vote)
new_entries['DateJoining'] = new_entries['VoteDate'].dt.date
new_entries['DateLeaving'] = new_entries['VoteDate'].dt.date

# add entries to dataframe (`DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement)
full_votes = pd.concat([full_votes, new_entries], sort=True)

Test whether all votes are present in both dataframes

In [36]:
# True when both frames contain exactly the same index labels.
# `Index ^ Index` as a set operation is deprecated (and element-wise XOR in
# pandas 2.x); `symmetric_difference` is the explicit equivalent.
len(full_votes.index.symmetric_difference(vote_data.index)) == 0

True

## Saving Full Votes

In [37]:
# Persist the joined votes + member attributes (compressed, without the row index).
full_votes.to_csv(GENERATED_FOLDER + 'full_votes.csv.gz', index=False)

## Reading Full Votes

In [38]:
# Column dtypes used when re-reading the generated full-votes file; the date
# columns are listed separately in FULL_VOTES_DATE_COLUMNS.
# Builtin `bool` / `str` replace the `np.bool` / `np.str` aliases (removed in
# NumPy 1.24), and `np.uint8` replaces `np.uint0` (removed in NumPy 2.0) for
# the 0/1 per-councillor flag columns.
FULL_VOTES_DTYPES = {
    'Active': bool,
    'AffairShortId': np.int64,
    'AffairTitle': str,
    'BirthPlace_Canton': str,
    'BirthPlace_City': str,
    'CantonAbbreviation': str,
    'CantonName': str,
    'Citizenship': str,
    'CouncilName': str,
    'CouncillorAbstain': np.uint8,
    'CouncillorExcused': np.uint8,
    'CouncillorId': np.int64,
    'CouncillorName': str,
    'CouncillorNo': np.uint8,
    'CouncillorNotParticipated': np.uint8,
    'CouncillorPresident': np.uint8,
    'CouncillorYes': np.uint8,
    'DivisionText': str,
    'FirstName': str,
    'GenderAsString': str,
    'LastName': str,
    'Mandates': str,
    'MaritalStatusText': str,
    'ParlGroupAbbreviation': str,
    'ParlGroupName': str,
    'PartyAbbreviation': str,
    'PartyName': str,
    'VoteFilteredAbstain': np.uint8,
    'VoteFilteredExcused': np.uint8,
    'VoteFilteredNo': np.uint8,
    'VoteFilteredNotParticipated': np.uint8,
    'VoteFilteredPresident': np.uint8,
    'VoteFilteredYes': np.uint8,
    'VoteMeaningNo': str,
    'VoteMeaningYes': str,
    'VoteRegistrationNumber': np.int64,
    'VoteSubmissionText': str,
}

# Columns handed to `parse_dates` when reading the full-votes file.
FULL_VOTES_DATE_COLUMNS = ['VoteDate', 'DateJoining', 'DateLeaving', 'DateOfBirth', 'DateOfDeath']

In [39]:
def read_full_votes(file_name='full_votes.csv.gz'):
    """Read a generated full-votes file from ``GENERATED_FOLDER``.

    Columns are typed according to ``FULL_VOTES_DTYPES`` and the columns in
    ``FULL_VOTES_DATE_COLUMNS`` are parsed as dates. 'VoteDate' keeps its
    full timestamp; every other date column is reduced to a calendar date.
    """
    frame = pd.read_csv(
        GENERATED_FOLDER + file_name,
        dtype=FULL_VOTES_DTYPES,
        parse_dates=FULL_VOTES_DATE_COLUMNS,
    )
    for name in FULL_VOTES_DATE_COLUMNS:
        if name == 'VoteDate':
            continue
        frame[name] = frame[name].dt.date
    return frame

In [339]:
# Reload the joined votes from the generated file.
full_votes = read_full_votes()

# Creating 'Votes' Dataframe
The votes dataframe should have the VoteId as index and a column for each councillor with the values being the vote from the respective councillor. Votes have 6 possible values:

- 'Yes'
- 'No'
- 'Abstain'
- 'NotParticipated'
- 'Excused'
- 'President'

We use integer ids to encode these values as:

- 'No': 0
- 'Yes': 1
- 'Abstain': 2
- 'NotParticipated': 3
- 'Excused': 4
- 'President': 5

Missing values (the councillor was not in the national council when the vote happened) are encoded as -1.

Test to make sure that no data is missing and that our assumption holds that exactly one of the possible values is present in each row:

In [340]:
# The six per-councillor flag columns, one per possible vote value.
columns = ['CouncillorYes', 'CouncillorNo', 'CouncillorAbstain', 'CouncillorNotParticipated', 'CouncillorExcused', 'CouncillorPresident']
# Number of flags set in each row.
row_wise_sum = full_votes.loc[:, columns].sum(axis=1)
# Smallest and largest number of flags set in any row.
row_wise_sum.min(), row_wise_sum.max()

(1, 1)

We see that max=1 and min=1, so there is exactly one value true in the columns.

Let's create a temporary dataframe that holds the votes encoded as integers.

In [341]:
columns = [
    'AffairShortId', 'CouncillorId',
    'CouncillorYes', 'CouncillorNo', 'CouncillorAbstain',
    'CouncillorNotParticipated', 'CouncillorExcused', 'CouncillorPresident',
]

# Newest rows first, so that the first row kept per (affair, councillor)
# pair is the one with the latest 'VoteDate'.
temp = (
    full_votes
    .sort_values('VoteDate', ascending=False)[columns]
    .drop_duplicates(['AffairShortId', 'CouncillorId'])
    .copy()
)

# Translate the flag columns into one integer code; -1 remains where no flag is set.
temp['VoteMeaning'] = -1
flag_codes = [
    ('CouncillorNo', 0),
    ('CouncillorYes', 1),
    ('CouncillorAbstain', 2),
    ('CouncillorNotParticipated', 3),
    ('CouncillorExcused', 4),
    ('CouncillorPresident', 5),
]
for flag_column, code in flag_codes:
    temp.loc[temp[flag_column] == 1, 'VoteMeaning'] = code

temp = temp[['AffairShortId', 'CouncillorId', 'VoteMeaning']].copy()
temp.head()

Unnamed: 0,AffairShortId,CouncillorId,VoteMeaning
2516805,20190024,3923,1
2516941,20190024,4158,0
2516931,20190024,4154,0
2516932,20190024,4180,1
2516933,20190024,4155,0


In [342]:
# Pivot to one row per affair and one column per councillor (first value of each pair).
votes = temp.groupby(['AffairShortId', 'CouncillorId']).aggregate('first').unstack()
# Keep only the second column level (the councillor ids) as column labels.
votes.columns = votes.columns.get_level_values(1)
# Pairs without a row become -1; then cast back to integers.
votes = votes.fillna(-1).astype(int)
# The long-format helper frame is no longer needed.
del temp
votes.head()

CouncillorId,15,21,26,28,34,61,70,74,76,91,...,4224,4225,4226,4227,4228,4229,4230,4232,4234,4236
AffairShortId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,3,1,2,-1,-1,-1,0,-1,1,...,0,0,3,0,0,1,0,0,1,1
2,-1,-1,-1,-1,-1,-1,-1,-1,-1,3,...,-1,3,-1,-1,-1,-1,-1,-1,-1,-1
20000421,1,-1,1,1,1,-1,1,-1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
20000431,1,-1,0,0,3,-1,1,-1,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
20000436,1,-1,1,1,-1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


## Saving Votes

In [343]:
# Persist the vote matrix; the index (affair ids) is written as well.
votes.to_csv(GENERATED_FOLDER + 'votes.csv.gz')

In [344]:
# Preview of the vote matrix.
votes.head()

CouncillorId,15,21,26,28,34,61,70,74,76,91,...,4224,4225,4226,4227,4228,4229,4230,4232,4234,4236
AffairShortId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,3,1,2,-1,-1,-1,0,-1,1,...,0,0,3,0,0,1,0,0,1,1
2,-1,-1,-1,-1,-1,-1,-1,-1,-1,3,...,-1,3,-1,-1,-1,-1,-1,-1,-1,-1
20000421,1,-1,1,1,1,-1,1,-1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
20000431,1,-1,0,0,3,-1,1,-1,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
20000436,1,-1,1,1,-1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [55]:
# Preview of the joined votes.
full_votes.head()

Unnamed: 0,Active,AffairShortId,AffairTitle,BirthPlace_Canton,BirthPlace_City,CantonAbbreviation,CantonName,Citizenship,CouncilName,CouncillorAbstain,...,VoteFilteredAbstain,VoteFilteredExcused,VoteFilteredNo,VoteFilteredNotParticipated,VoteFilteredPresident,VoteFilteredYes,VoteMeaningNo,VoteMeaningYes,VoteRegistrationNumber,VoteSubmissionText
0,False,20070464,Prorogation de la loi fédérale sur l'adaptatio...,Vaud,Lausanne,VD,Vaud,Paudex (VD),Conseil national,0,...,1,0,1,1,1,196,,,248,Vote final
1,False,20070464,Prorogation de la loi fédérale sur l'adaptatio...,Valais,Sierre,VD,Vaud,Champvent (VD),Conseil national,0,...,1,0,1,1,1,196,,,248,Vote final
2,False,20070464,Prorogation de la loi fédérale sur l'adaptatio...,Soleure,Soleure,VD,Vaud,"Ste-Croix (VD),Villarzel (VD)",Conseil national,0,...,1,0,1,1,1,196,,,248,Vote final
3,False,20070464,Prorogation de la loi fédérale sur l'adaptatio...,Soleure,Lüsslingen,SO,Soleure,Lüsslingen (SO),Conseil national,0,...,1,0,1,1,1,196,,,248,Vote final
4,False,20070464,Prorogation de la loi fédérale sur l'adaptatio...,Argovie,Aarau,AG,Argovie,"Schöftland (AG),Rüegsau (BE)",Conseil national,0,...,1,0,1,1,1,196,,,248,Vote final


In [59]:
# All column names of the joined votes.
full_votes.columns

Index(['Active', 'AffairShortId', 'AffairTitle', 'BirthPlace_Canton',
       'BirthPlace_City', 'CantonAbbreviation', 'CantonName', 'Citizenship',
       'CouncilName', 'CouncillorAbstain', 'CouncillorExcused', 'CouncillorId',
       'CouncillorName', 'CouncillorNo', 'CouncillorNotParticipated',
       'CouncillorPresident', 'CouncillorYes', 'DateJoining', 'DateLeaving',
       'DateOfBirth', 'DateOfDeath', 'DivisionText', 'FirstName',
       'GenderAsString', 'LastName', 'Mandates', 'MaritalStatusText',
       'ParlGroupAbbreviation', 'ParlGroupName', 'PartyAbbreviation',
       'PartyName', 'VoteDate', 'VoteFilteredAbstain', 'VoteFilteredExcused',
       'VoteFilteredNo', 'VoteFilteredNotParticipated',
       'VoteFilteredPresident', 'VoteFilteredYes', 'VoteMeaningNo',
       'VoteMeaningYes', 'VoteRegistrationNumber', 'VoteSubmissionText'],
      dtype='object')

In [60]:
# Distinct 'DivisionText' values, each shown at its first occurrence.
full_votes['DivisionText'].drop_duplicates()

0                                               Ja
101                         Hat nicht teilgenommen
140                                     Enthaltung
146                                           Nein
166     Die Präsidentin/der Präsident stimmt nicht
2811            Entschuldigt gemäss Art. 57 Abs. 4
Name: DivisionText, dtype: object

In [92]:
# Rows per affair, taken as the non-null count of the first non-key column.
counted = vote_data.groupby('AffairShortId').count().iloc[:, 0]

AffairShortId
1           4196
2             74
20000421     200
20000431    1000
20000436     200
            ... 
20193531     199
20193541     199
20193667     199
20193955     199
20193956     199
Name: AffairTitle, Length: 5037, dtype: int64

In [104]:
# Affairs whose row count is less than 10 away from a multiple of 200.
rows = ((counted % 200) < 10) | ((counted % 200) > 190)
rows.describe()

count     5037
unique       2
top       True
freq      5017
Name: AffairTitle, dtype: object

In [227]:
import collections
# How often each distinct row (as a tuple of its values) has been seen so far.
counter = collections.Counter()

def count(row):
    """Return the running occurrence number (1-based) of `row`'s values.

    The row's values themselves are used as the counter key. Keying on
    ``hash(...)`` of the tuple would merge distinct rows whose hashes happen
    to collide (e.g. ``hash(-1) == hash(-2)`` in CPython).
    """
    key = tuple(row.values)
    counter[key] += 1
    return counter[key]

# Per row: how many times this (affair, councillor) combination has occurred up to and including the row, as text.
ids = vote_data[['AffairShortId', 'CouncillorName']].apply(count, axis=1).astype(str)

In [228]:
# Composite key "<AffairShortId>-<occurrence number>" for every row.
affair_ids = vote_data['AffairShortId'].astype(str) + "-" + ids
unique_affair_ids = affair_ids.drop_duplicates().values
# Lookup table assigning consecutive integers (in order of first appearance) to the composite keys.
id_df = pd.DataFrame({'Id': np.arange(len(unique_affair_ids))}, index=unique_affair_ids)

In [232]:
# Add the integer 'Id' to every vote row by looking up its composite key.
joined = vote_data.join(id_df, on=affair_ids)

In [272]:
# Rows per councillor within the affair whose 'AffairShortId' is 2.
c = joined[joined['AffairShortId'] == 2].groupby('CouncillorName').count()['AffairShortId']

In [278]:
# Non-null counts of every column per (affair, vote registration number) pair.
c = vote_data.groupby(['AffairShortId', 'VoteRegistrationNumber']).count()

In [299]:
# Group the vote rows by (affair, vote registration number) and count non-null values per column.
g = vote_data.groupby(['AffairShortId', 'VoteRegistrationNumber'])
c = g.count()

In [305]:
# (AffairShortId, VoteRegistrationNumber) pairs that have more than 200 rows.
oversized_votes = c.index[c.iloc[:, 0] > 200]
affairs = oversized_votes.get_level_values(0)
# Select the rows by matching the *pair*. Testing the affair id and the
# registration number independently would also select any combination of an
# id from one oversized vote with a number from another. Using a dedicated
# name also avoids rebinding `votes`, which holds the vote matrix built earlier.
vote_keys = pd.MultiIndex.from_frame(vote_data[['AffairShortId', 'VoteRegistrationNumber']])
vote_data[vote_keys.isin(oversized_votes)]

Unnamed: 0,AffairShortId,AffairTitle,VoteRegistrationNumber,VoteDate,VoteMeaningYes,VoteMeaningNo,DivisionText,VoteSubmissionText,VoteFilteredYes,VoteFilteredNo,...,VoteFilteredExcused,VoteFilteredPresident,CouncillorId,CouncillorName,CouncillorYes,CouncillorNo,CouncillorAbstain,CouncillorNotParticipated,CouncillorExcused,CouncillorPresident
2999,20073401,Baisse d'impôt pour tout le monde,4802,2007-10-01 15:38:22+00:00,Zustimmung zur Motion,Ablehnung der Motion,Nein,,54,125,...,3,1,3833,Aubert Josiane,0,1,0,0,0,0
3000,20073401,Baisse d'impôt pour tout le monde,4802,2007-10-01 15:38:22+00:00,Zustimmung zur Motion,Ablehnung der Motion,Nein,,54,125,...,3,1,3833,Aubert Josiane,0,1,0,0,0,0
3001,20073401,Baisse d'impôt pour tout le monde,4802,2007-10-01 15:38:22+00:00,Zustimmung zur Motion,Ablehnung der Motion,Nein,,54,125,...,3,1,3830,Carobbio Guscetti Marina,0,1,0,0,0,0
3002,20073401,Baisse d'impôt pour tout le monde,4802,2007-10-01 15:38:22+00:00,Zustimmung zur Motion,Ablehnung der Motion,Nein,,54,125,...,3,1,3830,Carobbio Guscetti Marina,0,1,0,0,0,0
3003,20073401,Baisse d'impôt pour tout le monde,4802,2007-10-01 15:38:22+00:00,Zustimmung zur Motion,Ablehnung der Motion,Nein,,54,125,...,3,1,1346,Moret Isabelle,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116394,20020418,Indemnisation pour nuisances sonores dues au t...,4837,2007-10-01 19:06:37+00:00,Antrag der Kommission (Eintreten),Antrag des Bundesrates (=Nichteintreten),Ja,Eintreten,101,70,...,4,1,172,Rechsteiner Paul,1,0,0,0,0,0
116395,20020418,Indemnisation pour nuisances sonores dues au t...,4837,2007-10-01 19:06:37+00:00,Antrag der Kommission (Eintreten),Antrag des Bundesrates (=Nichteintreten),Hat nicht teilgenommen,Eintreten,101,70,...,4,1,70,Eggly Jacques-Simon,0,0,0,1,0,0
116396,20020418,Indemnisation pour nuisances sonores dues au t...,4837,2007-10-01 19:06:37+00:00,Antrag der Kommission (Eintreten),Antrag des Bundesrates (=Nichteintreten),Hat nicht teilgenommen,Eintreten,101,70,...,4,1,70,Eggly Jacques-Simon,0,0,0,1,0,0
116397,20020418,Indemnisation pour nuisances sonores dues au t...,4837,2007-10-01 19:06:37+00:00,Antrag der Kommission (Eintreten),Antrag des Bundesrates (=Nichteintreten),Ja,Eintreten,101,70,...,4,1,326,Günter Paul,1,0,0,0,0,0


In [313]:
# Same grouping as before, after removing fully identical rows.
gg = vote_data.drop_duplicates().groupby(['AffairShortId', 'VoteRegistrationNumber'])

In [314]:
# Non-null counts per (affair, vote registration number) pair on the de-duplicated rows.
cc = gg.count()

In [311]:
# Non-null counts per (affair, vote registration number, councillor) on the de-duplicated rows.
# NOTE: `gg` now holds a dataframe of counts rather than a groupby object.
gg = vote_data.drop_duplicates().groupby(['AffairShortId', 'VoteRegistrationNumber', 'CouncillorName']).count()