In [2]:
# libraries
import numpy as np
import pandas as pd
import scipy 
from sklearn.ensemble import RandomForestClassifier

In [None]:
# likely need to do some kind of standardization
# likely need to set categories
# put dataframes together
# 

In [93]:
# Load the merged CSV for each general election (38 through 44), one frame per
# election. The paths are listed in the same order as the target names.
(
    election_38_df,
    election_39_df,
    election_40_df,
    election_41_df,
    election_42_df,
    election_43_df,
    election_44_df,
) = map(
    pd.read_csv,
    (
        "data/merged/38_general_election.csv",
        "data/merged/39_general_election.csv",
        "data/merged/40_general_election.csv",
        "data/merged/41_general_election.csv",
        "data/merged/42_general_election.csv",
        "data/merged/43_general_election.csv",
        "data/merged/44_general_election.csv",
    ),
)

In [None]:
# create a "metadata" df to help out

# REFS:
# https://laws-lois.justice.gc.ca/eng/acts/e-2.01/section-405-20040101.html#wb-cont
# finding all the election limits to normalize the amounts
# https://laws-lois.justice.gc.ca/eng/acts/E-2.01/PITIndex.html



# Contribution cap used to normalise donation amounts, keyed by election number
# (string keys, matching the values passed to preprocess()).
election_caps_dict = {
    "38": 5000,
    "39": 5000,
    "40": 1000,
    "41": 1000,
    "42": 1500,
    "43": 1500,
    "44": 1500
}

# Calendar year of each election; used to join against the CPI table by year.
election_years_dict = {
    "38": 2004,
    "39": 2006,
    "40": 2008,
    "41": 2011,
    "42": 2015,
    "43": 2019,
    "44": 2021
}

# The two lookups must describe exactly the same elections. Building the frame
# from .keys()/.values() of separate dicts pairs rows purely by insertion
# order, so a reordered or missing entry would silently attach the wrong year.
if election_caps_dict.keys() != election_years_dict.keys():
    raise ValueError(
        "election_caps_dict and election_years_dict must have the same keys; "
        f"differing keys: {sorted(election_caps_dict.keys() ^ election_years_dict.keys())}"
    )

# Look both values up by election key so each row is aligned by construction.
election_keys = list(election_caps_dict)
metadata_elections_df = pd.DataFrame({
    "election": election_keys,
    "contribution_cap": [election_caps_dict[key] for key in election_keys],
    "election_year": [election_years_dict[key] for key in election_keys],
})

metadata_elections_df

Unnamed: 0,election,contribution_cap,election_year
0,38,5000,2004
1,39,5000,2006
2,40,1000,2008
3,41,1000,2011
4,42,1500,2015
5,43,1500,2019
6,44,1500,2021


In [57]:
# Pull in the CPI series used for the inflation adjustment.
# Keep only the 'All-items' rows and the year/value columns. The selection is
# done in a single .loc call and copied so that cpi_df is an independent frame:
# a later cell assigns to cpi_df['VALUE'], and doing that on the result of
# chained indexing (df[mask][[cols]]) triggers pandas' SettingWithCopyWarning.
cpi_df = (
    pd.read_csv('data/cpi.csv')
    .loc[
        lambda frame: frame['Products and product groups'] == 'All-items',
        ['REF_DATE', 'VALUE'],
    ]
    .copy()
)

cpi_df.head()

Unnamed: 0,REF_DATE,VALUE
0,2004,104.7
1,2005,107.0
2,2006,109.1
3,2007,111.5
4,2008,114.1


In [58]:
# Rebase the CPI to 2004 = 100, since the data is using 2002 = 100.
# The 2004 value is taken by position (.iloc[0]) rather than with [0]: on an
# integer-indexed Series [0] is a *label* lookup, so it only works while the
# 2004 row happens to carry index label 0 in the source CSV.
base_2004_values = cpi_df.loc[cpi_df['REF_DATE'] == 2004, 'VALUE']
if base_2004_values.empty:
    raise ValueError("cpi_df has no row with REF_DATE == 2004; cannot rebase the CPI")
base_2004_cpi = base_2004_values.iloc[0]

cpi_df['VALUE'] = cpi_df['VALUE']/base_2004_cpi*100
cpi_df.head()

Unnamed: 0,REF_DATE,VALUE
0,2004,100.0
1,2005,102.196753
2,2006,104.202483
3,2007,106.494747
4,2008,108.978032


In [44]:
# Attach the CPI value for each election year to the election metadata.
# Any CPI columns left over from a previous run of this cell are dropped first:
# merging again on top of them would create REF_DATE_x/VALUE_x/... suffixed
# columns and the plain 'VALUE' column that preprocess() reads would disappear.
metadata_elections_df = (
    metadata_elections_df
    .drop(columns=['REF_DATE', 'VALUE'], errors='ignore')
    .merge(cpi_df, left_on='election_year', right_on='REF_DATE')
)
metadata_elections_df

Unnamed: 0,election,contribution_cap,election_year,REF_DATE,VALUE
0,38,5000,2004,2004,100.0
1,39,5000,2006,2006,104.202483
2,40,1000,2008,2008,108.978032
3,41,1000,2011,2011,114.51767
4,42,1500,2015,2015,120.916905
5,43,1500,2019,2019,129.894938
6,44,1500,2021,2021,135.243553


In [85]:
# setting up preprocessing for all the frames

# Source column names -> short snake_case names used by preprocess().
columns_rename_map = {
    "Recipient ID": "recipient_id",
    "Political Party of Recipient": "political_party",
    "ED_CODE": "electoral_district", #use the code instead, need to remember to set this as categorical
    "Monetary amount": "monetary_amount",
    "Percentage of Votes Obtained /Pourcentage des votes obtenus": "percentage_vote"
}

def preprocess(df, election_number):
    """Collapse one election's contribution rows into one row per recipient.

    Steps: drop unused columns, remove rows whose monetary amount is 0, rename
    columns via ``columns_rename_map``, scale each amount by the election's
    contribution cap and CPI value, then aggregate per ``recipient_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw merged table for a single election. It is not modified.
    election_number : str or int
        Election identifier matching ``metadata_elections_df['election']``
        (e.g. ``"38"``). It is compared as a string, so ``38`` also works.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``recipient_id`` with columns ``monetary_sum``,
        ``num_donations``, ``monetary_mean``, ``monetary_std``,
        ``monetary_min``, ``monetary_max``, ``political_party``,
        ``electoral_district``, ``percentage_vote`` and ``election_number``.
        ``monetary_std`` is NaN for recipients with a single donation.

    Raises
    ------
    ValueError
        If ``election_number`` has no row in ``metadata_elections_df``.
    """
    # Resolve the election's CPI and cap once, up front, so an unknown election
    # fails immediately with a readable message instead of an IndexError from
    # .iloc[0] after the frame has already been processed.
    election_key = str(election_number)
    election_meta = metadata_elections_df[
        metadata_elections_df['election'].astype(str) == election_key
    ]
    if election_meta.empty:
        raise ValueError(
            f"election {election_key!r} not found in metadata_elections_df['election']"
        )
    cpi = election_meta['VALUE'].iloc[0]
    cap = election_meta['contribution_cap'].iloc[0]

    # drop unneeded columns (errors='ignore': not every election file has all of them)
    df = df.drop(columns=['Unnamed: 0', 'Political Entity', 'Electoral event', 'Fiscal/Election date',
                          'Recipient last name', 'Recipient first name', 'Contributor City', 'Contributor Province',
                          'Contributor Postal code', 'Contribution Received date', 'cleaned_district_name_x', 'Electoral District',
                          'Recipient'],
                 errors='ignore')

    # remove any 0 monetary amounts - these are probably residuals from the non-monetary amounts, not relevant to our modelling here
    df = df[df['Monetary amount'] != 0]

    # rename columns for consistency and ease
    df = df.rename(columns=columns_rename_map, errors='ignore')

    # adjust for inflation, normalize to the contribution cap
    # NOTE(review): this multiplies by cpi/100; converting nominal dollars to
    # base-year dollars would normally divide by cpi/100 - confirm the intended
    # direction. The formula is left exactly as it was.
    df['monetary_amount'] = df['monetary_amount']/cap *cpi/100

    # add in election number
    df['election_number'] = election_key

    # one row per recipient: donation statistics plus the recipient-level attributes
    return df.groupby('recipient_id').agg(
        # aggregate functions
        monetary_sum=pd.NamedAgg(column='monetary_amount', aggfunc='sum'),
        num_donations=pd.NamedAgg(column='monetary_amount', aggfunc='count'),
        monetary_mean=pd.NamedAgg(column='monetary_amount', aggfunc='mean'),
        monetary_std=pd.NamedAgg(column='monetary_amount', aggfunc='std'),
        monetary_min=pd.NamedAgg(column='monetary_amount', aggfunc='min'),
        monetary_max=pd.NamedAgg(column='monetary_amount', aggfunc='max'),

        # agg for the other columns (first value seen within each recipient's rows)
        political_party=pd.NamedAgg(column='political_party', aggfunc='first'),
        electoral_district=pd.NamedAgg(column='electoral_district', aggfunc='first'),
        percentage_vote=pd.NamedAgg(column='percentage_vote', aggfunc='first'),
        election_number=pd.NamedAgg(column='election_number', aggfunc='first'),
    )

In [96]:
# TODO: check if recipient id is consistent across the years if needed

# Run every election frame through preprocess(), pairing each raw frame with
# its election number. Results are unpacked in the same order as the pairs.
(
    election_38_processed_df,
    election_39_processed_df,
    election_40_processed_df,
    election_41_processed_df,
    election_42_processed_df,
    election_43_processed_df,
    election_44_processed_df,
) = (
    preprocess(raw_frame, number)
    for raw_frame, number in (
        (election_38_df, "38"),
        (election_39_df, "39"),
        (election_40_df, "40"),
        (election_41_df, "41"),
        (election_42_df, "42"),
        (election_43_df, "43"),
        (election_44_df, "44"),
    )
)

# The 44th election is held back as the test dataset.
test_df = election_44_processed_df

In [None]:
# stack the remaining frames together as the training set (TODO: decide whether to do a train-validation split, and whether to split chronologically or not)


