# Preprocessing

Due to the feature engineering and data wrangling done previously, this notebook will be relatively brief.

In [2]:
#import relevant packages
import numpy as np
import pandas as pd
from io import StringIO
import requests
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
#get path extension: the Google Drive file ids appended to the download URL
numeric = '1xvv81F7_NoHaf0qObgWcMWZDLi_w2S0_'
target = '1KQNJU82wtbl5pU_2ko5q_-8vVCt0GTqj'

#set file path
download = 'https://drive.google.com/uc?export=download&id='
path_numeric = download + numeric
path_target = download + target

def _fetch_text(url, timeout = 30):
    """Download `url` and return the response body decoded as text.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    timeout : float, default 30
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    str
        The body of the response.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.Timeout
        If the server does not respond within `timeout` seconds.
    """
    response = requests.get(url, timeout = timeout)
    #fail loudly on an error status rather than handing an error page to the CSV parser
    response.raise_for_status()
    return response.text

#get raw data, wrapped in file-like buffers for pd.read_csv
raw_numeric = StringIO(_fetch_text(path_numeric))
raw_target = StringIO(_fetch_text(path_target))

In [4]:
#parse each downloaded CSV buffer into its own DataFrame
data_numeric, data_target = (pd.read_csv(buffer) for buffer in (raw_numeric, raw_target))

In [11]:
#reshape data to reflect different target variables
X = data_numeric.drop(columns = 'bill_id')

#features and targets are paired row by row in the splits, so the row counts must agree
assert len(X) == len(data_target), 'feature and target tables have different row counts'

def _binary_target(column):
    """Encode a two-valued column as a flat 1-D indicator array.

    Parameters
    ----------
    column : pandas.Series
        Column to encode; it is dummy-encoded with the first level dropped.

    Returns
    -------
    numpy.ndarray
        1-D array with one entry per row of `column`.

    Raises
    ------
    ValueError
        If the dummy encoding is not exactly one column wide (the reshape
        to `len(column)` then cannot succeed).
    """
    dummies = pd.get_dummies(column, drop_first = True)
    #use the column's own length instead of a hard-coded row count so the
    #cell keeps working when the number of bills in the data changes
    return np.array(dummies).reshape(len(column))

y_active = _binary_target(data_target.active)
#for the remaining targets a non-null entry is treated as the positive class
y_house = _binary_target(data_target.house_passage.notnull())
y_senate = _binary_target(data_target.senate_passage.notnull())
y_enacted = _binary_target(data_target.enacted.notnull())

print('Number of Observations X Number of Features:')
print(X.shape)

#report the number of positive rows for each target
for label, y in [('Number of Active Bills:', y_active),
                 ('Number of Bills which Passed the House:', y_house),
                 ('Number of Bills which Passed the Senate:', y_senate),
                 ('Number of Bills Enacted:', y_enacted)]:
    print()
    print(label)
    #array-level sum counts the positives without a Python-level loop
    print(int(y.sum()))

Number of Observations X Number of Features:
(18013, 18)

Number of Active Bills:
3991

Number of Bills which Passed the House:
2112

Number of Bills which Passed the Senate:
636

Number of Bills Enacted:
559


In [12]:
#standardise the feature matrix: learn the scaling from X, then apply it to X
#NOTE(review): the scaler is fit on every row before the train/test splits are made,
#so rows that later land in the test sets contribute to the scaling statistics - confirm this is intended
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [13]:
#perform train/test splits
#every split uses the same test size and random state, so each one is reproducible;
#note that each call stratifies on its own target, so the rows assigned to train/test
#are not guaranteed to be the same from one target to the next
def _stratified_split(y, test_size = .2, random_state = 42):
    """Split X_scaled and `y` into train and test parts, stratified on `y`.

    Parameters
    ----------
    y : array-like
        Target labels, one per row of X_scaled; also used as the stratification key.
    test_size : float, default .2
        Share of rows placed in the test part.
    random_state : int, default 42
        Seed passed to train_test_split.

    Returns
    -------
    list
        X_train, X_test, y_train, y_test, in the order train_test_split returns them.
    """
    return train_test_split(X_scaled, y, stratify = y,
                            test_size = test_size, random_state = random_state)

X_train_a, X_test_a, y_train_a, y_test_a = _stratified_split(y_active)

X_train_h, X_test_h, y_train_h, y_test_h = _stratified_split(y_house)

X_train_s, X_test_s, y_train_s, y_test_s = _stratified_split(y_senate)

X_train_e, X_test_e, y_train_e, y_test_e = _stratified_split(y_enacted)