# Principal Component Analysis

In [32]:
import pandas as pd
import numpy as np
import pickle
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

from imblearn.pipeline import Pipeline as imbPipeline

In [3]:
# Load the pre-split feature and label tables; every file is indexed by building_id.
X_train = pd.read_csv('Data/X_train.csv', index_col='building_id')
X_test = pd.read_csv('Data/X_test.csv', index_col='building_id')
# Labels arrive as DataFrames here and are flattened to 1-D arrays in the next cell.
y_train = pd.read_csv('Data/y_train.csv', index_col='building_id')
y_test = pd.read_csv('Data/y_test.csv', index_col='building_id')

In [4]:
# Flatten the label frames into 1-D arrays for use as `y` in scikit-learn.
# np.ravel accepts both a DataFrame and an ndarray, so this cell can be re-run
# safely; chaining `.values.ravel()` raises AttributeError on a second run,
# once these names already hold ndarrays.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

### Divide Columns

In [5]:
# Group the feature columns by kind so each group can receive its own preprocessing.

# Columns whose name starts with 'has' (passed through untouched by the transformer).
binary_cols = [col for col in X_train.columns if col.startswith('has')]

# Object-dtype columns, which are one-hot encoded later.
cat_cols = X_train.select_dtypes(include='object').columns.tolist()

# Count/measurement columns, which are standard-scaled later.
integer_cols = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]

# Geographic identifier columns, which are one-hot encoded later.
geo_cols = [
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
]

all_cols = geo_cols + cat_cols + integer_cols + binary_cols

### Try one-hot encoding the geo_level columns and performing PCA to reduce dimensionality

One-hot encoding the three geographic columns creates nearly 12,000 columns. One approach to dealing with this is to use PCA to reduce dimensionality. First, I will use `train_test_split` to keep only half of the training set, which reduces the computational cost. After using ColumnTransformer to one-hot encode and scale this smaller training set, I will apply PCA. I would like to select n_components such that the explained variance is greater than 80%.

In [12]:
# Keep a reproducible 50% sample of the training rows for the PCA experiment;
# the other half is held in the *_leftover variables.
(
    X_train_pca_split,
    X_train_pca_leftover,
    y_train_pca_split,
    y_train_pca_leftover,
) = train_test_split(X_train, y_train, test_size=0.5, random_state=42)

In [16]:
# Single-step pipelines give each preprocessing step a name inside the ColumnTransformer.
# NOTE(review): `sparse=False` makes the encoder return a dense matrix (about 12k columns here).
# scikit-learn 1.2 renamed this argument to `sparse_output` and 1.4 removed `sparse`,
# so this line needs updating when the environment is upgraded.
ohe_pipe = imbPipeline(steps=[
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])
ss_pipe = imbPipeline(steps=[
    ('ss', StandardScaler()),
])

# Route each column group to its preprocessing. Output columns follow this order:
# binary flags, encoded categoricals, encoded geo levels, scaled integers.
transformer = ColumnTransformer(transformers=[
    ('binary', 'passthrough', binary_cols),
    ('categorical', ohe_pipe, cat_cols),
    ('geo', ohe_pipe, geo_cols),
    ('integer_scale', ss_pipe, integer_cols),
])

In [17]:
# Fit the preprocessing on the half-sized training split and encode it in one pass.
transformed = transformer.fit_transform(X_train_pca_split)

In [18]:
# Wrap the encoded matrix in a DataFrame that reuses the split's building_id index;
# column labels are left as positional integers.
X_train_for_pca = pd.DataFrame(transformed, index=X_train_pca_split.index)

In [19]:
# Preview the encoded frame to sanity-check its contents and width.
X_train_for_pca

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,11915,11916,11917,11918,11919,11920,11921,11922,11923,11924
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
496285,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.181783,0.907456,0.896730,-0.751181,0.040255
792302,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.181783,-0.289890,-0.004983,-0.751181,0.040255
160618,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.194795,-0.156852,-0.455839,0.291289,0.040255
76131,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.194795,0.042706,-0.455839,-0.229946,0.040255
790382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.558361,-0.289890,-0.004983,-0.751181,0.040255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214724,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.181783,-0.090333,-0.906695,-0.751181,0.040255
375633,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.194795,0.042706,-0.455839,0.812523,0.040255
471245,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.181783,-0.223371,-0.230411,-0.229946,2.430265
596429,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.181783,-0.156852,2.474727,-0.751181,0.040255


In [20]:
# Project the encoded training split onto 60 principal components and time the fit.
# random_state is pinned because, for a matrix this large, scikit-learn's default
# svd_solver='auto' selects the randomized solver, whose output otherwise varies
# from run to run. The seed matches the one used elsewhere in this notebook.
start = time.time()
pca = PCA(n_components=60, random_state=42)
pca_transformed = pca.fit_transform(X_train_for_pca)
end = time.time()
print(f'Run time: {end-start}')

Run time: 1462.9741060733795


In [23]:
# Total share of variance retained by the fitted components.
sum(pca.explained_variance_ratio_)

0.8238236248702576

After some iterations, I found that `n_components=60` gives an explained variance of about 82%. The runtime, however, was over 24 minutes on only half of the original training set.

In [24]:
# Persist the PCA-projected training matrix, presumably so the slow fit need not be repeated.
# Only the projected array is written in this cell; the fitted `pca` and `transformer`
# objects are not saved here.
with open('../Models/pca_transformed.pickle', 'wb') as pickle_file:
    pickle.dump(pca_transformed, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
# Sanity check: the label array and the PCA-projected rows should have the same length.
len(y_train_pca_split), len(pca_transformed)

(97725, 97725)

Let's try using the PCA transformed data in an untuned Random Forest Classifier, and then compare that with the non-PCA model.

In [28]:
# Baseline: an untuned random forest (default hyperparameters, fixed seed) fit on the PCA features.
# NOTE(review): cross_val_score refits clones of the estimator, so this fit is not
# what the cross-validation in the following cell scores.
rf_pca = RandomForestClassifier(random_state=42)
rf_pca.fit(pca_transformed, y_train_pca_split)

RandomForestClassifier(random_state=42)

In [29]:
# Cross-validate the forest on the PCA features (micro-averaged F1, default folds)
# and record how long the whole evaluation takes.
start = time.time()
f1_micro_rf_pca = cross_val_score(
    rf_pca,
    pca_transformed,
    y_train_pca_split,
    scoring='f1_micro',
)
end = time.time()

# Keep the elapsed seconds in a named variable as well as printing them.
run_time_rf_pca = end-start
print(f'Run time: {end-start}')

Run time: 385.38703298568726


In [31]:
# Collapse the per-fold scores into their mean (rebinding the same name) and display it.
f1_micro_rf_pca = np.mean(f1_micro_rf_pca)
f1_micro_rf_pca

0.677830647224354

Given