In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pickle
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

from imblearn.pipeline import Pipeline as imbPipeline

In [2]:
def _read_split(csv_path):
    """Read one CSV file, using its ``building_id`` column as the row index."""
    return pd.read_csv(csv_path, index_col='building_id')


# Feature tables first, then the matching label tables.
X_train, X_test = _read_split('Data/X_train.csv'), _read_split('Data/X_test.csv')
y_train, y_test = _read_split('Data/y_train.csv'), _read_split('Data/y_test.csv')

In [3]:
# Flatten the label tables into 1-D arrays.
# np.asarray accepts both the DataFrames loaded from CSV and arrays that have
# already been flattened, so re-running this cell does not fail with an
# AttributeError on `.values`.
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

### Divide Columns

In [4]:
# Columns whose name starts with 'has' are grouped as the binary flags.
binary_cols = [name for name in X_train.columns if name.startswith('has')]

# Every object-dtype column is treated as categorical.
cat_cols = X_train.select_dtypes(include='object').columns.tolist()

# Explicitly listed columns that get standard-scaled later in the notebook.
integer_cols = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]

# The three geo_level identifier columns.
geo_cols = [
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
]

# Full column list, in group order: geo, categorical, integer, binary.
all_cols = [*geo_cols, *cat_cols, *integer_cols, *binary_cols]

### Try one-hot encoding the geo_level columns and applying PCA to reduce dimensionality

In [31]:
# Keep 20% of the training rows for the first PCA trial; the other 80% is set aside.
X_train_pca_split, X_train_pca_leftover = train_test_split(
    X_train,
    test_size=0.8,
    random_state=42,
)

In [32]:
# Single-step pipelines: dense one-hot encoding that ignores categories not
# seen during fit, and standard scaling.
# NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2 —
# confirm the pinned version before upgrading.
ohe_pipe = imbPipeline(
    steps=[('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))]
)
ss_pipe = imbPipeline(steps=[('ss', StandardScaler())])

# Output columns follow this order: binary flags (unchanged), one-hot
# categoricals, one-hot geo levels, then scaled integer columns.
transformer = ColumnTransformer(
    transformers=[
        ('binary', 'passthrough', binary_cols),
        ('categorical', ohe_pipe, cat_cols),
        ('geo', ohe_pipe, geo_cols),
        ('integer_scale', ss_pipe, integer_cols),
    ]
)

In [33]:
# Fit the column transformer on the PCA subsample and encode that subsample in the same call.
transformed = transformer.fit_transform(X_train_pca_split)

In [34]:
# Wrap the encoded matrix in a DataFrame that reuses the subsample's row index.
X_train_for_pca = pd.DataFrame(
    data=transformed,
    index=X_train_pca_split.index,
)

In [35]:
# Fit a 60-component PCA on X_train_for_pca and report how long the fit takes.
# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time, unlike the wall-clock time.time().
start = time.perf_counter()
# random_state pins the randomized SVD solver (which svd_solver='auto' may
# select), so the components and explained variance are reproducible.
pca = PCA(n_components=60, random_state=42)
pca_transformed = pca.fit_transform(X_train_for_pca)
end = time.perf_counter()
print(f'Run time: {end-start}')

Run time: 35.65352129936218


In [36]:
# Total fraction of variance captured by the fitted components.
sum(pca.explained_variance_ratio_)

0.8242910655133853

In [37]:
# Refit the column transformer on the full training set and encode it; this overwrites `transformed`.
transformed = transformer.fit_transform(X_train)

In [38]:
# Wrap the encoded matrix in a DataFrame that reuses X_train's row index.
X_train_for_pca = pd.DataFrame(
    data=transformed,
    index=X_train.index,
)

In [None]:
# Fit a 60-component PCA on X_train_for_pca and report how long the fit takes.
# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time, unlike the wall-clock time.time().
start = time.perf_counter()
# random_state pins the randomized SVD solver (which svd_solver='auto' may
# select), so the components and explained variance are reproducible.
pca = PCA(n_components=60, random_state=42)
pca_transformed = pca.fit_transform(X_train_for_pca)
end = time.perf_counter()
print(f'Run time: {end-start}')

In [None]:
# Total fraction of variance captured by the fitted components.
sum(pca.explained_variance_ratio_)