In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.externals import joblib

%matplotlib inline
np.random.seed(42)

In [20]:
# Read the train and test tables from disk.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "~/real_estate/data/v2/train_df.csv",
        "~/real_estate/data/v2/test_df.csv",
    )
)

# Model inputs are all columns except the trailing two.
# NOTE(review): presumably those two are "target" and "RegionName" -- confirm
# against the CSV schema, since the selection is positional.
features = train_df.columns[:-2]

# Split each table into its feature matrix and its target vector.
X_train, y_train = train_df[features], train_df["target"]
X_test, y_test = test_df[features], test_df["target"]

### PCA

In [21]:
# Fit PCA on the training features only.
# random_state is pinned so the fit does not depend on how much of NumPy's
# global RNG stream earlier cells have consumed; it only matters when
# scikit-learn picks a randomized/ARPACK solver and is ignored by the exact one.
pca = PCA(n_components=45, random_state=42)
pca.fit(X_train.values)
# Sum of the per-component ratios = fraction of total variance retained
# (the recorded run reports roughly 0.97 for 45 components).
print("Total explained variance: ", sum(pca.explained_variance_ratio_))

Total explained variance:  0.9709943781125696


In [22]:
# Project both splits onto the principal components fitted above.
train_components = pca.transform(X_train.values)
test_components = pca.transform(X_test.values)

# Wrap the projections in DataFrames (columns are the integer component indices).
train_df_pca = pd.DataFrame(train_components)
test_df_pca = pd.DataFrame(test_components)

# Re-attach the targets; only the test frame also carries RegionName.
train_df_pca["target"] = y_train.values
test_df_pca["target"] = y_test.values
test_df_pca["RegionName"] = test_df["RegionName"].values

In [23]:
# Peek at the first five rows of the projected training frame.
train_df_pca.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,target
0,0.769774,1.28972,-0.339299,0.448403,0.130888,-0.363977,0.277324,-0.076173,-0.105135,0.279229,...,-0.045079,-0.043659,-0.26152,-0.000518,-0.063229,0.011266,0.062936,-0.103356,-0.036167,0.029412
1,1.022718,0.312927,0.00272,0.584901,0.228945,-0.198738,0.074624,-0.046097,0.133687,0.262805,...,-0.120263,0.337827,0.269584,0.108302,-0.070697,-0.006409,0.075724,-0.126708,-0.100691,0.038377
2,0.897084,1.138851,-0.430117,0.391603,0.08385,-0.487456,0.270143,-0.131508,-0.177171,0.266471,...,-0.059482,0.033473,-0.2191,-0.029689,-0.050547,-0.010527,0.038913,-0.068643,-0.126397,0.002611
3,1.001397,0.462175,-0.108083,0.56635,0.454278,-0.171491,0.213567,-0.163052,0.109252,0.272907,...,-0.128273,0.306754,0.188686,0.123314,-0.095866,0.001292,0.049306,-0.081094,-0.072443,0.006562
4,1.097501,0.456392,-0.281769,0.476748,0.583505,-0.277336,0.347903,-0.339411,0.001399,0.259739,...,-0.042493,-0.030033,-0.119654,0.015758,-0.078303,0.004009,0.05755,-0.082779,-0.115715,0.07003


In [24]:
import os

# Persist the projected splits next to the input data.
train_df_pca.to_csv("~/real_estate/data/v2/train_df_pca.csv",index=False)
test_df_pca.to_csv("~/real_estate/data/v2/test_df_pca.csv",index=False)
# Save the fitted PCA as well. joblib.dump does not expand "~" on its own, so
# resolve the home directory explicitly instead of hard-coding one user's
# absolute path; this keeps the pickle beside the CSVs for whoever runs it.
joblib.dump(pca, os.path.expanduser("~/real_estate/data/v2/pca.pkl"))

['/home/gnazareths/real_estate/data/v2/pca.pkl']