In [14]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA

try:
    # Standalone joblib; scikit-learn >= 0.23 no longer ships sklearn.externals.joblib.
    import joblib
except ImportError:
    # Older scikit-learn releases vendored joblib under sklearn.externals.
    from sklearn.externals import joblib

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seed NumPy's global RNG so NumPy-based randomness in later cells is repeatable.
np.random.seed(42)

In [15]:
# Read the v3 train and test splits from disk (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "~/real_estate/data/v3/train_df.csv",
        "~/real_estate/data/v3/test_df.csv",
    )
)

# Every column except the trailing two of the training frame is a model input.
# NOTE(review): presumably the two excluded columns are "target" and
# "RegionName" -- confirm against the CSV schema, since this is positional.
features = train_df.columns[:-2]

# Split each frame into its feature matrix and its "target" column.
X_train, y_train = train_df[features], train_df["target"]
X_test, y_test = test_df[features], test_df["target"]

### PCA

In [16]:
# Fit a 45-component PCA on the training feature matrix only.
pca = PCA(n_components=45).fit(X_train.values)

# Report the share of training variance captured by the retained components.
explained_total = sum(pca.explained_variance_ratio_)
print("Total explained variance: ", explained_total)

Total explained variance:  0.9709950922656515


In [17]:
# Project both splits onto the fitted principal components and re-attach labels.
# Component columns keep their default integer names (0, 1, ...).
train_df_pca = (
    pd.DataFrame(pca.transform(X_train.values))
    .assign(target=y_train.values)
)

# The test frame additionally carries "RegionName" from the original test data.
# The assign calls are chained separately so the column order is always
# components, then "target", then "RegionName".
test_df_pca = (
    pd.DataFrame(pca.transform(X_test.values))
    .assign(target=y_test.values)
    .assign(RegionName=test_df["RegionName"].values)
)

In [18]:
# Preview the first five rows of the PCA-projected training frame.
train_df_pca.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,target
0,0.769774,1.28972,-0.339299,0.448403,0.130888,-0.363977,0.277324,-0.076173,-0.105135,0.279229,...,-0.045078,-0.043588,-0.261616,-0.000699,-0.063382,0.011333,0.063051,-0.103256,-0.035064,0.188408
1,1.022718,0.312927,0.00272,0.584901,0.228945,-0.198738,0.074624,-0.046097,0.133687,0.262805,...,-0.120242,0.337716,0.269736,0.108709,-0.070552,-0.006704,0.076115,-0.126972,-0.100862,0.085974
2,0.897084,1.138851,-0.430117,0.391603,0.08385,-0.487456,0.270143,-0.131508,-0.177171,0.266471,...,-0.059536,0.033441,-0.219111,-0.029587,-0.050392,-0.010655,0.039289,-0.068882,-0.126445,0.041163
3,1.001397,0.462175,-0.108083,0.56635,0.454278,-0.171491,0.213567,-0.163052,0.109252,0.272907,...,-0.128252,0.306655,0.18881,0.123571,-0.095824,0.001162,0.04963,-0.081253,-0.072679,0.04577
4,1.097501,0.456392,-0.281769,0.476748,0.583505,-0.277336,0.347903,-0.339411,0.001399,0.259739,...,-0.042475,-0.029978,-0.119702,0.015631,-0.078388,0.003953,0.057575,-0.082745,-0.115069,0.064111


In [19]:
# Persist the PCA-projected train/test frames next to the source data.
train_df_pca.to_csv("~/real_estate/data/v3/train_df_pca.csv",index=False)
test_df_pca.to_csv("~/real_estate/data/v3/test_df_pca.csv",index=False)

# Save the fitted PCA model in the same directory. The "~" is expanded
# explicitly here rather than hard-coding one user's home directory, so the
# notebook writes to the same place as the CSVs above on any machine.
# joblib.dump returns the list of written file names, shown as the cell output.
joblib.dump(pca, os.path.expanduser("~/real_estate/data/v3/pca.pkl"))

['/home/gnazareths/real_estate/data/v3/pca.pkl']