In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.decomposition import PCA
from sklearn.externals import joblib

%matplotlib inline
np.random.seed(42)

In [6]:
# Load data
train_df = pd.read_csv("~/real_estate/data/supervised_dfs/train_df.csv")
test_df = pd.read_csv("~/real_estate/data/supervised_dfs/test_df.csv")

features = train_df.columns[:-2]

X_train = train_df[features]
y_train = train_df["target"]
X_test = test_df[features]
y_test = test_df["target"]

### PCA

In [9]:
# Fit PCA
pca = PCA(n_components=20)
pca.fit(X_train.values)
print("Total explained variance: ", sum(pca.explained_variance_ratio_))

Total explained variance:  0.9723580960421312


In [11]:
# Reduce dimensionality
train_df_pca = pca.transform(X_train.values)
train_df_pca = pd.DataFrame(train_df_pca)
train_df_pca["target"] = y_train.values

test_df_pca = pca.transform(X_test.values)
test_df_pca = pd.DataFrame(test_df_pca)
test_df_pca["target"] = y_test.values
test_df_pca["RegionName"] = test_df["RegionName"].values

In [12]:
train_df_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,target
0,1.053341,0.16876,-0.694155,0.483232,0.723046,0.44298,-0.076065,-0.065262,0.085768,-0.05793,...,-0.027939,-0.150362,0.04881,0.008362,-0.09781,0.178369,0.049891,-0.05379,-0.05667,0.029412
1,1.059905,0.121767,-0.51931,0.009912,0.516214,0.123516,-0.229145,-0.006321,0.079884,-0.054503,...,-0.009262,-0.086371,-0.067583,-0.043621,-0.134146,0.084398,0.165527,-0.002299,0.01345,0.038377
2,1.146398,-0.004254,-0.508459,0.306215,0.723323,0.483823,-0.054473,-0.113258,0.132147,-0.023382,...,-0.176188,-0.039192,-0.002552,0.094392,0.014893,0.204564,0.114278,-0.101138,-0.049786,0.002611
3,1.042201,0.165194,-0.470465,0.365697,0.684422,0.299196,-0.162731,0.028511,0.169432,0.073658,...,0.164082,-0.100171,-0.161797,-0.020688,-0.144555,0.194417,0.047782,-0.005752,0.064454,0.006562
4,1.13882,0.020594,-0.409824,0.408724,0.768427,0.49352,-0.066903,-0.025369,0.168024,0.052843,...,0.157402,-0.216737,-0.003585,0.034149,-0.111206,0.170985,0.047969,-0.107796,0.131701,0.07003


In [17]:
train_df_pca.to_csv("~/real_estate/data/supervised_dfs/train_df_pca.csv",
                    index=False)
test_df_pca.to_csv("~/real_estate/data/supervised_dfs/test_df_pca.csv",
                    index=False)
joblib.dump(pca, "../real_estate/data/supervised_dfs/pca.pkl")

['../real_estate/data/supervised_dfs/pca.pkl']