In [1]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path

sys.path.append("..")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as ps
import seaborn as sns
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split

import src.preprocessing as pp

In [2]:
# Load the raw train/test tables and discard the 'Id' column straight away.
dfi = pd.read_csv('../data/train.csv').drop(columns='Id')
dfi_test = pd.read_csv('../data/test.csv').drop(columns='Id')

# Quick sanity check on the loaded shapes.
print('Train set shape: ', dfi.shape)
print('Test set shape: ', dfi_test.shape)

Train set shape:  (1460, 80)
Test set shape:  (1459, 79)


In [3]:
# Inspect the target column of the raw training frame before any transformation.
# NOTE(review): plot_log is defined in src/preprocessing.py; presumably it contrasts the
# raw and log-scaled distribution of each listed column — confirm against the helper.
pp.plot_log(dfi, ['SalePrice'])

## 1. Missing Values

In [4]:
# Apply the same missing-value handling to the train frame and then the test frame.
dfm, dfm_test = map(pp.fill_na, (dfi, dfi_test))

In [5]:
# Columns of the filled train frame that still contain missing values, worst first.
# An empty Series means nothing is left to impute.
null_col = dfm.isna().sum()
null_col.loc[null_col.gt(0)].sort_values(ascending=False)

Series([], dtype: int64)

In [6]:
# Same check for the filled test frame: remaining missing-value counts, worst first.
null_col = dfm_test.isna().sum()
null_col.loc[null_col.gt(0)].sort_values(ascending=False)

Series([], dtype: int64)

## 2. Data Exploration

In [7]:
# Overview plot of the NA-filled training frame.
# NOTE(review): presumably draws one histogram per column — confirm in src/preprocessing.py.
pp.hist_matrix(dfm)

## 3. Feature Engineering

In [8]:
# Derive the engineered features for the train frame and then the test frame.
dfe, dfe_test = map(pp.engineer_features, (dfm, dfm_test))

In [9]:
# Run plot_log on the engineered train frame for the target and the size-related
# columns; cols=2 is forwarded to the helper.
pp.plot_log(
    dfe,
    [
        'SalePrice', 'LotArea', '1stFlrSF', '2ndFlrSF', 'TotalBsmtSF',
        'TotalSF', 'CondArea', 'QualArea', 'GrLivArea', 'LotFrontage',
    ],
    cols=2,
)

## 4. Feature Transformation

In [10]:
# Transform the attributes of the engineered train frame and then the test frame.
# NOTE(review): the pandas FutureWarning about downcasting in `replace` shown in this
# cell's output is emitted while these calls run; any fix belongs in src/preprocessing.py.
dft, dft_test = map(pp.transform_attributes, (dfe, dfe_test))


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [11]:
# Separate the target from the predictors before any target-aware encoding.
X = dft.drop(columns='SalePrice')
y = dft['SalePrice']

# Hold-out split for validation. test_size is not passed, so scikit-learn's default
# (25% of the rows) applies; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=True, random_state=21)

# Independent copy of the transformed test features (the test frame has no target).
X_test = dft_test.copy()

### 4.1 Validation Datasets

In [12]:
# Validation pipeline: the encoder is fitted on the training split only, so the
# held-out targets never influence the encoded features.
enc = TargetEncoder(return_df=True)
X_train_enc = enc.fit_transform(X_train, y_train)
X_val_enc = enc.transform(X_val)

# Re-attach the target to each split. assign() returns a new frame (the encoded
# frames stay untouched) and aligns the target on the index.
df_train = X_train_enc.assign(SalePrice=y_train)
df_val = X_val_enc.assign(SalePrice=y_val)

### 4.2 Test Datasets

In [13]:
# Submission pipeline: fit a fresh encoder on the full training data and apply it to
# the test features. `enc` is rebound here and replaces the validation-split encoder.
enc = TargetEncoder(return_df=True)
X_enc = enc.fit_transform(X, y)
X_test_enc = enc.transform(X_test)

# Full training frame with the target re-attached; assign() returns a new frame.
df = X_enc.assign(SalePrice=y)

# The test frame has no target column.
df_test = X_test_enc.copy()

### 4.3 Log Transformation

In [14]:
# Columns to be log-transformed in frames that carry the target.
attr_to_log_train = [
    'SalePrice',
    'LotArea',
    '1stFlrSF',
    '2ndFlrSF',
    'TotalBsmtSF',
    'TotalSF',
    'CondArea',
    'QualArea',
    'GrLivArea',
    'LotFrontage',
]
# Same columns, in the same order, minus the target for frames without 'SalePrice'.
attr_to_log_test = [name for name in attr_to_log_train if name != 'SalePrice']

In [15]:
# Validation pipeline. df_val carries 'SalePrice' just like df_train, so it must be
# logged with the list that includes the target. With the target-free list the
# validation target would stay on the raw price scale while the training target
# (and the normalization values later computed from dfl_train) use the log scale.
dfl_train = pp.log_features(df_train, attr_to_log_train)
dfl_val = pp.log_features(df_val, attr_to_log_train)

# Submission pipeline. df holds the target, df_test does not, hence the two lists.
dfl = pp.log_features(df, attr_to_log_train)
dfl_test = pp.log_features(df_test, attr_to_log_test)

## 5. Normalization

In [16]:
# Validation pipeline: normalization values are computed from the training split only
# (dfl_train) under the tag 'val'; both splits are then normalized with that same tag.
# The compute call must run first.
# NOTE(review): presumably compute_norm_values_and_save persists the statistics and
# normalize looks them up by tag — confirm in src/preprocessing.py.
pp.compute_norm_values_and_save(dfl_train, 'val')
dfn_train = pp.normalize(dfl_train, 'val')
dfn_val = pp.normalize(dfl_val, 'val')

In [17]:
# Submission pipeline: normalization values are computed from the full training frame
# (dfl) under the tag 'test'; the training and test frames are then normalized with it.
# The compute call must run first.
# NOTE(review): presumably the statistics are stored and looked up by tag — confirm in
# src/preprocessing.py.
pp.compute_norm_values_and_save(dfl, 'test')
dfn = pp.normalize(dfl, 'test')
dfn_test = pp.normalize(dfl_test, 'test')

In [18]:
# Visual check of the full training frame after normalization.
# NOTE(review): presumably draws one histogram per column — confirm in src/preprocessing.py.
pp.hist_matrix(dfn)

## 6. Save

In [19]:
# DataFrame.to_csv does not create missing parent directories, so make sure the output
# folder exists; this lets a fresh checkout run the notebook end-to-end.
out_dir = Path('../data/preprocessed')
out_dir.mkdir(parents=True, exist_ok=True)

# Validation pipeline outputs (train split / held-out split).
dfn_train.to_csv(out_dir / 'dfn_train.csv', index=False)
dfn_val.to_csv(out_dir / 'dfn_val.csv', index=False)

# Submission pipeline outputs (full training data / test data).
dfn.to_csv(out_dir / 'dfn.csv', index=False)
dfn_test.to_csv(out_dir / 'dfn_test.csv', index=False)