# Preprocessing and Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

### Read cleaned final csv files.

In [2]:
df = pd.read_csv('../datasets/train_cleaned_final.csv')
df_to_pred = pd.read_csv('../datasets/test_cleaned_final.csv')
df.drop('Unnamed: 0', axis = 1, inplace = True)
df_to_pred.drop('Unnamed: 0', axis = 1, inplace = True)

#### Set up the predictors and target

In [3]:
features = df.columns.drop('saleprice')
X = df[features]
y = df['saleprice']

In [4]:
X.head()

Unnamed: 0,lot_frontage,lot_area,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,1st_flr_sf,...,garage_type_NA,sale_type_COD,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
0,0.0,13517.0,1976.0,2005.0,289.0,533.0,0.0,192.0,725.0,725.0,...,0,0,0,0,0,0,0,0,0,1
1,43.0,11492.0,1996.0,1997.0,132.0,637.0,0.0,276.0,913.0,913.0,...,0,0,0,0,0,0,0,0,0,1
2,68.0,7922.0,1953.0,2007.0,0.0,731.0,0.0,326.0,1057.0,1057.0,...,0,0,0,0,0,0,0,0,0,1
3,73.0,9802.0,2006.0,2007.0,0.0,0.0,0.0,384.0,384.0,744.0,...,0,0,0,0,0,0,0,0,0,1
4,82.0,14235.0,1900.0,1993.0,0.0,0.0,0.0,676.0,676.0,831.0,...,0,0,0,0,0,0,0,0,0,1


### Let's use the numeric features to get some polynomial features

In [5]:
# Use PolynomialFeatures.
poly = PolynomialFeatures(include_bias=False)

# Fit and transform.
X_poly = poly.fit_transform(X)

In [6]:
# Polynomial feature names.
poly_col_names = poly.get_feature_names(features)

In [7]:
# Polynomeal features of numeric valus columns.
X_poly_df = pd.DataFrame(X_poly, columns=poly_col_names)
X_poly_df.head()

Unnamed: 0,lot_frontage,lot_area,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,1st_flr_sf,...,sale_type_ConLw^2,sale_type_ConLw sale_type_New,sale_type_ConLw sale_type_Oth,sale_type_ConLw sale_type_WD,sale_type_New^2,sale_type_New sale_type_Oth,sale_type_New sale_type_WD,sale_type_Oth^2,sale_type_Oth sale_type_WD,sale_type_WD ^2
0,0.0,13517.0,1976.0,2005.0,289.0,533.0,0.0,192.0,725.0,725.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,43.0,11492.0,1996.0,1997.0,132.0,637.0,0.0,276.0,913.0,913.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,68.0,7922.0,1953.0,2007.0,0.0,731.0,0.0,326.0,1057.0,1057.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,73.0,9802.0,2006.0,2007.0,0.0,0.0,0.0,384.0,384.0,744.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,82.0,14235.0,1900.0,1993.0,0.0,0.0,0.0,676.0,676.0,831.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Combine polynomial features and target.

In [8]:
# This is predictor with saleprice. (Combined data)
df_combined = X_poly_df
df_combined['saleprice'] = df['saleprice']
df_combined.head()

Unnamed: 0,lot_frontage,lot_area,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,1st_flr_sf,...,sale_type_ConLw sale_type_New,sale_type_ConLw sale_type_Oth,sale_type_ConLw sale_type_WD,sale_type_New^2,sale_type_New sale_type_Oth,sale_type_New sale_type_WD,sale_type_Oth^2,sale_type_Oth sale_type_WD,sale_type_WD ^2,saleprice
0,0.0,13517.0,1976.0,2005.0,289.0,533.0,0.0,192.0,725.0,725.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,130500.0
1,43.0,11492.0,1996.0,1997.0,132.0,637.0,0.0,276.0,913.0,913.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,220000.0
2,68.0,7922.0,1953.0,2007.0,0.0,731.0,0.0,326.0,1057.0,1057.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,109000.0
3,73.0,9802.0,2006.0,2007.0,0.0,0.0,0.0,384.0,384.0,744.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,174000.0
4,82.0,14235.0,1900.0,1993.0,0.0,0.0,0.0,676.0,676.0,831.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,138500.0


## Correlation Matrix

#### It would be nice to visualize the heatmap of the correlation matrix, but we will skip it  since it will take too long.

In [9]:
# plt.figure(figsize=(3,700))
# sns.heatmap(df_combined.corr()[['saleprice']].sort_values('saleprice'), annot = True)

In [None]:
# Filter the column and sort by the correlation descending order.
corr = df_combined.corr()[['saleprice']].sort_values(by='saleprice', ascending = False)
# corr

In [30]:
# Let's take top 100 column(features) that has high correlation with salesprice.
# key_features = corr.head(201).index
# key_features = key_features.drop('saleprice')
# len(key_features)

### Train Test Split

In [31]:
X = df_combined.drop('saleprice', axis = 1)
y = df_combined['saleprice']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

#### Standardize the train set, fit and transform.

In [33]:
ss = StandardScaler()
ss.fit(X_train)

X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [34]:
# Check the train set and test set.
print(X_train.shape)
print(X_test.shape)

(1502, 43364)
(501, 43364)


#### Try three regression model and get the cross val score.

In [35]:
lr = LinearRegression()
lasso = LassoCV()
ridge = RidgeCV()
elasticnet = ElasticNetCV()

In [36]:
cross_val_score(lr, X_train_sc, y_train, cv=3).mean()

0.8606045928828786

In [37]:
cross_val_score(elasticnet, X_train_sc, y_train, cv=3).mean()



0.8741523044607348

In [38]:
cross_val_score(ridge, X_train_sc, y_train, cv=3).mean()

0.8624400787959617

In [39]:
cross_val_score(lasso, X_train_sc, y_train, cv=3).mean()



0.8715331702646302

In [56]:
elasticnet.fit(X_train_sc, y_train)



ElasticNetCV(alphas=None, copy_X=True, cv='warn', eps=0.001,
       fit_intercept=True, l1_ratio=0.5, max_iter=1000, n_alphas=100,
       n_jobs=None, normalize=False, positive=False, precompute='auto',
       random_state=None, selection='cyclic', tol=0.0001, verbose=0)

In [57]:
elasticnet.score(X_test_sc, y_test)

0.9025538815821834

In [58]:
elasticnet.score(X_train_sc, y_train)

0.926521611290877

### Test holdout set

In [43]:
df_to_pred

Unnamed: 0,lot_frontage,lot_area,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,1st_flr_sf,...,garage_type_NA,sale_type_COD,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
0,69.0,9142.0,1910.0,1950.0,0.0,0.0,0.0,1020.0,1020.0,908.0,...,0,0,0,0,0,0,0,0,0,1
1,0.0,9662.0,1977.0,1977.0,0.0,0.0,0.0,1967.0,1967.0,1967.0,...,0,0,0,0,0,0,0,0,0,1
2,58.0,17104.0,2006.0,2006.0,0.0,554.0,0.0,100.0,654.0,664.0,...,0,0,0,0,0,0,0,1,0,0
3,60.0,8520.0,1923.0,2006.0,0.0,0.0,0.0,968.0,968.0,968.0,...,0,0,0,0,0,0,0,0,0,1
4,0.0,9500.0,1963.0,1963.0,247.0,609.0,0.0,785.0,1394.0,1394.0,...,0,0,0,0,0,0,0,0,0,1
5,21.0,1890.0,1972.0,1972.0,0.0,294.0,0.0,252.0,546.0,546.0,...,0,0,0,0,0,0,0,0,0,1
6,52.0,8516.0,1958.0,2006.0,0.0,0.0,0.0,869.0,869.0,1093.0,...,0,0,0,0,0,0,0,0,0,1
7,0.0,9286.0,1977.0,1989.0,0.0,196.0,0.0,1072.0,1268.0,1268.0,...,0,0,0,0,0,0,0,0,0,1
8,39.0,3515.0,2004.0,2004.0,0.0,0.0,0.0,840.0,840.0,840.0,...,0,0,0,0,0,0,0,0,0,1
9,75.0,10125.0,1977.0,1977.0,0.0,641.0,279.0,276.0,1196.0,1279.0,...,0,0,0,0,0,0,0,0,0,1


In [44]:
X_holdout = df_to_pred[features]
X_holdout

Unnamed: 0,lot_frontage,lot_area,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,1st_flr_sf,...,garage_type_NA,sale_type_COD,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
0,69.0,9142.0,1910.0,1950.0,0.0,0.0,0.0,1020.0,1020.0,908.0,...,0,0,0,0,0,0,0,0,0,1
1,0.0,9662.0,1977.0,1977.0,0.0,0.0,0.0,1967.0,1967.0,1967.0,...,0,0,0,0,0,0,0,0,0,1
2,58.0,17104.0,2006.0,2006.0,0.0,554.0,0.0,100.0,654.0,664.0,...,0,0,0,0,0,0,0,1,0,0
3,60.0,8520.0,1923.0,2006.0,0.0,0.0,0.0,968.0,968.0,968.0,...,0,0,0,0,0,0,0,0,0,1
4,0.0,9500.0,1963.0,1963.0,247.0,609.0,0.0,785.0,1394.0,1394.0,...,0,0,0,0,0,0,0,0,0,1
5,21.0,1890.0,1972.0,1972.0,0.0,294.0,0.0,252.0,546.0,546.0,...,0,0,0,0,0,0,0,0,0,1
6,52.0,8516.0,1958.0,2006.0,0.0,0.0,0.0,869.0,869.0,1093.0,...,0,0,0,0,0,0,0,0,0,1
7,0.0,9286.0,1977.0,1989.0,0.0,196.0,0.0,1072.0,1268.0,1268.0,...,0,0,0,0,0,0,0,0,0,1
8,39.0,3515.0,2004.0,2004.0,0.0,0.0,0.0,840.0,840.0,840.0,...,0,0,0,0,0,0,0,0,0,1
9,75.0,10125.0,1977.0,1977.0,0.0,641.0,279.0,276.0,1196.0,1279.0,...,0,0,0,0,0,0,0,0,0,1


#### Apply the same polynomial features to the holdout set.

In [45]:
X_poly = poly.fit_transform(X_holdout)
poly_col_names = poly.get_feature_names(features)

X_poly_holdout = pd.DataFrame(X_poly, columns=poly_col_names)
X_poly_holdout.head()

Unnamed: 0,lot_frontage,lot_area,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,1st_flr_sf,...,sale_type_ConLw^2,sale_type_ConLw sale_type_New,sale_type_ConLw sale_type_Oth,sale_type_ConLw sale_type_WD,sale_type_New^2,sale_type_New sale_type_Oth,sale_type_New sale_type_WD,sale_type_Oth^2,sale_type_Oth sale_type_WD,sale_type_WD ^2
0,69.0,9142.0,1910.0,1950.0,0.0,0.0,0.0,1020.0,1020.0,908.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,9662.0,1977.0,1977.0,0.0,0.0,0.0,1967.0,1967.0,1967.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,58.0,17104.0,2006.0,2006.0,0.0,554.0,0.0,100.0,654.0,664.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,60.0,8520.0,1923.0,2006.0,0.0,0.0,0.0,968.0,968.0,968.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,9500.0,1963.0,1963.0,247.0,609.0,0.0,785.0,1394.0,1394.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


#### Standard scalar fit

In [47]:
X_poly_holdout_sc = ss.transform(X_poly_holdout)
X_poly_holdout_sc

array([[ 0.35533263, -0.13671618, -2.06960319, ..., -0.04473627,
         0.        ,  0.39418855],
       [-1.73967039, -0.06285439,  0.16647127, ..., -0.04473627,
         0.        ,  0.39418855],
       [ 0.02134664,  0.99422141,  1.13432439, ..., -0.04473627,
         0.        , -2.53685702],
       ...,
       [-0.06974045, -0.26341755, -0.13389694, ..., -0.04473627,
         0.        ,  0.39418855],
       [ 0.08207137, -0.15688613, -0.0337742 , ..., -0.04473627,
         0.        ,  0.39418855],
       [ 0.38569499, -0.24211127, -0.56776213, ..., -0.04473627,
         0.        ,  0.39418855]])

#### Lasso prediction

In [48]:
X_holdout['pred_sale_price'] = lasso.predict(X_poly_holdout_sc)

#### Drop all features except the predicted sale price.

In [49]:
X_holdout = X_holdout.drop(features, axis=1)
X_holdout

Unnamed: 0,pred_sale_price
0,140351.570225
1,162999.073336
2,204388.130821
3,105268.854330
4,178616.006776
5,94144.752970
6,117075.741233
7,180113.740882
8,175189.966985
9,170666.557908


In [50]:
X_holdout.columns = ['SalePrice']

#### Complete the dataframe for Kaggle submission.

In [52]:
X_holdout['Id'] = df_to_pred1['Id']

In [53]:
X_holdout.set_index('Id', inplace=True)

#### Save it to the csv file.

In [55]:
X_holdout.to_csv('../my_tenth_submission')