# Feature Engineering

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

  return f(*args, **kwds)


In [2]:
# Read the raw training data, print its (rows, columns) shape and preview the first rows.
house_df = pd.read_csv(filepath_or_buffer='data/train.csv')
print(house_df.shape)
house_df.head()

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Hold out 10% of the rows as a test set; random_state is fixed so the split is reproducible.
# The whole frame is passed as X, so the SalePrice column is still present in X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(
    house_df,
    house_df['SalePrice'],
    test_size=0.1,
    random_state=0,
)
X_train.shape, X_test.shape

((1314, 81), (146, 81))

### Categorical missing values replacement

In [36]:
# Get all categorical (object-dtype) columns with at least one missing value in the training set
na_vars = [
    var for var in house_df.columns
    if X_train[var].isnull().sum()>0 and X_train[var].dtypes=='object'
]

# Report the share of missing values per column (the format string has exactly two placeholders)
for var in na_vars:
    print("{:15s} has {:.2f}% of its values missing".format(var, X_train[var].isnull().mean()*100))

Alley           has 93.84% of its values missing
MasVnrType      has 0.46% of its values missing
BsmtQual        has 2.44% of its values missing
BsmtCond        has 2.44% of its values missing
BsmtExposure    has 2.51% of its values missing
BsmtFinType1    has 2.44% of its values missing
BsmtFinType2    has 2.51% of its values missing
Electrical      has 0.08% of its values missing
FireplaceQu     has 47.26% of its values missing
GarageType      has 5.63% of its values missing
GarageFinish    has 5.63% of its values missing
GarageQual      has 5.63% of its values missing
GarageCond      has 5.63% of its values missing
PoolQC          has 99.54% of its values missing
Fence           has 81.43% of its values missing
MiscFeature     has 96.12% of its values missing


In [39]:
#replace NA in missing categorical features
def fill_cat_na(df, var_list):
    """Return a copy of ``df`` with NaNs in the given columns replaced by 'Missing'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is not modified.
    var_list : list of str
        Names of the columns whose missing values are replaced.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the ``var_list`` columns contain no NaNs.
    """
    filled = df.copy()
    replaced_cols = filled[var_list].fillna('Missing')
    filled[var_list] = replaced_cols
    return filled

In [41]:
# Apply the same 'Missing' replacement to both splits, then confirm no NaNs remain in train.
X_train, X_test = fill_cat_na(X_train, na_vars), fill_cat_na(X_test, na_vars)

X_train[na_vars].isnull().sum()

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Electrical      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

### Numerical missing value replacement

In [42]:
# Get all numerical (non-object dtype) columns with at least one missing value in the training set
na_vars = [
    var for var in house_df.columns
    if X_train[var].isnull().sum()>0 and X_train[var].dtypes!='object'
]

# Report the share of missing values per column (the format string has exactly two placeholders)
for var in na_vars:
    print("{:15s} has {:.2f}% of its values missing".format(var, X_train[var].isnull().mean()*100))

LotFrontage     has 17.73% of its values missing
MasVnrArea      has 0.46% of its values missing
GarageYrBlt     has 5.63% of its values missing


In [43]:
# Replace missing numerical values with the mode and keep a binary <var>_na
# indicator column (1 = the value was missing) for each imputed variable.
# NOTE: re-running this cell after imputation rewrites the indicators as all zeros,
# because the NaNs they are derived from are gone by then.
for var in na_vars:

    # The mode is computed on the training set only and reused for the test set.
    mode_val = X_train[var].mode()[0]

    X_train[var+'_na'] = np.where(X_train[var].isnull(), 1, 0)
    # Assign the filled column back instead of fillna(inplace=True) on a selected
    # column: that form is chained assignment and may leave the frame unchanged.
    X_train[var] = X_train[var].fillna(mode_val)

    X_test[var+'_na'] = np.where(X_test[var].isnull(), 1, 0)
    X_test[var] = X_test[var].fillna(mode_val)

X_train[na_vars].isnull().sum()

LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

In [46]:
# Binary indicator columns flagging which rows originally had a missing value (1 = missing)
X_train.loc[:, ['LotFrontage_na', 'MasVnrArea_na', 'GarageYrBlt_na']]

Unnamed: 0,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
930,0,0,0
656,0,0,0
45,0,0,0
1348,1,0,0
55,0,0,0
...,...,...,...
763,0,0,0
835,0,0,0
1216,0,0,0
559,1,0,0


In [48]:
# Sanity check: list any imputed numerical column that still contains NaNs in the test set
[var for var in na_vars if X_test[var].isnull().any()]

[]

## Temporal Variable
Replace each raw year column with the number of years elapsed between that event and the year the house was sold (`YrSold`).

In [49]:
def elapsed_year(df, var):
    """Return a copy of ``df`` where ``var`` holds the years elapsed until the sale.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'YrSold' column and the column named by ``var``.
    var : str
        Name of the year column to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``var`` is replaced by ``df['YrSold'] - df[var]``.
        The frame passed in is left unmodified.
    """
    # Work on a copy so the caller's frame is never altered as a side effect;
    # callers are expected to use the returned frame.
    data = df.copy()
    data[var] = data['YrSold'] - data[var]
    return data

In [50]:
# Convert each year column into "years before the sale" on both splits.
# NOTE: this cell is not idempotent; running it a second time subtracts from YrSold again.
for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    X_train, X_test = elapsed_year(X_train, var), elapsed_year(X_test, var)

In [52]:
# Sanity check: list any converted year column that contains NaNs in the test set
[var for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt'] if X_test[var].isnull().any()]

[]

## Numerical Variables
Apply a log transformation to the numerical variables that contain no zeros (the logarithm is not finite at zero).

In [53]:
# Replace each listed column by its natural logarithm, in the training and the test split.
for var in ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']:
    for frame in (X_train, X_test):
        frame[var] = np.log(frame[var])

In [56]:
# Sanity check: list any log-transformed column that contains NaNs in the test set
[var for var in ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice'] if X_test[var].isnull().any()]

[]

In [55]:
# Sanity check: list any log-transformed column that contains NaNs in the training set
[var for var in ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice'] if X_train[var].isnull().any()]

[]

## Categorical Variables

In [58]:
# Names of all object-dtype (categorical) columns, in their original column order
cat_vars = [col for col, col_dtype in X_train.dtypes.items() if col_dtype == 'object']

In [59]:
def find_frequent_labels(data, var, rare_pct):
    """
    Find the labels of a categorical variable that are NOT rare.

    inputs:
    data: A dataframe(df)
    var: Name of a categorical column of that dataframe(str)
    rare_pct: Fraction of rows (between 0 and 1) that a label must exceed
    to be considered frequent(float)
    outputs:
    a pandas Index holding the labels of `var` whose share of the rows in
    `data` is strictly greater than rare_pct
    """
    # Count rows per label directly (size) rather than counting the non-null
    # values of another column, so the result depends only on `var`.
    # The denominator is every row of `data`, including rows where `var` is NaN.
    label_share = data.groupby(var).size() / len(data)
    return label_share[label_share > rare_pct].index

for var in cat_vars:
    ferquent