# 1) Import packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import Lasso

from sklearn.metrics import mean_squared_error
from math import sqrt

from sklearn.externals import joblib



# 2) Load Data

In [2]:
# Read the raw training table from disk, print its (rows, columns) shape,
# and preview the first five rows as the cell output.
house_df = pd.read_csv('data/train.csv')
print(house_df.shape)
house_df.head()

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# 3) Train-Test Split

In [3]:
# Hold out 10% of the rows for testing. The full frame (SalePrice included) is
# passed as X on purpose: later cells read X_train['SalePrice'] when grouping
# categories by target and when extracting the final targets.
X_train, X_test, y_train, y_test = train_test_split(
    house_df,
    house_df['SalePrice'],
    test_size=0.1,
    random_state=0,
)

X_train.shape, X_test.shape

((1314, 81), (146, 81))

# 4) Selected Features

In [4]:
# The first column of the CSV holds the previously selected feature names;
# 'LotFrontage' is appended to that list by hand.
selected_features_df = pd.read_csv('data/selected_features.csv')
features = selected_features_df.iloc[:, 0].tolist() + ['LotFrontage']
features

['MSSubClass',
 'MSZoning',
 'Neighborhood',
 'OverallQual',
 'OverallCond',
 'YearRemodAdd',
 'RoofStyle',
 'MasVnrType',
 'BsmtQual',
 'BsmtExposure',
 'HeatingQC',
 'CentralAir',
 '1stFlrSF',
 'GrLivArea',
 'BsmtFullBath',
 'KitchenQual',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageCars',
 'PavedDrive',
 'LotFrontage']

## 4.1) Missing Categorical values 

In [5]:
# Selected features that are object-dtype AND contain at least one NaN in the training split.
vars_with_na = [
    var for var in features
    if X_train[var].dtypes == 'object' and X_train[var].isnull().sum() > 0
]

n_train_rows = X_train.shape[0]
for var in vars_with_na:
    pct_missing = X_train[var].isnull().sum() / n_train_rows * 100
    print("{:12s} is missing {:5.2f}% of its values.".format(var, pct_missing))

MasVnrType   is missing  0.46% of its values.
BsmtQual     is missing  2.44% of its values.
BsmtExposure is missing  2.51% of its values.
FireplaceQu  is missing 47.26% of its values.
GarageType   is missing  5.63% of its values.
GarageFinish is missing  5.63% of its values.


We fill these NAs with a generic 'Missing' tag.

In [6]:
def fill_cat_na(df, var_list):
    """
    Tags missing entries of the given columns with the string 'Missing'.
    Inputs:
        df: A DataFrame that may contain NAs in the listed columns
        var_list: A list with the names of the columns to fill
    Outputs: A copy of df in which every NA found in the var_list columns
             has been replaced by 'Missing'; df itself is left unchanged
    """
    filled = df.copy()
    # Fill on the untouched input and write the result into the copy.
    replacements = df[var_list].fillna(value='Missing')
    filled[var_list] = replacements
    return filled

In [7]:
# Tag the missing categorical values in both splits, using the column list
# that was derived from the training split.
X_train, X_test = (fill_cat_na(frame, vars_with_na) for frame in (X_train, X_test))

# Total number of NAs left in those columns of the training split.
X_train[vars_with_na].isnull().sum().sum()

0

 ## 4.2) Missing Continuous Values

In [8]:
# Selected features that are NOT object-dtype and contain at least one NaN in the training split.
vars_with_na = [
    var for var in features
    if X_train[var].dtypes != 'object' and X_train[var].isnull().sum() > 0
]

n_train_rows = X_train.shape[0]
for var in vars_with_na:
    pct_missing = X_train[var].isnull().sum() / n_train_rows * 100
    print("{} is missing {:.2f}% of its values.".format(var, pct_missing))

LotFrontage is missing 17.73% of its values.


We impute the missing values of the continuous variables with their mode (the most commonly occurring value), and then save a dictionary of these mode values to a .npy file so that they can be reused by the model in production.

In [9]:
# Impute NaNs in each continuous feature with the mode learned on the training
# split only, and record that value so it can be reused outside this notebook.
mode_var_dict = dict()

for var in vars_with_na:
    mode_val = X_train[var].mode()[0]
    mode_var_dict[var] = mode_val

    # Assign the filled column back to the frame. `frame[col].fillna(..., inplace=True)`
    # operates on a temporary Series: it is deprecated chained assignment and does not
    # update the frame at all when pandas Copy-on-Write is active.
    X_train[var] = X_train[var].fillna(mode_val)
    X_test[var] = X_test[var].fillna(mode_val)

# np.save stores the dict as a pickled object array; loading it back needs
# np.load(..., allow_pickle=True).item().
np.save('data/mode_var_dict.npy', mode_var_dict)
X_train[vars_with_na].isnull().sum()

LotFrontage    0
dtype: int64

## 4.3 Temporal Variables

The Lasso feature selection chose one of the engineered temporal features, 'YearRemodAdd', so we have to construct that variable here: the number of years elapsed between the remodel and the sale.

In [10]:
def elapsed_year(df, var, ref_var='YrSold'):
    """
    Replaces a year column with the years elapsed between it and a reference year.
    Inputs:
        df: A DataFrame containing both the `var` and `ref_var` columns
        var: Name of the year column to overwrite with the elapsed years
        ref_var: Name of the reference year column (defaults to 'YrSold')
    Outputs: A new DataFrame in which column `var` holds df[ref_var] - df[var];
             the DataFrame passed in is not modified
    """
    # Work on a copy (as fill_cat_na does) so the caller's frame is never mutated.
    X = df.copy()
    X[var] = X[ref_var] - X[var]
    return X

In [11]:
# Turn 'YearRemodAdd' into "YrSold minus YearRemodAdd" in both splits.
X_train = elapsed_year(df=X_train, var='YearRemodAdd')
X_test = elapsed_year(df=X_test, var='YearRemodAdd')

## 4.4 Numerical Variables Log transformation

In [12]:
# Natural-log transform of the listed numeric columns (the target included),
# applied column by column to both splits.
for frame in (X_train, X_test):
    for var in ['LotFrontage', '1stFlrSF', 'GrLivArea', 'SalePrice']:
        frame[var] = np.log(frame[var])

## 4.5 Categorical Features

We want to eliminate all rare categories in our categorical features

In [13]:
# Selected features whose training-split dtype is object, in the order of `features`.
cat_vars = [
    var for var, dtype in X_train[features].dtypes.items()
    if dtype == 'object'
]
cat_vars

['MSZoning',
 'Neighborhood',
 'RoofStyle',
 'MasVnrType',
 'BsmtQual',
 'BsmtExposure',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'PavedDrive']

In [14]:
def find_freq_labels(df, var, rare_pct, target='SalePrice'):
    """
    Finds the labels of a categorical feature that are not rare.
    Inputs:
        df: A DataFrame containing the `var` and `target` columns
        var: Name of the categorical column whose labels are inspected
        rare_pct: Fraction of len(df); a label is kept only when its share
                  of rows is strictly greater than this value
        target: Column whose non-null entries are counted per label
                (defaults to 'SalePrice')
    Outputs: A pandas Index holding the frequent labels of `var`
    """
    # Count only the target column per label. Counting every column of the
    # frame just to pick one afterwards is wasted work, and no defensive copy
    # is needed because nothing here writes to df.
    label_share = df.groupby(var)[target].count() / len(df)
    return label_share[label_share > rare_pct].index

# For every categorical feature, learn on the training split which labels cover
# more than 1% of the rows, then relabel everything else as 'Rare' in both splits.
freq_labels_dict = {}

for var in cat_vars:
    frequent_labels = find_freq_labels(X_train, var, 0.01)
    freq_labels_dict[var] = frequent_labels

    for frame in (X_train, X_test):
        is_frequent = frame[var].isin(frequent_labels)
        frame[var] = np.where(is_frequent, frame[var], 'Rare')

# Persist the per-feature frequent labels for reuse outside this notebook.
np.save('data/FreqLabels.npy', freq_labels_dict)

In [15]:
# Display the frequent labels kept for each categorical feature.
freq_labels_dict

{'MSZoning': Index(['FV', 'RH', 'RL', 'RM'], dtype='object', name='MSZoning'),
 'Neighborhood': Index(['Blmngtn', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor',
        'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NWAmes',
        'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW',
        'Somerst', 'StoneBr', 'Timber'],
       dtype='object', name='Neighborhood'),
 'RoofStyle': Index(['Gable', 'Hip'], dtype='object', name='RoofStyle'),
 'MasVnrType': Index(['BrkFace', 'None', 'Stone'], dtype='object', name='MasVnrType'),
 'BsmtQual': Index(['Ex', 'Fa', 'Gd', 'Missing', 'TA'], dtype='object', name='BsmtQual'),
 'BsmtExposure': Index(['Av', 'Gd', 'Missing', 'Mn', 'No'], dtype='object', name='BsmtExposure'),
 'HeatingQC': Index(['Ex', 'Fa', 'Gd', 'TA'], dtype='object', name='HeatingQC'),
 'CentralAir': Index(['N', 'Y'], dtype='object', name='CentralAir'),
 'KitchenQual': Index(['Ex', 'Fa', 'Gd', 'TA'], dtype='object', name='KitchenQual'),
 'Fireplace

We also want to change these strings into ordinal categorical features.

The encoding is ordinal: each category's integer code is its rank when the categories are sorted by their mean sale price in the training set (0 for the lowest mean).

In [16]:
def replace_categories(train, test, var, target):
    """
    Encodes a categorical feature as integers ordered by mean target value.
    Inputs:
        train: Training DataFrame containing the `var` and `target` columns
        test: Test DataFrame containing the `var` column
        var: Name of the categorical column to encode
        target: Name of the column whose per-category mean defines the order
    Outputs: A tuple (ordinal_labels, train, test) where ordinal_labels maps
             each category seen in train to its rank (0 = lowest mean target),
             and train/test are copies of the inputs with `var` replaced by
             those ranks. Categories absent from train map to NaN.
    """
    train = train.copy()
    test = test.copy()

    # Select the target column BEFORE aggregating. Averaging the whole frame
    # computes means nobody uses and raises TypeError on non-numeric columns
    # in pandas >= 2.0, where numeric_only defaults to False.
    ordered_labels = train.groupby(var)[target].mean().sort_values().index
    ordinal_labels = {label: rank for rank, label in enumerate(ordered_labels)}

    train[var] = train[var].map(ordinal_labels)
    test[var] = test[var].map(ordinal_labels)

    return ordinal_labels, train, test

In [17]:
# Ordinal-encode every categorical feature in both splits and keep each
# feature's label -> rank mapping.
ordinal_label_dict = {}
for var in cat_vars:
    ordinal_label_dict[var], X_train, X_test = replace_categories(
        X_train, X_test, var, "SalePrice"
    )

# Persist the mappings for reuse outside this notebook.
np.save('data/OrdinalLabels.npy', ordinal_label_dict)

In [18]:
# Display the label -> rank mapping learned for each categorical feature.
ordinal_label_dict

{'MSZoning': {'Rare': 0, 'RM': 1, 'RH': 2, 'RL': 3, 'FV': 4},
 'Neighborhood': {'IDOTRR': 0,
  'MeadowV': 1,
  'BrDale': 2,
  'Edwards': 3,
  'BrkSide': 4,
  'OldTown': 5,
  'Sawyer': 6,
  'SWISU': 7,
  'NAmes': 8,
  'Mitchel': 9,
  'SawyerW': 10,
  'Rare': 11,
  'NWAmes': 12,
  'Gilbert': 13,
  'Blmngtn': 14,
  'CollgCr': 15,
  'Crawfor': 16,
  'ClearCr': 17,
  'Somerst': 18,
  'Timber': 19,
  'StoneBr': 20,
  'NridgHt': 21,
  'NoRidge': 22},
 'RoofStyle': {'Gable': 0, 'Rare': 1, 'Hip': 2},
 'MasVnrType': {'None': 0, 'Rare': 1, 'BrkFace': 2, 'Stone': 3},
 'BsmtQual': {'Missing': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
 'BsmtExposure': {'Missing': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
 'HeatingQC': {'Rare': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
 'CentralAir': {'N': 0, 'Y': 1},
 'KitchenQual': {'Fa': 0, 'TA': 1, 'Gd': 2, 'Ex': 3},
 'FireplaceQu': {'Po': 0, 'Missing': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
 'GarageType': {'Missing': 0,
  'Rare': 1,
  'Detchd': 2,
  'Basment': 3,
  '

One last check to make sure all features have no NAs

In [19]:
# Selected features that still contain any NaN in the training split (expected: none).
[var for var in features if X_train[var].isnull().any()]

[]

In [20]:
# Selected features that still contain any NaN in the test split (expected: none).
[var for var in features if X_test[var].isnull().any()]

[]

## 4.6 Feature Scaling

In [21]:
# Re-derive the targets from the processed frames: this overwrites the y_train /
# y_test returned by train_test_split with the log-transformed SalePrice.
y_train, y_test = X_train['SalePrice'], X_test['SalePrice']

In [23]:
# Fit the min-max scaler on the training split's selected features only,
# then persist it for reuse outside this notebook.
scaler = MinMaxScaler().fit(X_train[features])

joblib.dump(scaler, 'data/scaler.pkl')

['data/scaler.pkl']

In [25]:
# Scale both splits with the fitted scaler. The frames are rebuilt from the
# returned arrays, so they keep only the selected feature columns and get a
# fresh default index.
train_scaled = scaler.transform(X_train[features])
test_scaled = scaler.transform(X_test[features])

X_train = pd.DataFrame(train_scaled, columns=features)
X_test = pd.DataFrame(test_scaled, columns=features)

# 5 Train Model

In [28]:
# Fit the Lasso regression on the scaled features against the log-transformed
# target, then persist the fitted model.
lin_model = Lasso(alpha=0.005, random_state=0).fit(X_train, y_train)

joblib.dump(lin_model, 'data/lasso_regression.pkl')

['data/lasso_regression.pkl']

In [29]:
# Targets and predictions are in log space; exponentiate both so the errors
# are reported on the original price scale.
pred = lin_model.predict(X_train)
train_mse = mean_squared_error(np.exp(y_train), np.exp(pred))
print("Train MSE: {:.2f}".format(train_mse))
print("Train RMSE: {:.2f}".format(sqrt(train_mse)))

Train MSE: 1087435415.44
Train RMSE: 32976.29


In [30]:
# Targets and predictions are in log space; exponentiate both so the errors
# are reported on the original price scale.
pred = lin_model.predict(X_test)
test_mse = mean_squared_error(np.exp(y_test), np.exp(pred))
print("Test MSE: {:.2f}".format(test_mse))
print("Test RMSE: {:.2f}".format(sqrt(test_mse)))

Test MSE: 1405259552.26
Test RMSE: 37486.79
