# Bonus material

Added Thursday afternoon after lecture on pipelines and grid search.

This is a re-work of Model 8.5 from notebook 'Project 2', and relies on the work in that notebook for deciding on features to select in the model.

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
import sklearn.metrics as metrics
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# read in cleaned data
housing = pd.read_csv('Data/train_clean.csv')
test = pd.read_csv('Data/test_clean.csv')

In [3]:
# set up class that does required feature selection and engineering on given DataFrame 

class FeatureGenerator(object):
    
    def __init__(self):
        # list of features with high correlation to sale price
        self.include_list = ['Overall Qual','Year Built','Year Remod/Add','Mas Vnr Area','Exter Qual','Bsmt Qual',
                            'Bsmt Exposure','BsmtFin SF 1','Total Bsmt SF','Heating QC','1st Flr SF','Gr Liv Area',
                            'Full Bath','Kitchen Qual','TotRms AbvGrd','Fireplaces','Fireplace Qu','Garage Finish',
                            'Garage Cars','Garage Area']
        
        # list of features to remove due to co-linearity
        self.remove_list = ['Exter Qual','1st Flr SF','TotRms AbvGrd','Fireplaces','Garage Cars'] 
        
    # required method so that class can be used in GridSearch
    def get_params(self, deep=True):
        return {}
    
    # required method so that class can be used in GridSearch
    def fit(self, X, y):
        return self   
    
    # function to be used within transform to add flags for specific columns
    def add_flags(self, features_df, full_df):
    
        flag_func = (lambda x: 1 if (x['Condition 1']=='PosA') or (x['Condition 1']=='PosB') or (x['Condition 2']=='PosA') or (x['Condition 2']=='PosB') else 0)
        
        # add flags for MS Zone
        for zone in housing['MS Zoning'].unique():
            features_df[zone] = full_df['MS Zoning'].map(lambda x: 1 if x==zone else 0)

        # add flags for Neighborhood
        for neighborhood in housing['Neighborhood'].unique():
            features_df[neighborhood] = full_df['Neighborhood'].map(lambda x: 1 if x==neighborhood else 0)

        # add flag for proximity to positive off-site feature
        features_df['Off-site flag'] = full_df.apply(flag_func, axis=1)
        
        return features_df
        
    def transform(self, X):         
        
        # remove features
        new_df = X[self.include_list].copy()
        new_df.drop(columns=self.remove_list, inplace=True)

        # add features
        self.add_flags(new_df, X)
        new_df['Exter Qual + Gr Liv Area'] = X['Exter Qual']*X['Gr Liv Area']
        new_df['Bsmt Qual + Gr Liv Area'] = X['Bsmt Qual']*X['Gr Liv Area']
        new_df['Kitchen Qual + Gr Liv Area'] = X['Kitchen Qual']*X['Gr Liv Area']

        return new_df

In [4]:
# split housing into train and test sets
X_train, X_test, y_train, y_test = train_test_split(housing.drop(columns='SalePrice', axis=1), housing['SalePrice'])

In [5]:
# set up pipeline
pipe = Pipeline([
    ('fg', FeatureGenerator()),
    ('ss', StandardScaler()),
    ('lasso', Lasso()),
])

In [18]:
# GridSearch for optimal alpha
# Re-run this cell, narrowing down on alpha each time

lasso_params = {
    'lasso__alpha':np.linspace(800,900,50)
}

# WARNING - this takes a long time
# when using pipe within GridSearch, all transformations will be run everytime
# need to look for a way around this, as output from fg,ss could be saved as only lasso needs iteration
## try using cached transformers: http://scikit-learn.org/stable/modules/pipeline.html
gs = GridSearchCV(pipe, param_grid=lasso_params, scoring='neg_mean_squared_error', cv=10)
gs.fit(X_train, y_train)

# print best alpha and best RMSE
print((-gs.best_score_)**0.5)
print(gs.best_params_)

33563.686851991355
{'lasso__alpha': 800.0}


In [19]:
# look at range of RMSE for these parameters
scores = pd.DataFrame(gs.grid_scores_)
best_alpha_CV = scores[scores['parameters']==gs.best_params_]['cv_validation_scores']

print('max: ',(-np.min(list(best_alpha_CV.values)))**0.5)
print('min: ',(-np.max(list(best_alpha_CV.values)))**0.5)

max:  54555.97759485246
min:  23822.11605767968




In [20]:
# hold-out set RMSE
(-gs.score(X_test, y_test))**0.5

27425.14112604708

In [28]:
# run model on real test set and save results
model_predict = gs.predict(test)

# # fit and save model
model_predict_df = pd.DataFrame(model_predict, columns=['SalePrice'], index=test['Id'].astype(int))
model_predict_df.to_csv('./Data/model_last_submission.csv')

In [None]:
# score: 40156 
# Note: this was only fitted on training data, unlike model 8.5