# Ames Housing Project

## Project Challenge Statement

#### Goal: Predict the price of homes at sale for the Aimes Iowa Housing dataset. 

Two files used to build the model. 

- train_data_cleanna.csv -- this data contains all of the training data with no missing values and outliers
- test_data_cleanna.csv -- this data contains all of the testing data with no missing values and outliers

#### Prediction Files 
- LR_first_submit.csv -- this file will be the baseline submisstion of Linear Regression Model
- lasso_first_submit.csv -- this file will be the first submission using Lasso CV
- ridge_first_submit.csv -- this file will be the first submission using Ridge CV
- esn_first_submit.csv -- this file will be the first submission using ElasticNet CV


## Table of Contents 

This Notebook is broken down into different sections for analysis purpose. The following links are connected to differenct section within the Notebook for simple navigation. 

### Contents:
- [Linear Regression Model](#Linear-Regression-Model)
- [Lasso Model](#Lasso-Model)
- [Ridge Model](#Ridge-Model)
- [ElasticNet Model](#ElasticNet-Model)

In [297]:
#Eliminate warnings 
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [298]:
# Library imports
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV, LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression, RFECV
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score


np.random.seed(42)
%matplotlib inline

In [299]:
from functions import *

In [300]:
#import Data 
train = pd.read_csv('../datasets/train.csv')

clean_train_data = pd.read_csv('../datasets/train_data_clean.csv')
clean_test_data = pd.read_csv('../datasets/test_data_clean.csv')

base_train_data = pd.read_csv('../datasets/train_data_cleanna.csv')
base_test_data = pd.read_csv('../datasets/test_data_cleanna.csv')


In [301]:
#Extract common values in X_train and X_test values
p_cols = ['Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Mas Vnr Area', 'BsmtFin SF 1', 'Total Bsmt SF',
       '1st Flr SF', '2nd Flr SF', 'Bedroom AbvGr', 'Kitchen AbvGr',
       'Garage Area', 'Wood Deck SF', 'Screen Porch', 'Neighborhood_Edwards',
       'Neighborhood_Gilbert', 'Neighborhood_NAmes',
       'Neighborhood_NWAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt',
       'Neighborhood_OldTown', 'Neighborhood_SWISU', 'Neighborhood_Sawyer',
       'Neighborhood_StoneBr', 'Exterior 1st_BrkFace',
       'Garage Cond_Ex']
X_dummy = ext_cat_features(base_train_data)

## Linear Regression Model 
build a linear regression model as the baseline model for reference

### Fit Linear Regression Model For Prediction with Selected Columns

In [302]:
X = pd.get_dummies(base_train_data, columns = X_dummy).drop(columns = ['Unnamed: 0', 'Id','PID', 'SalePrice'])
y = base_train_data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [303]:
lr = LinearRegression()
lr.fit(X_train[p_cols], y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [304]:
print('train', lr.score(X_train[p_cols], y_train))
print('test', lr.score(X_test[p_cols], y_test))

train 0.8847947975136864
test 0.897719937191588


In [305]:
#Manipulate Test data frame for prediction
X_predict = pd.get_dummies(base_test_data, columns = X_dummy).drop(columns = ['Unnamed: 0', 'Id','PID'])

In [306]:
lr_predict = lr.predict(X_predict[p_cols])
lr_predict.shape

(879,)

In [307]:
#save df procedure
id_df = clean_test_data[['Id']]
LR_firstsub_test_predict = pd.DataFrame(lr_predict, columns = ['SalePrice'])
df = id_df.join(LR_firstsub_test_predict)
df.columns = ['Id', "SalePrice"]
df = df.set_index('Id')
df.shape

(879, 1)

In [308]:
df.to_csv('../datasets/LR_first_submit.csv')

## Lasso Model

### Polynomial Transformation with Lasso Model 

In [237]:
X = pd.get_dummies(base_train_data, columns = X_dummy).drop(columns = ['Unnamed: 0', 'Id','PID', 'SalePrice'])
y = base_train_data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [280]:
#Lasso pipeline 
lasso = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),
    ('ss', StandardScaler()),
#     ('kbest',SelectKBest(f_regression, k = 150)),
    ('lasso', LassoCV())
])

In [281]:
lasso.fit(X_train[p_cols], y_train)
lasso_predict = lasso.predict(X_test[p_cols])
print("train score", lasso.score(X_train[p_cols], y_train))
print('test score', lasso.score(X_test[p_cols], y_test))
print("R2 score", r2_score(lasso_predict, y_test));

train score 0.9277912154428029
test score 0.9305381689292688
R2 score 0.922717879679237


In [284]:
#save df procedure
id_df = clean_test_data[['Id']]

lasso_firstsub_test_predict = lasso.predict(X_predict[p_cols])

lasso_firstsub_test_predict = pd.DataFrame(lasso_firstsub_test_predict, columns = ['SalePrice'])

df = id_df.join(lasso_firstsub_test_predict)

df.columns = ['Id', "SalePrice"]

df = df.set_index('Id')

df.to_csv('../datasets/lasso_first_submit.csv')

## Ridge Model

### Polynomial Transformation with Ridge Model 

In [285]:
X = pd.get_dummies(base_train_data, columns = X_dummy).drop(columns = ['Unnamed: 0', 'Id','PID', 'SalePrice'])
y = base_train_data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [286]:
#Lasso pipeline 
ridge = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),
    ('ss', StandardScaler()),
#     ('kbest',SelectKBest(f_regression, k = 150)),
    ('lasso', RidgeCV())
])

In [287]:
ridge.fit(X_train[p_cols], y_train)
ridge_predict = ridge.predict(X_test[p_cols])
print("train score", ridge.score(X_train[p_cols], y_train))
print('test score', ridge.score(X_test[p_cols], y_test))
print("R2 score", r2_score(ridge_predict, y_test));

train score 0.9471356986647035
test score 0.8994726156836211
R2 score 0.8923849884051176


In [288]:
#save df procedure
id_df = clean_test_data[['Id']]

ridge_firstsub_test_predict = ridge.predict(X_predict[p_cols])

ridge_firstsub_test_predict = pd.DataFrame(ridge_firstsub_test_predict, columns = ['SalePrice'])

df = id_df.join(ridge_firstsub_test_predict)

df.columns = ['Id', "SalePrice"]

df = df.set_index('Id')

df.to_csv('../datasets/ridge_first_submit.csv')

## ElasticNet Model

### Polynomial Transformation with ElasticNet Model 

In [292]:
X = pd.get_dummies(base_train_data, columns = X_dummy).drop(columns = ['Unnamed: 0', 'Id','PID', 'SalePrice'])
y = base_train_data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [293]:
#Lasso pipeline 
esn = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),
    ('ss', StandardScaler()),
#     ('kbest',SelectKBest(f_regression, k = 150)),
    ('ESN', ElasticNetCV())
])

In [294]:
esn.fit(X_train[p_cols], y_train)
esn_predict = esn.predict(X_test[p_cols])
print("train score", esn.score(X_train[p_cols], y_train))
print('test score', esn.score(X_test[p_cols], y_test))
print("R2 score", r2_score(esn_predict, y_test));

train score 0.5266116564754999
test score 0.5368130253976637
R2 score -2.4611002380747906


In [296]:
#save df procedure
id_df = clean_test_data[['Id']]

esn_firstsub_test_predict = esn.predict(X_predict[p_cols])

esn_firstsub_test_predict = pd.DataFrame(esn_firstsub_test_predict, columns = ['SalePrice'])

df = id_df.join(esn_firstsub_test_predict)

df.columns = ['Id', "SalePrice"]

df = df.set_index('Id')

df.to_csv('../datasets/esn_first_submit.csv')