Import Packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Model Version1 (Lasso & Ridge)

In [2]:
# import data version1
# `%store -r` restores variables that were persisted with `%store` in an earlier
# kernel session, so both names must have been stored beforehand
# (presumably by the notebook that prepared this data version — TODO confirm).
%store -r train_v1
%store -r test_v1

In [3]:
# Separate the predictors from the target.
# The row identifier and the target itself are kept out of the feature matrix.
X = train_v1.drop(columns=['id', 'saleprice'])
# The model is trained on the natural log of the sale price;
# predictions are mapped back to prices with np.exp further down.
y = np.log(train_v1['saleprice'])

In [4]:
# Hold out part of the rows for evaluation. No test_size is passed, so
# train_test_split's default proportions apply; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
# Baseline: plain linear regression without any regularisation.
# fit() hands back the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)

In [6]:
# Lasso model.
# There are many features here, so Lasso is used to limit overfitting
# by zeroing out the coefficients of unnecessary features.

# Candidate Lasso alphas: 100 log-spaced values from 10^-3 to 10^0.
l_alphas = np.logspace(start=-3, stop=0, num=100)

# Cross-validate (5 folds) over the candidate Lasso alphas and
# fit the model on the training set with the selected alpha.
lasso_model = LassoCV(alphas=l_alphas, cv=5).fit(X_train, y_train)

In [7]:
# Candidate ridge alphas: 100 exponents evenly spaced between 0 and 5,
# i.e. alphas log-spaced from 10^0 to 10^5.
r_alphas = np.logspace(start=0, stop=5, num=100)

# Cross-validate (5 folds, scored by R^2) over the candidate ridge alphas and
# fit the model on the training set with the selected alpha.
ridge_model = RidgeCV(alphas=r_alphas, scoring='r2', cv=5).fit(X_train, y_train)

Evaluate the model performance

In [8]:
# Cross validation:
# mean score of the plain linear model over 5 folds of the training set
np.mean(cross_val_score(lr, X_train, y_train, cv=5))

-8.24000357289267e+21

In [9]:
# mean score of the lasso model over 5 folds of the training set
np.mean(cross_val_score(lasso_model, X_train, y_train, cv=5))

0.8564891506890475

In [10]:
# mean score of the ridge model over 5 folds of the training set
np.mean(cross_val_score(ridge_model, X_train, y_train, cv=5))

0.8654405198887831

In [11]:
# R^2 of the linear model on the rows it was fitted on and on the held-out rows
y_pred_train_lr = lr.predict(X_train)
y_pred_test_lr = lr.predict(X_test)
# one print call, newline-separated: training score first, then testing score
print(
    r2_score(y_train, y_pred_train_lr),
    r2_score(y_test, y_pred_test_lr),
    sep='\n',
)

0.9366456534512959
-3.146552826714644e+21


In [12]:
# R^2 of the lasso model on the rows it was fitted on and on the held-out rows
y_pred_train_lasso = lasso_model.predict(X_train)
y_pred_test_lasso = lasso_model.predict(X_test)
# one print call, newline-separated: training score first, then testing score
print(
    r2_score(y_train, y_pred_train_lasso),
    r2_score(y_test, y_pred_test_lasso),
    sep='\n',
)

0.9112326711407754
0.8913952676800688


In [13]:
# R^2 of the ridge model on the rows it was fitted on and on the held-out rows
y_pred_train_ridge = ridge_model.predict(X_train)
y_pred_test_ridge = ridge_model.predict(X_test)
# one print call, newline-separated: training score first, then testing score
print(
    r2_score(y_train, y_pred_train_ridge),
    r2_score(y_test, y_pred_test_ridge),
    sep='\n',
)

0.9121363164363363
0.8848590813733684


In [14]:
# check MSE of the model
def MSE(y, y_hat):
    """Return the mean squared error between observed and predicted values.

    Parameters
    ----------
    y : array-like
        Observed target values (list, NumPy array or pandas Series).
    y_hat : array-like
        Predicted values, paired with ``y`` by position.

    Returns
    -------
    numpy.float64
        Mean of the squared element-wise differences.
    """
    # Convert to plain float arrays before subtracting: two pandas objects
    # would be aligned on index labels instead of position (yielding NaN
    # when the labels differ), and plain lists do not support `-` at all.
    observed = np.asarray(y, dtype=float)
    predicted = np.asarray(y_hat, dtype=float)
    return np.mean((observed - predicted) ** 2)

# MSE of the lasso model: training split first, then testing split
print(MSE(y_train, y_pred_train_lasso), MSE(y_test, y_pred_test_lasso), sep='\n')

0.014902695739292298
0.018812830094928568


In [15]:
# MSE of the ridge model: training split first, then testing split
print(MSE(y_train, y_pred_train_ridge), MSE(y_test, y_pred_test_ridge), sep='\n')

0.014750987322816883
0.019945047447064948


Predict the test data

In [16]:
# I decided to use the lasso model since it performs much better than lr.
# The models were fitted without the 'id' column, so it is removed before predicting.
test_predict = test_v1.drop(columns=['id'])

In [17]:
# use lasso to predict
# The target was log-transformed before fitting, so the raw predictions are
# log prices; np.exp maps them back to the original price scale.
predict_result_log_lasso = lasso_model.predict(test_predict)
predict_result_lasso = np.exp(predict_result_log_lasso)

In [18]:
# take a peek at the first 10 predicted prices
predict_result_lasso[:10]

array([118064.0234087 , 159662.84385232, 217873.16530541,  99054.72494972,
       175490.70257947,  94538.36747051, 109772.18974693, 157373.80855872,
       178043.36929289, 161913.21907889])

In [19]:
# use ridge to predict
# The target was log-transformed before fitting, so the raw predictions are
# log prices; np.exp maps them back to the original price scale.
predict_result_log_ridge = ridge_model.predict(test_predict)
predict_result_ridge = np.exp(predict_result_log_ridge)

In [31]:
# take a peek at the first 10 predicted prices
predict_result_ridge[:10]

array([107142.26318791, 170659.2012647 , 221708.83919119, 101661.37405823,
       177077.29983749,  97277.03181634, 109903.55855653, 159966.90997439,
       171890.78364619, 163136.79556556])

In [20]:
# Prepare the id column for the submission files and renumber the rows 0..n-1
# so they pair up by position with the prediction frames built below.
# rename() is a no-op once 'id' is already renamed and reset_index(drop=True)
# only renumbers, so re-running this cell is harmless; a plain reset_index()
# would insert an extra 'index' column on every run.
test_v1 = test_v1.rename(columns={'id': 'Id'}).reset_index(drop=True)

In [21]:
# result for lasso

In [22]:
# Wrap the lasso price predictions in a single-column frame
predict_result_v1_lasso = pd.DataFrame({'SalePrice': predict_result_lasso})

In [23]:
# Assemble the lasso submission from the test ids and the predicted prices.
# Both values are Series, so pandas pairs them by index label.
# NOTE(review): the price column is written as lowercase 'saleprice' although the
# intermediate frame names it 'SalePrice' — confirm which header the submission expects.
kg_v1_lasso = pd.DataFrame({'Id':test_v1['Id'], 'saleprice': predict_result_v1_lasso['SalePrice']})

In [24]:
# Write the lasso submission; index=False keeps the row index out of the file.
kg_v1_lasso.to_csv('./datasets/kaggle_submission_v1_lasso.csv', index=False)

In [25]:
# result for ridge

In [26]:
# Wrap the ridge price predictions in a single-column frame
predict_result_v1_ridge = pd.DataFrame({'SalePrice': predict_result_ridge})

In [27]:
# Assemble the ridge submission from the test ids and the predicted prices.
# Both values are Series, so pandas pairs them by index label.
# NOTE(review): the price column is written as lowercase 'saleprice' although the
# intermediate frame names it 'SalePrice' — confirm which header the submission expects.
kg_v1_ridge = pd.DataFrame({'Id':test_v1['Id'], 'saleprice': predict_result_v1_ridge['SalePrice']})

In [28]:
# Write the ridge submission; index=False keeps the row index out of the file.
kg_v1_ridge.to_csv('./datasets/kaggle_submission_v1_ridge.csv', index=False)

In [29]:
# Preview the first rows of the lasso submission
kg_v1_lasso.head()

Unnamed: 0,Id,saleprice
0,2658,118064.023409
1,2718,159662.843852
2,2414,217873.165305
3,1989,99054.72495
4,625,175490.702579


In [30]:
# Preview the first rows of the ridge submission
kg_v1_ridge.head()

Unnamed: 0,Id,saleprice
0,2658,107142.263188
1,2718,170659.201265
2,2414,221708.839191
3,1989,101661.374058
4,625,177077.299837
