Import Packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Model Version3 (Lasso & Ridge)

In [2]:
# Restore the version-3 train/test data previously saved with %store
%store -r train_v3
%store -r test_v3

In [3]:
# Separate the target from the predictors (this is not yet the train/test split).
# The target is modelled on the natural-log scale; 'id' is dropped together with
# 'saleprice' so that it is not used as a feature.
y = np.log(train_v3['saleprice'])
X = train_v3.drop(columns=['id', 'saleprice'])

In [4]:
# Hold out part of the rows for evaluation. random_state is fixed so the split
# is reproducible; no test_size is passed, so scikit-learn's default applies.
split_parts = train_test_split(X, y, random_state=32)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# Baseline: ordinary linear regression (no regularisation) fitted on the
# training split. fit() returns the fitted estimator itself.
lr = LinearRegression().fit(X_train, y_train)

In [6]:
# Lasso model.
# There are many features here, so to limit overfitting the L1 penalty is used
# to zero out the coefficients of unnecessary features.

# Candidate penalties: 300 log-spaced alphas from 10^-3 up to 10^0.
l_alphas = np.logspace(start=-3, stop=0, num=300)

# Cross-validate (8 folds) over the candidate alphas and fit on the training
# split; the fitted estimator uses the best lasso alpha found.
lasso_model = LassoCV(cv=8, alphas=l_alphas).fit(X_train, y_train)

In [7]:
# Candidate ridge penalties: 100 exponents evenly spaced between -3 and 5,
# converted to alphas between 10^-3 and 10^5.
r_alphas = np.logspace(start=-3, stop=5, num=100)

# Cross-validate (8 folds) over the candidate alphas, scoring each by negative
# mean squared error, and fit on the training split with the best ridge alpha.
ridge_model = RidgeCV(
    cv=8,
    scoring='neg_mean_squared_error',
    alphas=r_alphas,
).fit(X_train, y_train)

Evaluate the model performance

In [8]:
# Cross-validation check of the plain linear regression: mean score over 5 folds
# of the training split. No scoring argument is given, so the estimator's own
# score method is used.
# NOTE(review): the recorded result is a huge negative number, so the
# unregularised fit looks unstable on held-out folds; confirm the cause.
lr_cv_scores = cross_val_score(lr, X_train, y_train, cv=5)
lr_cv_scores.mean()

-2.5377341816608878e+23

In [9]:
# Cross-validation check of the lasso model: mean score over 5 folds of the
# training split.
lasso_cv_scores = cross_val_score(lasso_model, X_train, y_train, cv=5)
lasso_cv_scores.mean()

0.8943979961172623

In [10]:
# Cross-validation check of the ridge model: mean score over 5 folds of the
# training split.
ridge_cv_scores = cross_val_score(ridge_model, X_train, y_train, cv=5)
ridge_cv_scores.mean()

0.8897965576285257

In [11]:
# R^2 of the linear regression on the training split and on the held-out split
y_pred_train_lr = lr.predict(X_train)
y_pred_test_lr = lr.predict(X_test)

# Print the training R^2 first, then the testing R^2, one per line
print(
    r2_score(y_train, y_pred_train_lr),
    r2_score(y_test, y_pred_test_lr),
    sep='\n',
)

0.9442947087308642
-2.1100708387500884e+18


In [12]:
# R^2 of the lasso model on the training split and on the held-out split
y_pred_train_lasso = lasso_model.predict(X_train)
y_pred_test_lasso = lasso_model.predict(X_test)

# Print the training R^2 first, then the testing R^2, one per line
print(
    r2_score(y_train, y_pred_train_lasso),
    r2_score(y_test, y_pred_test_lasso),
    sep='\n',
)

0.9366560806260588
0.806067636905483


In [13]:
# R^2 of the ridge model on the training split and on the held-out split
y_pred_train_ridge = ridge_model.predict(X_train)
y_pred_test_ridge = ridge_model.predict(X_test)

# Print the training R^2 first, then the testing R^2, one per line
print(
    r2_score(y_train, y_pred_train_ridge),
    r2_score(y_test, y_pred_test_ridge),
    sep='\n',
)

0.9352367643491876
0.8342616405158169


In [14]:
# check MSE of the model
def MSE(y, y_hat):
    """Return the mean squared error between observed and predicted values.

    Both inputs are converted to float NumPy arrays before subtracting, so the
    values are paired by position. Without the conversion, two pandas Series
    would be aligned on their index labels, and a shuffled-index ``y`` compared
    with a default-index ``y_hat`` would silently produce NaN.

    Parameters
    ----------
    y : array-like
        Observed target values (list, ndarray or Series).
    y_hat : array-like
        Predicted values, in the same order as ``y``.

    Returns
    -------
    numpy.float64
        Mean of the squared element-wise differences.
    """
    observed = np.asarray(y, dtype=float)
    predicted = np.asarray(y_hat, dtype=float)
    return np.mean((observed - predicted) ** 2)

# MSE of the lasso model (on the log-price scale): training split first,
# then the held-out split, one per line
print(
    MSE(y_train, y_pred_train_lasso),
    MSE(y_test, y_pred_test_lasso),
    sep='\n',
)

0.01114064566610851
0.028959039938663653


In [15]:
# MSE of the ridge model (on the log-price scale): training split first,
# then the held-out split, one per line
print(
    MSE(y_train, y_pred_train_ridge),
    MSE(y_test, y_pred_test_ridge),
    sep='\n',
)

0.011390268674678884
0.024748957291526718


Predict the test data

In [16]:
# The lasso model is used for prediction because it performs much better than
# the plain linear regression. The features to predict on are the test frame
# without its 'id' column.
test_predict = test_v3.drop(columns=['id'])

In [17]:
# Predict with the lasso model. The model was fitted on log(saleprice), so its
# output is on the log scale; np.exp converts it back to prices.
predict_result_log_lasso = lasso_model.predict(test_predict)
predict_result_lasso = np.exp(predict_result_log_lasso)

In [18]:
# Take a peek at the first 10 predicted prices from the lasso model
predict_result_lasso[:10]

array([124028.91245004, 157181.49646276, 220344.84628323,  98612.9509834 ,
       170582.36266295,  91348.49828454, 103871.71319044, 149808.52158808,
       182543.53751982, 166345.00127687])

In [19]:
# Predict with the ridge model. The model was fitted on log(saleprice), so its
# output is on the log scale; np.exp converts it back to prices.
predict_result_log_ridge = ridge_model.predict(test_predict)
predict_result_ridge = np.exp(predict_result_log_ridge)

In [20]:
# Take a peek at the first 10 predicted prices from the ridge model
predict_result_ridge[:10]

array([112787.90323538, 161955.79071282, 228235.52072044,  97663.05129859,
       175065.06258879,  91095.01348188, 102528.96071629, 149828.30006124,
       168834.63485489, 165267.78507502])

In [21]:
# Rename the identifier column to 'Id' for the submission files and renumber the
# rows 0..n-1 so that test_v3['Id'] lines up with the prediction frames built
# from plain arrays.
# drop=True discards the old index instead of inserting it as an extra 'index'
# column. Without it, re-running this cell would add 'level_0' and then raise
# ValueError; with it the cell is safe to re-run.
test_v3 = test_v3.rename(columns={'id': 'Id'}).reset_index(drop=True)

In [22]:
# result for lasso

In [23]:
# Wrap the lasso price predictions in a one-column frame named 'SalePrice'
predict_result_v3_lasso = pd.DataFrame({'SalePrice': predict_result_lasso})

In [24]:
# Assemble the lasso submission: test identifiers next to the predicted prices.
# NOTE(review): the price column is written as 'saleprice' although the
# intermediate frame calls it 'SalePrice'; confirm which header is expected.
lasso_submission_columns = {
    'Id': test_v3['Id'],
    'saleprice': predict_result_v3_lasso['SalePrice'],
}
kg_v3_lasso = pd.DataFrame(lasso_submission_columns)

In [25]:
# Write the lasso submission to disk; index=False keeps the row index out of the file
lasso_submission_path = './datasets/kaggle_submission_v3_lasso.csv'
kg_v3_lasso.to_csv(lasso_submission_path, index=False)

In [26]:
# result for ridge

In [27]:
# Wrap the ridge price predictions in a one-column frame named 'SalePrice'
predict_result_v3_ridge = pd.DataFrame({'SalePrice': predict_result_ridge})

In [28]:
# Assemble the ridge submission: test identifiers next to the predicted prices.
# NOTE(review): the price column is written as 'saleprice' although the
# intermediate frame calls it 'SalePrice'; confirm which header is expected.
ridge_submission_columns = {
    'Id': test_v3['Id'],
    'saleprice': predict_result_v3_ridge['SalePrice'],
}
kg_v3_ridge = pd.DataFrame(ridge_submission_columns)

In [29]:
# Write the ridge submission to disk; index=False keeps the row index out of the file
ridge_submission_path = './datasets/kaggle_submission_v3_ridge.csv'
kg_v3_ridge.to_csv(ridge_submission_path, index=False)

In [30]:
# Preview the first five rows of the lasso submission
kg_v3_lasso.head(n=5)

Unnamed: 0,Id,saleprice
0,2658,124028.91245
1,2718,157181.496463
2,2414,220344.846283
3,1989,98612.950983
4,625,170582.362663


In [31]:
# Preview the first five rows of the ridge submission
kg_v3_ridge.head(n=5)

Unnamed: 0,Id,saleprice
0,2658,112787.903235
1,2718,161955.790713
2,2414,228235.52072
3,1989,97663.051299
4,625,175065.062589
