In [1]:
import pandas as pd
import missingno as msno
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler, PowerTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import mean_squared_error

In [2]:
combined_df = pd.read_csv('combined_df.csv')

In [3]:
new_train_data = combined_df.loc[combined_df['Label_train']==1].drop(columns=['Id','Label_test','Label_train'])
new_test_data = combined_df.loc[combined_df['Label_test']==1].drop(columns=['Id','Label_test','Label_train'])
X_train, X_test, y_train, y_test = train_test_split(new_train_data.drop('SalePrice', axis=1),new_train_data['SalePrice'], random_state=42)
y_train = np.log1p(y_train.values.ravel())

In [4]:
pre_precessing_pipeline = make_pipeline(RobustScaler(),
                                       )

X_train = pre_precessing_pipeline.fit_transform(X_train)
X_test = pre_precessing_pipeline.transform(X_test)

print(X_train.shape)
print(X_test.shape)

(1538, 238)
(513, 238)


# OLS

In [5]:
ols=LinearRegression()
ols.fit(X_train, y_train)
ols_scores = cross_val_score(ols, X_train, y_train, cv=100, n_jobs=-1,scoring='neg_mean_squared_error')
print(f'ols train R^2:\t\t {ols_scores.mean():.4f}')

y_pred = ols.predict(X_test)
ols_test_score = mean_squared_error(np.log1p(y_test),y_pred)
print(f'ols test R^2:\t\t {ols_test_score:.4f}')

ols train R^2:		 -0.0194
ols test R^2:		 0.0198


# RidgeCV

In [6]:
ridge_cv=RidgeCV(cv=100)
ridge_cv.fit(X_train, y_train)
ridge_cv_score = ridge_cv.score(X_train, y_train)
print(f'Ridge train R^2:\t {ridge_cv_score:.4f}')

ridge_cv_test_score = ridge_cv.score(X_test,np.log1p(y_test))
print(f'Ridge test R^2:\t\t {ridge_cv_test_score:.4f}')

Ridge train R^2:	 0.9429
Ridge test R^2:		 0.8907


In [7]:
lasso_cv=LassoCV(cv=100)
lasso_cv.fit(X_train, y_train)
lasso_cv_score = lasso_cv.score(X_train, y_train)
print(f'lasso train R^2:\t {lasso_cv_score:.4f}')

lasso_cv_test_score = lasso_cv.score(X_test,np.log1p(y_test))
print(f'lasso test R^2:\t\t {lasso_cv_test_score:.4f}')

lasso train R^2:	 0.8945
lasso test R^2:		 0.8898


In [8]:
print(f'ols {(ols_test_score-ols_scores.mean())/ols_scores.mean()*100:.2f}%')
print(f'ridge {(ridge_cv_test_score-ridge_cv_score)/ridge_cv_score*100:.2f}%')
print(f'lasso {(lasso_cv_test_score-lasso_cv_score)/lasso_cv_score*100:.2f}%')

ols -202.07%
ridge -5.53%
lasso -0.54%


# RMSE


In [9]:
y_pred = np.expm1(ols.predict(X_train))
ols_mse = mean_squared_error(np.expm1(y_train), y_pred)
train_mse = ols_mse**.5
print(f'train rmse\t\t{ols_mse**.5:,}')

y_pred = np.expm1(ols.predict(X_test))
ols_mse = mean_squared_error(y_test, y_pred)
test_mse = ols_mse**.5
print(f'test rmse\t\t{ols_mse**.5:,}')

print(f'delta of train: \t{(test_mse-train_mse)/train_mse*100:.2f}%')

train rmse		18,705.26873999436
test rmse		21,371.13367972558
delta of train: 	14.25%


In [10]:
y_pred = np.expm1(ridge_cv.predict(X_train))
ridge_cv_mse = mean_squared_error(np.expm1(y_train), y_pred)
train_mse = ridge_cv_mse**.5
print(f'train rmse\t\t{ridge_cv_mse**.5:,}')

y_pred = np.expm1(ridge_cv.predict(X_test))
ridge_cv_mse = mean_squared_error(y_test, y_pred)
test_mse = ridge_cv_mse**.5
print(f'test rmse\t\t{ridge_cv_mse**.5:,}')

print(f'delta of train: \t{(test_mse-train_mse)/train_mse*100:.2f}%')

train rmse		19,455.97368798812
test rmse		21,221.84207096002
delta of train: 	9.08%


In [11]:
y_pred = np.expm1(lasso_cv.predict(X_train))
lasso_cv_mse = mean_squared_error(np.expm1(y_train), y_pred)
train_mse = lasso_cv_mse**.5
print(f'train rmse\t\t{lasso_cv_mse**.5:,}')

y_pred = np.expm1(lasso_cv.predict(X_test))
lasso_cv_mse = mean_squared_error(y_test, y_pred)
test_mse = lasso_cv_mse**.5
print(f'test rmse\t\t{lasso_cv_mse**.5:,}')

print(f'delta of train: \t{(test_mse-train_mse)/train_mse*100:.2f}%')

train rmse		28,150.464591740198
test rmse		23,474.76088994825
delta of train: 	-16.61%
