In [1]:
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('colheader_justify', 'left', 'display.max_colwidth', -1, 'display.max_columns', None, 'display.max_rows', None)

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
import os
from IPython.core.display import HTML

In [2]:
#pandas styling
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('colheader_justify', 'left', 'display.max_colwidth', -1, )

pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', None)  #or 1000
pd.set_option('display.max_rows', None)  #or 1000
pd.set_option('precision', 0)

In [3]:
import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold, train_test_split, ShuffleSplit, GridSearchCV, cross_val_score
from sklearn.linear_model import LassoCV, RidgeCV, Lasso, Ridge, ElasticNet, ElasticNetCV
import xgboost as xgb
import lightgbm as lgbm

from scipy import stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p

In [4]:
train=pd.read_csv('...train_cleaned.csv')
test=pd.read_csv('...test_cleaned.csv')

In [5]:
# manage data types
cols = train.dtypes[train.dtypes=='object'].index.to_list()
cols.extend(['YearBuilt', 'YearRemodAdd', 'GarageYrBlt'])
for col in cols:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

In [6]:
# manage Id column
testid = test['Id'] # will be needed for a submussion
train.drop('Id', inplace=True, axis=1)
test.drop('Id', inplace=True, axis=1)

In [7]:
# save target data
y_train = train['SalePrice'].values 
train = train.drop('SalePrice', axis=1)

### transforming numerical

In [8]:
# transforming target
y_train = np.log1p(y_train)

In [9]:
def numcode_find_skew(df):
    nums = list(df.select_dtypes(include=[np.number]).columns.values)
    skews = df[nums].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
    skews = skews[skews>0.5]
    print('There are {} features should be Skewed'.format(len(skews)))
    print(skews)
    
    Lambda=0.15
    for col in skews.index:
        df[col] = boxcox1p(df[col], Lambda)

In [10]:
numcode_find_skew(train)

There are 13 features should be Skewed
EnclosedPorch   3.07
MasVnrArea      2.70
OpenPorchSF     2.38
LotArea         2.26
WoodDeckSF      1.43
MSSubClass      1.40
GrLivArea       1.12
HalfBath        0.67
TotRmsAbvGrd    0.65
LotFrontage     0.64
Fireplaces      0.64
TotalBsmtSF     0.58
BsmtFullBath    0.58
dtype: float64


In [11]:
def numcode_boxcox_skew(df):
    nums = list(df.select_dtypes(include=[np.number]).columns.values)
    skews = df[nums].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
    skews = skews[skews>0.5]
    for col in skews.index:
        df[col] = boxcox1p(df[col], 0.15)

In [12]:
numcode_boxcox_skew(train)
numcode_boxcox_skew(test)

In [13]:
numcode_find_skew(train)

There are 2 features should be Skewed
EnclosedPorch   2.07
HalfBath        0.54
dtype: float64


In [14]:
numcode_find_skew(test)

There are 3 features should be Skewed
EnclosedPorch   1.90
MasVnrArea      0.67
HalfBath        0.61
dtype: float64


In [15]:
# should be deleted
todel = ['EnclosedPorch', 'HalfBath', 'MasVnrArea']
train.drop(todel, axis=1, inplace=True)
test.drop(todel, axis=1, inplace=True)

### encoding categorical

In [16]:
def catcode(df):
    cols = df.dtypes[df.dtypes=='category'].index
    print('There are {} features from {} were encoded'.format(len(cols), df.shape[1]))
    for col in cols:
        tdf = pd.DataFrame(df[col])
        tdf = pd.get_dummies(tdf)
        tdf = tdf.drop(tdf.columns[0], axis=1)
        df = pd.concat([df, tdf], axis=1).reset_index(drop=True)
        df.drop(col, inplace=True, axis=1) #Delete actual column from dataframe
    print('New shape is {}'.format(df.shape))
    return df

In [17]:
train = catcode(train)
test = catcode(test)

There are 33 features from 47 were encoded
New shape is (1448, 518)
There are 33 features from 47 were encoded
New shape is (1459, 508)


In [18]:
# manage shapes 
set1 = set(train.columns.tolist())
set2 = set(test.columns.tolist())
todel = list(set1.difference(set2))
train.drop(todel, axis=1, inplace=True)
todel2 = list(set2.difference(set1))
test.drop(todel2, axis=1, inplace=True)

### modeling

In [19]:
#Defining cross validation strategy
cross_val = KFold(n_splits=10, shuffle=True, random_state=42)

#Define function to calculate rmse during training
def rmse_train(model, x, y):
    rmse = np.sqrt(-cross_val_score(model, x, y, scoring='neg_mean_squared_error', cv=cross_val, n_jobs=-1))
    return rmse.mean()

#Find rmse for prediction
def rmse_pred(y, y_pred):
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    return rmse

In [20]:
#Building Function for Submission
def Submission(prediction):
    df_pred = pd.DataFrame({'Id':testid, 'SalePrice':prediction})
    print('Sample Prediction:', prediction[:5])
    
    #Defining file name
    tday = datetime.today()
    tm = str(tday.date().day)+str(tday.date().month)+str('_')+str(tday.time().hour)+str(tday.time().minute)+str(tday.time().second)
    fn = '...Subm.csv'
    fn = str(fn)+str(tm)+str('.csv')
    
    #Saving prediction to csv
    df_pred['Id'] +=1
    df_pred.to_csv(fn, index=False)
    print('Submission file saved to', os.path.realpath(fn))

In [21]:
#Defining training inputs
X_train = train.values
y_train = y_train

#Define test inputs
X_test = test.values

X_train.shape, X_test.shape

((1448, 497), (1459, 497))

In [22]:
# Lasso Model
alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 2, 10, 20, 50]
lcv = LassoCV(alphas=alphas, random_state=42, cv=cross_val, n_jobs=-1, max_iter=10000)
lcv.fit(X_train, y_train)

#Optimum alpha value for lasso model
lcv.alpha_

0.0003

In [23]:
regressor_lasso = Lasso(alpha=0.0003, random_state=42)
regressor_lasso.fit(X_train,y_train)

Lasso(alpha=0.0003, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=42,
      selection='cyclic', tol=0.0001, warm_start=False)

In [24]:
#Review RMSE values for Lasso
print('Training RMSE: {:.4f} '.format(rmse_train(regressor_lasso, X_train, y_train)))
#Making prediction and review Test RMSE
print('Testing RMSE: {:.4f}'.format(rmse_pred(y_train, regressor_lasso.predict(X_train))))

Training RMSE: 0.1255 
Testing RMSE: 0.1042


In [25]:
#Scoring Lasso prediction
pred = regressor_lasso.predict(X_test)
pred = np.expm1(pred)
Submission(pred)

Sample Prediction: [111.86834267 148.55115234 184.93652996 194.9448483  194.31116059]
Submission file saved to /home/lana/Desktop/DS_blog/Outbox/HPriceSubm.csv1211_213434.csv


In [26]:
#Building Ridge Model
alphas2 = [10, 12, 16, 12.5, 17, 10.001]
rcv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', cv=cross_val)
rcv.fit(X_train, y_train)
rcv.alpha_

10.0

In [27]:
regressor_ridge = Ridge(alpha=10, max_iter=10000, random_state=42)
regressor_ridge.fit(X_train,y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=10000,
      normalize=False, random_state=42, solver='auto', tol=0.001)

In [28]:
pred_ridge = regressor_ridge.predict(X_train)

In [29]:
#Review Train RMSE values
print('Training RMSE:{:.4f}'.format(rmse_train(regressor_ridge, X_train, y_train)))
#Making prediction and review Test RMSE
print('Testing RMSE:{:.4f}'.format(rmse_pred(y_train, pred_ridge)))

Training RMSE:0.1262
Testing RMSE:0.1006


In [30]:
#Make Test Prediction
pred_ridge = regressor_ridge.predict(X_test)
pred_ridge = np.expm1(pred_ridge)
Submission(pred_ridge)

Sample Prediction: [111.77467992 145.70852176 188.28810181 199.10213971 179.13308696]
Submission file saved to /home/lana/Desktop/DS_blog/Outbox/HPriceSubm.csv1211_213441.csv


### final prediction

In [31]:
#Test both lasso and ridge
pred = regressor_lasso.predict(X_test)*0.5 + regressor_ridge.predict(X_test)*0.5
pred = np.expm1(pred)
Submission(pred)

Sample Prediction: [111.82150158 147.12301809 186.60483153 197.01258397 186.56866016]
Submission file saved to /home/lana/Desktop/DS_blog/Outbox/HPriceSubm.csv1211_213441.csv
