In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
#from fastaiold.structured import *

In [3]:
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype


import matplotlib.pyplot as plt
import math
import seaborn as sns
sns.set(style="darkgrid")

#from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

import string
import warnings
warnings.filterwarnings('ignore')

In [4]:
PATH = "../../../data/house_pricing/"

In [5]:
df_train=pd.read_csv(f'{PATH}train.csv')#, index_col='Id')
df_test=pd.read_csv(f'{PATH}test.csv')#, index_col='Id')

### Y (target value) to Log, as stated at Kaggle Evaluation page

In [6]:
# for the purpose of evaluation of current competition
#df_train.SalePrice = np.log1p(df_train.SalePrice)
df_train.SalePrice = np.log1p(df_train.SalePrice)

In [7]:
print('Number of Training Examples = {}'.format(df_train.shape[0]))
print('Number of Test Examples = {}\n'.format(df_test.shape[0]))
print('Training X Shape = {}'.format(df_train.shape))
print('Training y Shape = {}\n'.format(df_train['SalePrice'].shape[0]))
print('Test X Shape = {}'.format(df_test.shape))
print('Test y Shape = {}\n'.format(df_test.shape[0]))
#print(df_train.columns)
#print(df_test.columns)

Number of Training Examples = 1460
Number of Test Examples = 1459

Training X Shape = (1460, 81)
Training y Shape = 1460

Test X Shape = (1459, 80)
Test y Shape = 1459



In [None]:
#print(df_train.info())
#df_train.sample(3)
#print(df_test.info())
#df_test.sample(3)

# Dealing with Outliers

In [None]:
fig, ax = plt.subplots()
ax.scatter(x = df_train['GrLivArea'], y = df_train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
#plt.show()

### -> To delete outliers

In [None]:
#Deleting outliers
df_train = df_train.drop(df_train[(df_train['GrLivArea']>4000) & (df_train['SalePrice']<300000)].index)

#Check the graphic again
fig, ax = plt.subplots()
ax.scatter(df_train['GrLivArea'], df_train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
#plt.show()

In [None]:
#Correlations with Target value

# Features engineering

In [8]:
#check the numbers of samples and features
print("The train data size before dropping Id feature is : {} ".format(df_train.shape))
print("The test data size before dropping Id feature is : {} ".format(df_test.shape))

#Save the 'Id' column
train_ID = df_train['Id']
test_ID = df_test['Id']

#Now drop the  'Id' colum since it's unnecessary for  the prediction process.
df_train.drop("Id", axis = 1, inplace = True)
df_test.drop("Id", axis = 1, inplace = True)

#check again the data size after dropping the 'Id' variable
print("\nThe train data size after dropping Id feature is : {} ".format(df_train.shape)) 
print("The test data size after dropping Id feature is : {} ".format(df_test.shape))


The train data size before dropping Id feature is : (1460, 81) 
The test data size before dropping Id feature is : (1459, 80) 

The train data size after dropping Id feature is : (1460, 80) 
The test data size after dropping Id feature is : (1459, 79) 


## Correlation

In [None]:
plt.figure(figsize=(10, 10))
sns.heatmap(df_train.corr())
#plt.plot()

In [None]:
"""
fig, axs = plt.subplots(nrows=2, figsize=(20, 20))

sns.heatmap(df_train.corr(), ax=axs[0], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})
sns.heatmap(df_test.corr(), ax=axs[1], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})

for i in range(2):    
    axs[i].tick_params(axis='x', labelsize=14)
    axs[i].tick_params(axis='y', labelsize=14)
    
axs[0].set_title('Training Set Correlations', size=15)
axs[1].set_title('Test Set Correlations', size=15)

plt.show()
"""

In [None]:
df_corr=df_train.corr().sort_values(kind="quicksort", ascending=False, by='SalePrice').abs()
df_corr.drop(axis=1, columns=df_corr.columns.drop('SalePrice'), inplace=True)
df_corr

# Dealing with Missing Values

In [9]:
def concat_df(train_data, test_data):
    # Returns a concatenated df of training and test set on axis 0
    return pd.concat([train_data, test_data], sort=True).reset_index(drop=True)

df_all = concat_df(df_train, df_test)

df_train.name = 'Training Set'
df_test.name = 'Test Set'
df_all.name = 'All Set' 

dfs = [df_train, df_test]

df_all.shape

(2919, 80)

In [10]:
#remember where to divide train and test
ntrain = df_train.shape[0]
ntest = df_test.shape[0]

In [91]:
#Dividing Target column (Y)
y_train = df_train.SalePrice.values
df_all.drop(['SalePrice'], axis=1, inplace=True)

In [11]:
def display_missing(df):
    for col in df.columns:
        print(col, df[col].isnull().sum())
    print('\n')
    
for df in dfs:
    print(format(df.name))
    display_missing(df)
    
    
    
#Check remaining missing values if any 
def display_only_missing(df):
    all_data_na = (df.isnull().sum() / len(df)) * 100
    all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
    print(missing_data.head())

Training Set
MSSubClass 0
MSZoning 0
LotFrontage 259
LotArea 0
Street 0
Alley 1369
LotShape 0
LandContour 0
Utilities 0
LotConfig 0
LandSlope 0
Neighborhood 0
Condition1 0
Condition2 0
BldgType 0
HouseStyle 0
OverallQual 0
OverallCond 0
YearBuilt 0
YearRemodAdd 0
RoofStyle 0
RoofMatl 0
Exterior1st 0
Exterior2nd 0
MasVnrType 8
MasVnrArea 8
ExterQual 0
ExterCond 0
Foundation 0
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinSF1 0
BsmtFinType2 38
BsmtFinSF2 0
BsmtUnfSF 0
TotalBsmtSF 0
Heating 0
HeatingQC 0
CentralAir 0
Electrical 1
1stFlrSF 0
2ndFlrSF 0
LowQualFinSF 0
GrLivArea 0
BsmtFullBath 0
BsmtHalfBath 0
FullBath 0
HalfBath 0
BedroomAbvGr 0
KitchenAbvGr 0
KitchenQual 0
TotRmsAbvGrd 0
Functional 0
Fireplaces 0
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageCars 0
GarageArea 0
GarageQual 81
GarageCond 81
PavedDrive 0
WoodDeckSF 0
OpenPorchSF 0
EnclosedPorch 0
3SsnPorch 0
ScreenPorch 0
PoolArea 0
PoolQC 1453
Fence 1179
MiscFeature 1406
MiscVal 0
MoSold

In [None]:
"""### Dealing with missing values
to improve in future - may be not median of overall but 
Missing values in 'LotFrontage' feature are filled with the median LotFrontage, but using the median age of the whole data set is not a good choice. Median age of a group is much better because the new values would be more informative. Median age of Pclass groups is the best choice because of its high correlation with Age (0.408106) and Survived (0.338481) features
"""

In [15]:
"""# change NA values in test set - to median
def nan_to_mean(df):
    for col in df.columns:
        if is_numeric_dtype(col):
            df[col].fillna(value=df[col].median(), inplace=True)
            print(col, df[col].median())
            
#nan_to_mean(df_all)
"""

'# change NA values in test set - to median\ndef nan_to_mean(df):\n    for col in df.columns:\n        if is_numeric_dtype(col):\n            df[col].fillna(value=df[col].median(), inplace=True)\n            print(col, df[col].median())\n            \n#nan_to_mean(df_all)\n'

In [12]:
# fill value with None - based on data description -  - for non-Numerical (object) Columns
for col in ('PoolQC','MiscFeature','Alley','Fence','FireplaceQu','GarageType', 'GarageFinish', 'GarageQual', 'GarageCond','BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType','MSSubClass'):
    df_all[col] = df_all[col].fillna('None')
    
#fill value with '0' - based on data description - for Numerical Columns
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars','BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath','MasVnrArea'):
    df_all[col] = df_all[col].fillna(0)
    

In [13]:
# Utilities : For this categorical feature all records are "AllPub", except for one "NoSeWa" and 2 NA . Since the house with 'NoSewa' is in the training set, \
# this feature won't help in predictive modelling. We can then safely remove it.
df_all = df_all.drop(['Utilities'], axis=1)


In [14]:
display_only_missing(df_all)

             Missing Ratio
SalePrice        49.982871
LotFrontage      16.649538
MSZoning          0.137033
Functional        0.068517
SaleType          0.034258


In [16]:
"""
# !for the begining I use just median of whole Dataset!

### -> in future try to use grouped median by neighborhood
LotFrontage : Since the area of each street connected to the house property most likely have a similar area to other houses in its neighborhood , we can fill in missing values by the median LotFrontage of the neighborhood.
#Group by neighborhood and fill in missing value by the median LotFrontage of all the neighborhood
df_all["LotFrontage"] = df_all.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))
"""

df_all['LotFrontage'].fillna(value=df_all['LotFrontage'].median(), inplace=True)


In [None]:
df_all['LotFrontage']

In [17]:
display_only_missing(df_all)

             Missing Ratio
SalePrice        49.982871
MSZoning          0.137033
Functional        0.068517
SaleType          0.034258
KitchenQual       0.034258


In [18]:
# find most frequent value for MSZoning
df_all.MSZoning.mode()

0    RL
dtype: object

In [19]:
# Fill missing value in corresponding columns with most frequent value in column
for col in ('MSZoning','Electrical','KitchenQual','Exterior1st','Exterior2nd','SaleType'):
    df_all[col]=df_all[col].mode()[0]

In [20]:
display_only_missing(df_all)

            Missing Ratio
SalePrice       49.982871
Functional       0.068517


In [22]:
df_all.Functional.mode()

0    Typ
dtype: object

In [24]:
# Functional : data description says NA means typical
# BTW we can just use df_all.Functional.mode() = use most frequent value (as 'Typ' is most frequent value)
df_all["Functional"] = df_all["Functional"].fillna("Typ")

In [25]:
display_only_missing(df_all)

           Missing Ratio
SalePrice      49.982871


In [26]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 79 columns):
1stFlrSF         2919 non-null int64
2ndFlrSF         2919 non-null int64
3SsnPorch        2919 non-null int64
Alley            2919 non-null object
BedroomAbvGr     2919 non-null int64
BldgType         2919 non-null object
BsmtCond         2919 non-null object
BsmtExposure     2919 non-null object
BsmtFinSF1       2919 non-null float64
BsmtFinSF2       2919 non-null float64
BsmtFinType1     2919 non-null object
BsmtFinType2     2919 non-null object
BsmtFullBath     2919 non-null float64
BsmtHalfBath     2919 non-null float64
BsmtQual         2919 non-null object
BsmtUnfSF        2919 non-null float64
CentralAir       2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
Electrical       2919 non-null object
EnclosedPorch    2919 non-null int64
ExterCond        2919 non-null object
ExterQual        2919 non-null object
Exterior1st      29

### Seems no missed values, except SalePrice from Test DataSet
Missing Values = DONE

# Dealing with categorical values

In [36]:
def show_object_columns(df):
    for col in df:
        if is_string_dtype(df[col]):
            print(col)
show_object_columns(df_all)

Alley
BldgType
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
BsmtQual
CentralAir
Condition1
Condition2
Electrical
ExterCond
ExterQual
Exterior1st
Exterior2nd
Fence
FireplaceQu
Foundation
Functional
GarageCond
GarageFinish
GarageQual
GarageType
Heating
HeatingQC
HouseStyle
KitchenQual
LandContour
LandSlope
LotConfig
LotShape
MSSubClass
MSZoning
MasVnrType
MiscFeature
MoSold
Neighborhood
OverallCond
PavedDrive
PoolQC
RoofMatl
RoofStyle
SaleCondition
SaleType
Street
YrSold


In [34]:
#Transforming some numerical variables that are really categorical

#MSSubClass=The building class
df_all['MSSubClass'] = df_all['MSSubClass'].astype(str)


#Changing OverallCond into a categorical variable
df_all['OverallCond'] = df_all['OverallCond'].astype(str)


#Year and month sold are transformed into categorical features.
df_all['YrSold'] = df_all['YrSold'].astype(str)
df_all['MoSold'] = df_all['MoSold'].astype(str)

In [35]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 79 columns):
1stFlrSF         2919 non-null int64
2ndFlrSF         2919 non-null int64
3SsnPorch        2919 non-null int64
Alley            2919 non-null object
BedroomAbvGr     2919 non-null int64
BldgType         2919 non-null object
BsmtCond         2919 non-null object
BsmtExposure     2919 non-null object
BsmtFinSF1       2919 non-null float64
BsmtFinSF2       2919 non-null float64
BsmtFinType1     2919 non-null object
BsmtFinType2     2919 non-null object
BsmtFullBath     2919 non-null float64
BsmtHalfBath     2919 non-null float64
BsmtQual         2919 non-null object
BsmtUnfSF        2919 non-null float64
CentralAir       2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
Electrical       2919 non-null object
EnclosedPorch    2919 non-null int64
ExterCond        2919 non-null object
ExterQual        2919 non-null object
Exterior1st      29

In [None]:
def train_cats(df):
    """Change any columns of strings in a panda's dataframe to a column of
    categorical values. This applies the changes inplace.
    Parameters:
    -----------
    df: A pandas dataframe. Any columns of strings will be changed to
        categorical values.
    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a
    note the type of col2 is string
    >>> train_cats(df)
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a
    now the type of col2 is category
    """
    for n,c in df.items():
        if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()

In [None]:
#train_cats(df_all)

In [None]:
df_all.info()

In [80]:
# convert object columns to categorical
def conv_obj_to_categories(df):
    """
    Convert Object columns to Categorical
    """
    for col in df:
        if is_string_dtype(df[col]):
            df[col]=df[col].astype('category')


In [81]:
conv_obj_to_categories(df_all)

In [82]:
def show_categorical_columns(df):
    """
    Print only categorical columns Number, Name and Codes of unique values in corresponding column 
    """
    for col in df:
        if is_categorical_dtype(df[col]):
            print(sum(np.unique(df[col].cat.categories,return_counts=True)[1]), col ,df[col].cat.categories)

In [83]:
show_categorical_columns(df_all)

3 Alley Index(['Grvl', 'None', 'Pave'], dtype='object')
5 BldgType Index(['1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsE'], dtype='object')
5 BsmtCond Index(['Fa', 'Gd', 'None', 'Po', 'TA'], dtype='object')
5 BsmtExposure Index(['Av', 'Gd', 'Mn', 'No', 'None'], dtype='object')
7 BsmtFinType1 Index(['ALQ', 'BLQ', 'GLQ', 'LwQ', 'None', 'Rec', 'Unf'], dtype='object')
7 BsmtFinType2 Index(['ALQ', 'BLQ', 'GLQ', 'LwQ', 'None', 'Rec', 'Unf'], dtype='object')
5 BsmtQual Index(['Ex', 'Fa', 'Gd', 'None', 'TA'], dtype='object')
2 CentralAir Index(['N', 'Y'], dtype='object')
9 Condition1 Index(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNe',
       'RRNn'],
      dtype='object')
8 Condition2 Index(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNn'], dtype='object')
1 Electrical Index(['SBrkr'], dtype='object')
5 ExterCond Index(['Ex', 'Fa', 'Gd', 'Po', 'TA'], dtype='object')
4 ExterQual Index(['Ex', 'Fa', 'Gd', 'TA'], dtype='object')
1 Exterior1st Index(['VinylSd']

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
#for col in df_all.columns:
#    if is_categorical_dtype(col):
#        print(col)

In [None]:
"""cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(df_all[c].values)) 
    df_all[c] = lbl.transform(list(df_all[c].values))

# shape        
print('Shape all_data: {}'.format(df_all.shape))
"""

In [None]:
df_all.info()

In [84]:
def unique_categories(df):
    """
    Print only categorical columns Names and Number of unique values in corresponding column 
    """
    for col in df:
        if is_categorical_dtype(df[col]):
            print(col, sum(np.unique(df[col].cat.categories,return_counts=True)[1]))
unique_categories(df_all)

Alley 3
BldgType 5
BsmtCond 5
BsmtExposure 5
BsmtFinType1 7
BsmtFinType2 7
BsmtQual 5
CentralAir 2
Condition1 9
Condition2 8
Electrical 1
ExterCond 5
ExterQual 4
Exterior1st 1
Exterior2nd 1
Fence 5
FireplaceQu 6
Foundation 6
Functional 7
GarageCond 6
GarageFinish 4
GarageQual 6
GarageType 7
Heating 6
HeatingQC 5
HouseStyle 8
KitchenQual 1
LandContour 4
LandSlope 3
LotConfig 5
LotShape 4
MSSubClass 16
MSZoning 1
MasVnrType 4
MiscFeature 5
MoSold 12
Neighborhood 25
OverallCond 9
PavedDrive 3
PoolQC 4
RoofMatl 8
RoofStyle 6
SaleCondition 6
SaleType 1
Street 2
YrSold 5


### Dummies

In [85]:
df_all=pd.get_dummies(df_all)

# Normalization

In [None]:
#Normalization, the Sigmoid, Log, Cube Root and the Hyperbolic Tangent. 
#It all depends on what one is trying to accomplish.

In [None]:
df_all.info()

# Machine Learning

In [89]:
"""Dividing working DataFrame back to Train and Test"""
# split Validational/Test set from Training set after Categorical Value Engeneering
X_valid_testset=df_all.iloc[ntrain:] # Test set
X_train=df_all.iloc[:ntrain] # Train set

In [92]:
df.shape, y_train.shape, X_valid_testset.shape, X_train.shape

((1459, 79), (1460,), (1459, 291), (1460, 291))

# Evaluation

In [93]:
from sklearn.model_selection import train_test_split

In [94]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train)

In [95]:
X_train.shape, X_valid.shape

((1095, 291), (365, 291))

In [96]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [113]:
# Evaluation of simple Random Forest
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 268 ms, sys: 4.4 ms, total: 273 ms
Wall time: 214 ms
[0.012097015715977669, 0.012182697874411728, 0.9990805832488423, 0.9990661634406787]


In [98]:
from sklearn.metrics import mean_squared_error

In [114]:

math.sqrt(mean_squared_error(y_valid, m.predict(X_valid)))

0.012182697874411572

In [100]:
# if you need to evaluate LOG Root mean squared error but wouldn't like to convert y to log(y)

In [101]:
from sklearn.metrics import mean_squared_log_error

In [102]:
math.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(m.predict(X_valid))))

0.010642917135558443

In [103]:
y_pred = m.predict(X_valid)

#mean_squared_error(y_valid, y_pred)
#def rmse1(y, y_pred):
#    return np.sqrt(np.mean(np.square(y - y_pred)))
#rmse1 (y_valid, y_pred)
print_score(m)

[0.010537460345837978, 0.010642917135558327, 0.9993023656937409, 0.9992873024170605]


### Experimenting with Random Forest

In [115]:
m = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=True, )
m.fit(X_train, y_train)
print_score(m)

[0.013316339347677002, 0.03140497004908063, 0.9988858962647397, 0.9937944441098655, 0.9918014177704907]


In [105]:
from sklearn.preprocessing import RobustScaler

def print_score_robustscaler(m):
    res = [rmse(m.predict(Xtr_r), y_train), rmse(m.predict(Xte_r), y_valid),
                m.score(Xtr_r, y_train), m.score(Xte_r, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)
    
robust_scaler = RobustScaler()
#Xtr_r = robust_scaler.fit_transform(X_train)
Xtr_r = robust_scaler.fit_transform(X_train)
Xte_r = robust_scaler.transform(X_valid)



In [106]:
m = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=True, )
m.fit(Xtr_r, y_train)
print_score_robustscaler(m)

[0.0145743508260406, 0.032239174237771756, 0.9986654515010421, 0.9934603915976654, 0.9907117485975234]


In [None]:
# Evaluation of XGboost
from xgboost import XGBRegressor
import re

#from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
#import xgboost as xgb

regex = re.compile(r"\[|\]|<", re.IGNORECASE)
X_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train.columns.values]
X_valid.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_valid.columns.values]



m = XGBRegressor()
#%time m.fit(X_train, y_train)
#print_score(m)
m.fit(Xtr_r, y_train)
print_score_robustscaler(m)

In [None]:
m = Lasso(alpha =0.0005, random_state=1)

In [None]:
%time m.fit(X_train, y_train)
print_score(m)

In [None]:
#test

In [None]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
#import lightgbm as lgb

In [None]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [None]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))


In [None]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

In [None]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)


In [None]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

In [None]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

In [None]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


In [None]:
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

# Stacking models
Simplest Stacking approach : Averaging base models

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)  

#averaged_models = AveragingModels(models = (ENet, GBoost, KRR, lasso))
averaged_models = AveragingModels(models = (ENet, GBoost, lasso))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

## Less simple Stacking : Adding a Meta-model¶


In [None]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [None]:
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, KRR),
                                                 meta_model = lasso)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

In [None]:
#def rmsle(y, y_pred):
#    return np.sqrt(mean_squared_error(y, y_pred))

In [None]:
#stacked_averaged_models.fit(train.values, y_train)
#stacked_train_pred = stacked_averaged_models.predict(train.values)
#stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))
#print(rmsle(y_train, stacked_train_pred))

# Predictions for submission

In [None]:
# Remove all ,[] symbols from dataframe columns and values
X_valid_testset.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_valid_testset.columns.values]

In [None]:
#y_pred = m.predict(X_valid_testset)

In [None]:
#GBoost.fit(X_train, y_train)
#y_pred = np.expm1(GBoost.predict(X_valid_testset))

In [116]:
#averaged_models.fit(X_train, y_train)
#y_pred = np.expm1(averaged_models.predict(X_valid_testset))

NameError: name 'averaged_models' is not defined

In [128]:
y_pred = np.expm1(m.predict(X_valid_testset))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

# Submission

In [None]:
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = y_pred
sub.to_csv('submission_03Aug19.csv',index=False)

In [None]:
sub.head()