In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

from scipy import special
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score        
from sklearn.metrics import mean_squared_log_error
import optuna.integration.lightgbm as lgb
from joblib import dump, load
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import KFold

In [None]:
# Read the competition's training and test tables into DataFrames.
train, test = (
    pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv'),
    pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv'),
)

In [None]:
# Box-Cox transform the target in place; the fitted lambda is kept in
# SalePrice_lamb. NOTE: this overwrites train['SalePrice'], so re-running the
# cell without reloading `train` transforms the already-transformed values.
transformed_price, SalePrice_lamb = stats.boxcox(train['SalePrice'])
train['SalePrice'] = transformed_price

# Standardise the transformed target and keep the fitted scaler for later use.
labels_scaler = preprocessing.StandardScaler()
labels_scaler.fit(train[['SalePrice']])
labels = pd.Series(data=labels_scaler.transform(train[['SalePrice']]).reshape(-1))

# Distribution plot followed by a normal Q-Q plot of the scaled target.
sns.distplot(labels)
fig = plt.figure()
ax = fig.add_subplot(212)
stats.probplot(labels, dist=stats.norm, plot=ax)

In [None]:
# Check that the feature columns are consistent between train and test:
# columns present in both frames, and columns that exist only in train.
train_test_intersection = np.intersect1d(train.columns, test.columns)
train_test_difference = np.setdiff1d(train.columns, test.columns)

print('Similar Features')
print(train_test_intersection)

print('Different Features')
print(train_test_difference)

In [None]:
# Start with two independent, empty frames; the cells below append the
# engineered feature columns to them one source column at a time.
train_features, test_features = pd.DataFrame(), pd.DataFrame()

In [None]:
# MSSubClass: Identifies the type of dwelling involved in the sale.	

#         20	1-STORY 1946 & NEWER ALL STYLES
#         30	1-STORY 1945 & OLDER
#         40	1-STORY W/FINISHED ATTIC ALL AGES
#         45	1-1/2 STORY - UNFINISHED ALL AGES
#         50	1-1/2 STORY FINISHED ALL AGES
#         60	2-STORY 1946 & NEWER
#         70	2-STORY 1945 & OLDER
#         75	2-1/2 STORY ALL AGES
#         80	SPLIT OR MULTI-LEVEL
#         85	SPLIT FOYER
#         90	DUPLEX - ALL STYLES AND AGES
#        120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
#        150	1-1/2 STORY PUD - ALL AGES
#        160	2-STORY PUD - 1946 & NEWER
#        180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
#        190	2 FAMILY CONVERSION - ALL STYLES AND AGES
column = 'MSSubClass'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# MSZoning: Identifies the general zoning classification of the sale.

#        A	Agriculture
#        C (all)	Commercial
#        FV	Floating Village Residential
#        I	Industrial
#        RH	Residential High Density
#        RL	Residential Low Density
#        RP	Residential Low Density Park 
#        RM	Residential Medium Density
column = 'MSZoning'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# LotArea: Lot size in square feet
column = 'LotArea'

# Fill gaps in both frames with the training mean.
fillvalue = train[column].mean()
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)

# Box-Cox to improve normality (lambda estimated on train), then standardise.
boxcox_train, lamb = stats.boxcox(train[column])
train[column] = boxcox_train
scaler = preprocessing.StandardScaler()
scaler.fit(train[[column]])
train[[column]] = scaler.transform(train[[column]])

# Apply the train-fitted lambda and scaler to the test rows.
test[column] = stats.boxcox(test[column], lamb)
test[[column]] = scaler.transform(test[[column]])

train_features = pd.concat([train_features, train[column]], axis=1)
test_features = pd.concat([test_features, test[column]], axis=1)

# Distribution and normal Q-Q plot of the transformed training values.
sns.distplot(train[column])
fig = plt.figure()
ax = fig.add_subplot(212)
stats.probplot(train[column], dist=stats.norm, plot=ax)

In [None]:
# Street: Type of road access to property

#        Grvl	Gravel	
#        Pave	Paved
column = 'Street'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# Alley: Type of alley access to property

#        Grvl	Gravel
#        Pave	Paved
#        NA 	No alley access
column = 'Alley'

# The data description above lists 'NA' as a real level (no alley access);
# presumably pandas parsed that token as NaN (it is one of read_csv's default
# NA strings) -- confirm against the raw CSV. The previous fillna(mode())
# passed a Series, which aligns on index labels and left almost every NaN
# untouched, and imputing the mode would mislabel these rows anyway.
# Restore the documented level explicitly instead.
fillvalue = 'NA'
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# LotShape: General shape of property

#        Reg	Regular	
#        IR1	Slightly irregular
#        IR2	Moderately Irregular
#        IR3	Irregular
column = 'LotShape'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# LandContour: Flatness of the property

#        Lvl	Near Flat/Level	
#        Bnk	Banked - Quick and significant rise from street grade to building
#        HLS	Hillside - Significant slope from side to side
#        Low	Depression
column = 'LandContour'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# Utilities: Type of utilities available
		
#        AllPub	All public Utilities (E,G,W,& S)	
#        NoSewr	Electricity, Gas, and Water (Septic Tank)
#        NoSeWa	Electricity and Gas Only
#        ELO	Electricity only	
column = 'Utilities'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# LotConfig: Lot configuration

#        Inside	Inside lot
#        Corner	Corner lot
#        CulDSac	Cul-de-sac
#        FR2	Frontage on 2 sides of property
#        FR3	Frontage on 3 sides of property
column = 'LotConfig'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# LandSlope: Slope of property
		
#        Gtl	Gentle slope
#        Mod	Moderate Slope	
#        Sev	Severe Slope
column = 'LandSlope'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# Neighborhood: Physical locations within Ames city limits

#        Blmngtn	Bloomington Heights
#        Blueste	Bluestem
#        BrDale	Briardale
#        BrkSide	Brookside
#        ClearCr	Clear Creek
#        CollgCr	College Creek
#        Crawfor	Crawford
#        Edwards	Edwards
#        Gilbert	Gilbert
#        IDOTRR	Iowa DOT and Rail Road
#        MeadowV	Meadow Village
#        Mitchel	Mitchell
#        Names	North Ames
#        NoRidge	Northridge
#        NPkVill	Northpark Villa
#        NridgHt	Northridge Heights
#        NWAmes	Northwest Ames
#        OldTown	Old Town
#        SWISU	South & West of Iowa State University
#        Sawyer	Sawyer
#        SawyerW	Sawyer West
#        Somerst	Somerset
#        StoneBr	Stone Brook
#        Timber	Timberland
#        Veenker	Veenker
column = 'Neighborhood'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# Condition1: Proximity to various conditions
	
#        Artery	Adjacent to arterial street
#        Feedr	Adjacent to feeder street	
#        Norm	Normal	
#        RRNn	Within 200' of North-South Railroad
#        RRAn	Adjacent to North-South Railroad
#        PosN	Near positive off-site feature--park, greenbelt, etc.
#        PosA	Adjacent to positive off-site feature
#        RRNe	Within 200' of East-West Railroad
#        RRAe	Adjacent to East-West Railroad
column = 'Condition1'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# Condition2: Proximity to various conditions (if more than one is present)
		
#        Artery	Adjacent to arterial street
#        Feedr	Adjacent to feeder street	
#        Norm	Normal	
#        RRNn	Within 200' of North-South Railroad
#        RRAn	Adjacent to North-South Railroad
#        PosN	Near positive off-site feature--park, greenbelt, etc.
#        PosA	Adjacent to positive off-site feature
#        RRNe	Within 200' of East-West Railroad
#        RRAe	Adjacent to East-West Railroad
column = 'Condition2'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# BldgType: Type of dwelling
		
#        1Fam	Single-family Detached	
#        2FmCon	Two-family Conversion; originally built as one-family dwelling
#        Duplx	Duplex
#        TwnhsE	Townhouse End Unit
#        TwnhsI	Townhouse Inside Unit
column = 'BldgType'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# HouseStyle: Style of dwelling
	
#        1Story	One story
#        1.5Fin	One and one-half story: 2nd level finished
#        1.5Unf	One and one-half story: 2nd level unfinished
#        2Story	Two story
#        2.5Fin	Two and one-half story: 2nd level finished
#        2.5Unf	Two and one-half story: 2nd level unfinished
#        SFoyer	Split Foyer
#        SLvl	Split Level
column = 'HouseStyle'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# OverallQual: Rates the overall material and finish of the house

#        10	Very Excellent
#        9	Excellent
#        8	Very Good
#        7	Good
#        6	Above Average
#        5	Average
#        4	Below Average
#        3	Fair
#        2	Poor
#        1	Very Poor
column = 'OverallQual'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# OverallCond: Rates the overall condition of the house

#        10	Very Excellent
#        9	Excellent
#        8	Very Good
#        7	Good
#        6	Above Average	
#        5	Average
#        4	Below Average	
#        3	Fair
#        2	Poor
#        1	Very Poor
column = 'OverallCond'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# RoofStyle: Type of roof

#        Flat	Flat
#        Gable	Gable
#        Gambrel	Gambrel (Barn)
#        Hip	Hip
#        Mansard	Mansard
#        Shed	Shed
column = 'RoofStyle'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# RoofMatl: Roof material

#        ClyTile	Clay or Tile
#        CompShg	Standard (Composite) Shingle
#        Membran	Membrane
#        Metal	Metal
#        Roll	Roll
#        Tar&Grv	Gravel & Tar
#        WdShake	Wood Shakes
#        WdShngl	Wood Shingles
column = 'RoofMatl'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# Exterior1st: Exterior covering on house

#        AsbShng	Asbestos Shingles
#        AsphShn	Asphalt Shingles
#        BrkComm	Brick Common
#        BrkFace	Brick Face
#        CBlock	Cinder Block
#        CemntBd	Cement Board
#        HdBoard	Hard Board
#        ImStucc	Imitation Stucco
#        MetalSd	Metal Siding
#        Other	Other
#        Plywood	Plywood
#        PreCast	PreCast	
#        Stone	Stone
#        Stucco	Stucco
#        VinylSd	Vinyl Siding
#        Wd Sdng	Wood Siding
#        WdShing	Wood Shingles
column = 'Exterior1st'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# Exterior2nd: Exterior covering on house (if more than one material)

#        AsbShng	Asbestos Shingles
#        AsphShn	Asphalt Shingles
#        BrkComm	Brick Common
#        BrkFace	Brick Face
#        CBlock	Cinder Block
#        CemntBd	Cement Board
#        HdBoard	Hard Board
#        ImStucc	Imitation Stucco
#        MetalSd	Metal Siding
#        Other	Other
#        Plywood	Plywood
#        PreCast	PreCast
#        Stone	Stone
#        Stucco	Stucco
#        VinylSd	Vinyl Siding
#        Wd Sdng	Wood Siding
#        WdShing	Wood Shingles
column = 'Exterior2nd'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# MasVnrType: Masonry veneer type

#        BrkCmn	Brick Common
#        BrkFace	Brick Face
#        CBlock	Cinder Block
#        None	None
#        Stone	Stone
column = 'MasVnrType'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
# NOTE(review): the data description lists a literal 'None' level; newer pandas
# versions may parse that token as NaN on read -- confirm how it was loaded.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# ExterQual: Evaluates the quality of the material on the exterior 
		
#        Ex	Excellent
#        Gd	Good
#        TA	Average/Typical
#        Fa	Fair
#        Po	Poor
column = 'ExterQual'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# ExterCond: Evaluates the present condition of the material on the exterior
		
#        Ex	Excellent
#        Gd	Good
#        TA	Average/Typical
#        Fa	Fair
#        Po	Poor
column = 'ExterCond'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# Foundation: Type of foundation
		
#        BrkTil	Brick & Tile
#        CBlock	Cinder Block
#        PConc	Poured Concrete
#        Slab	Slab
#        Stone	Stone
#        Wood	Wood
column = 'Foundation'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# BsmtQual: Evaluates the height of the basement

#        Ex	Excellent (100+ inches)	
#        Gd	Good (90-99 inches)
#        TA	Typical (80-89 inches)
#        Fa	Fair (70-79 inches)
#        Po	Poor (<70 inches)
#        NA	No Basement
column = 'BsmtQual'

# The data description above lists 'NA' as a real level (no basement);
# presumably pandas parsed that token as NaN (it is one of read_csv's default
# NA strings) -- confirm against the raw CSV. The previous fillna(mode())
# passed a Series, which aligns on index labels and left almost every NaN
# untouched, and imputing the mode would mislabel these rows anyway.
# Restore the documented level explicitly instead.
fillvalue = 'NA'
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# BsmtCond: Evaluates the general condition of the basement

#        Ex	Excellent
#        Gd	Good
#        TA	Typical - slight dampness allowed
#        Fa	Fair - dampness or some cracking or settling
#        Po	Poor - Severe cracking, settling, or wetness
#        NA	No Basement
column = 'BsmtCond'

# The data description above lists 'NA' as a real level (no basement);
# presumably pandas parsed that token as NaN (it is one of read_csv's default
# NA strings) -- confirm against the raw CSV. The previous fillna(mode())
# passed a Series, which aligns on index labels and left almost every NaN
# untouched, and imputing the mode would mislabel these rows anyway.
# Restore the documented level explicitly instead.
fillvalue = 'NA'
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# BsmtExposure: Refers to walkout or garden level walls

#        Gd	Good Exposure
#        Av	Average Exposure (split levels or foyers typically score average or above)	
#        Mn	Minimum Exposure
#        No	No Exposure
#        NA	No Basement
column = 'BsmtExposure'

# The data description above lists 'NA' as a real level (no basement);
# presumably pandas parsed that token as NaN (it is one of read_csv's default
# NA strings) -- confirm against the raw CSV. The previous fillna(mode())
# passed a Series, which aligns on index labels and left almost every NaN
# untouched, and imputing the mode would mislabel these rows anyway.
# Restore the documented level explicitly instead.
fillvalue = 'NA'
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# BsmtFinType1: Rating of basement finished area

#        GLQ	Good Living Quarters
#        ALQ	Average Living Quarters
#        BLQ	Below Average Living Quarters	
#        Rec	Average Rec Room
#        LwQ	Low Quality
#        Unf	Unfinished
#        NA	No Basement
column = 'BsmtFinType1'

# The data description above lists 'NA' as a real level (no basement);
# presumably pandas parsed that token as NaN (it is one of read_csv's default
# NA strings) -- confirm against the raw CSV. The previous fillna(mode())
# passed a Series, which aligns on index labels and left almost every NaN
# untouched, and imputing the mode would mislabel these rows anyway.
# Restore the documented level explicitly instead.
fillvalue = 'NA'
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# BsmtFinType2: Rating of basement finished area (if multiple types)

#        GLQ	Good Living Quarters
#        ALQ	Average Living Quarters
#        BLQ	Below Average Living Quarters	
#        Rec	Average Rec Room
#        LwQ	Low Quality
#        Unf	Unfinished
#        NA	No Basement
column = 'BsmtFinType2'

# The data description above lists 'NA' as a real level (no basement);
# presumably pandas parsed that token as NaN (it is one of read_csv's default
# NA strings) -- confirm against the raw CSV. The previous fillna(mode())
# passed a Series, which aligns on index labels and left almost every NaN
# untouched, and imputing the mode would mislabel these rows anyway.
# Restore the documented level explicitly instead.
fillvalue = 'NA'
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# TotalBsmtSF: Total square feet of basement area
column = 'TotalBsmtSF'

# Missing areas are filled with 0.0 in both frames.
fillvalue = 0.0
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)

# Standardise with statistics learned from the training rows only, then apply
# the same fitted scaler to the test rows.
scaler = preprocessing.StandardScaler()
scaler.fit(train[[column]])
train[[column]] = scaler.transform(train[[column]])
test[[column]] = scaler.transform(test[[column]])

train_features = pd.concat([train_features, train[column]], axis=1)
test_features = pd.concat([test_features, test[column]], axis=1)

# Distribution and normal Q-Q plot of the scaled training values.
sns.distplot(train[column])
fig = plt.figure()
ax = fig.add_subplot(212)
stats.probplot(train[column], dist=stats.norm, plot=ax)

In [None]:
# Heating: Type of heating
		
#        Floor	Floor Furnace
#        GasA	Gas forced warm air furnace
#        GasW	Gas hot water or steam heat
#        Grav	Gravity furnace	
#        OthW	Hot water or steam heat other than gas
#        Wall	Wall furnace
column = 'Heating'

# Series.mode() returns a Series; passing it to fillna() aligns on index labels,
# so almost every missing row would stay NaN. Use the scalar most-frequent
# train value so every missing entry in both frames is imputed.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode each frame, keeping only the levels present in both so the
# train and test feature matrices share identical columns. (`columns=` was
# dropped: get_dummies ignores it for Series input.)
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Train-only levels that were left out of the shared feature set.
print('Different Features')
print(drop)

In [None]:
# HeatingQC: Heating quality and condition

#        Ex     Excellent
#        Gd     Good
#        TA     Average/Typical
#        Fa     Fair
#        Po     Poor
column = 'HeatingQC'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# CentralAir: Central air conditioning

#        N      No
#        Y      Yes
column = 'CentralAir'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# Electrical: Electrical system

#        SBrkr  Standard Circuit Breakers & Romex
#        FuseA  Fuse Box over 60 AMP and all Romex wiring (Average)
#        FuseF  60 AMP Fuse Box and mostly Romex wiring (Fair)
#        FuseP  60 AMP Fuse Box and mostly knob & tube wiring (poor)
#        Mix    Mixed
column = 'Electrical'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# 1stFlrSF: First Floor square feet
column = '1stFlrSF'

# Impute missing areas with the training mean in both frames.
fillvalue = train[column].mean()
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)

# Box-Cox to improve normality: lambda is estimated on train and reused on test.
boxcox_train, lamb = stats.boxcox(train[column])
train[column] = boxcox_train
test[column] = stats.boxcox(test[column], lamb)

# Standardize with statistics learned on the transformed training column.
scaler = preprocessing.StandardScaler()
scaler.fit(train[[column]])
train[[column]] = scaler.transform(train[[column]])
test[[column]] = scaler.transform(test[[column]])

train_features = pd.concat([train_features, train[column]], axis=1)
test_features = pd.concat([test_features, test[column]], axis=1)

# Visual check of the transformed training column: distribution plot, then a
# normal probability plot drawn on a separate figure.
sns.distplot(train[column])
fig = plt.figure()
ax = fig.add_subplot(212)
stats.probplot(train[column], dist=stats.norm, plot=ax)

In [None]:
# 2ndFlrSF: Second floor square feet
column = '2ndFlrSF'
fillvalue = 0

# Fill missing areas with 0, then collapse the area into a presence flag:
# every positive value becomes 1, other values are left unchanged.
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)
    frame.loc[frame[column] > 0, column] = 1

train_features = pd.concat([train_features, train[column]], axis=1)
test_features = pd.concat([test_features, test[column]], axis=1)

In [None]:
# GrLivArea: Above grade (ground) living area square feet
column = 'GrLivArea'

# Impute missing areas with the training mean in both frames.
fillvalue = train[column].mean()
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)

# Box-Cox to improve normality: lambda is estimated on train and reused on test.
boxcox_train, lamb = stats.boxcox(train[column])
train[column] = boxcox_train
test[column] = stats.boxcox(test[column], lamb)

# Standardize with statistics learned on the transformed training column.
scaler = preprocessing.StandardScaler()
scaler.fit(train[[column]])
train[[column]] = scaler.transform(train[[column]])
test[[column]] = scaler.transform(test[[column]])

train_features = pd.concat([train_features, train[column]], axis=1)
test_features = pd.concat([test_features, test[column]], axis=1)

# Visual check of the transformed training column: distribution plot, then a
# normal probability plot drawn on a separate figure.
sns.distplot(train[column])
fig = plt.figure()
ax = fig.add_subplot(212)
stats.probplot(train[column], dist=stats.norm, plot=ax)

In [None]:
# BsmtFullBath: Basement full bathrooms
column = 'BsmtFullBath'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
# Cast to int after imputing: a count column that contained NaN is read as
# float, which would produce dummy labels such as "<column>_1.0" that never
# match the "<column>_1" labels of the other frame.
train[column] = train[column].fillna(fillvalue).astype(int)
test[column] = test[column].fillna(fillvalue).astype(int)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# BsmtHalfBath: Basement half bathrooms
column = 'BsmtHalfBath'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
# Cast to int after imputing: a count column that contained NaN is read as
# float, which would produce dummy labels such as "<column>_1.0" that never
# match the "<column>_1" labels of the other frame.
train[column] = train[column].fillna(fillvalue).astype(int)
test[column] = test[column].fillna(fillvalue).astype(int)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# FullBath: Full bathrooms above grade
column = 'FullBath'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
# Cast to int after imputing so both frames produce the same dummy labels even
# if one of them was read as float (which happens when a column holds NaN).
train[column] = train[column].fillna(fillvalue).astype(int)
test[column] = test[column].fillna(fillvalue).astype(int)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# HalfBath: Half baths above grade
column = 'HalfBath'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
# Cast to int after imputing so both frames produce the same dummy labels even
# if one of them was read as float (which happens when a column holds NaN).
train[column] = train[column].fillna(fillvalue).astype(int)
test[column] = test[column].fillna(fillvalue).astype(int)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# Bedroom: Bedrooms above grade (does NOT include basement bedrooms)
column = 'BedroomAbvGr'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
# Cast to int after imputing so both frames produce the same dummy labels even
# if one of them was read as float (which happens when a column holds NaN).
train[column] = train[column].fillna(fillvalue).astype(int)
test[column] = test[column].fillna(fillvalue).astype(int)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# Kitchen: Kitchens above grade
column = 'KitchenAbvGr'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
# Cast to int after imputing so both frames produce the same dummy labels even
# if one of them was read as float (which happens when a column holds NaN).
train[column] = train[column].fillna(fillvalue).astype(int)
test[column] = test[column].fillna(fillvalue).astype(int)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# KitchenQual: Kitchen quality

#        Ex     Excellent
#        Gd     Good
#        TA     Typical/Average
#        Fa     Fair
#        Po     Poor
column = 'KitchenQual'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
column = 'TotRmsAbvGrd'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
# Cast to int after imputing so both frames produce the same dummy labels even
# if one of them was read as float (which happens when a column holds NaN).
train[column] = train[column].fillna(fillvalue).astype(int)
test[column] = test[column].fillna(fillvalue).astype(int)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# Functional: Home functionality (Assume typical unless deductions are warranted)

#        Typ    Typical Functionality
#        Min1   Minor Deductions 1
#        Min2   Minor Deductions 2
#        Mod    Moderate Deductions
#        Maj1   Major Deductions 1
#        Maj2   Major Deductions 2
#        Sev    Severely Damaged
#        Sal    Salvage only
column = 'Functional'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# Fireplaces: Number of fireplaces
column = 'Fireplaces'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
# Cast to int after imputing so both frames produce the same dummy labels even
# if one of them was read as float (which happens when a column holds NaN).
train[column] = train[column].fillna(fillvalue).astype(int)
test[column] = test[column].fillna(fillvalue).astype(int)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# FireplaceQu: Fireplace quality

#        Ex     Excellent - Exceptional Masonry Fireplace
#        Gd     Good - Masonry Fireplace in main level
#        TA     Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement
#        Fa     Fair - Prefabricated Fireplace in basement
#        Po     Poor - Ben Franklin Stove
#        NA     No Fireplace
column = 'FireplaceQu'

# Per the legend above, NA means "No Fireplace"; pandas parses that literal as
# missing by default. Restore it as an explicit 'NA' level (as the GarageQual /
# GarageCond cells do) rather than imputing the mode: the previous
# fillna(mode()) passed a Series, which aligns on the index and filled
# essentially nothing.
fillvalue = 'NA'
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# GarageType: Garage location

#        2Types   More than one type of garage
#        Attchd   Attached to home
#        Basment  Basement Garage
#        BuiltIn  Built-In (Garage part of house - typically has room above garage)
#        CarPort  Car Port
#        Detchd   Detached from home
#        NA       No Garage
column = 'GarageType'

# Per the legend above, NA means "No Garage"; pandas parses that literal as
# missing by default. Restore it as an explicit 'NA' level (as the GarageQual /
# GarageCond cells do) rather than imputing the mode: the previous
# fillna(mode()) passed a Series, which aligns on the index and filled
# essentially nothing.
fillvalue = 'NA'
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# GarageFinish: Interior finish of the garage

#        Fin    Finished
#        RFn    Rough Finished
#        Unf    Unfinished
#        NA     No Garage
column = 'GarageFinish'

# Per the legend above, NA means "No Garage"; pandas parses that literal as
# missing by default. Restore it as an explicit 'NA' level (as the GarageQual /
# GarageCond cells do) rather than imputing the mode: the previous
# fillna(mode()) passed a Series, which aligns on the index and filled
# essentially nothing.
fillvalue = 'NA'
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# GarageCars: Size of garage in car capacity
column = 'GarageCars'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
# Cast to int after imputing: a count column that contained NaN is read as
# float, which would produce dummy labels such as "<column>_1.0" that never
# match the "<column>_1" labels of the other frame.
train[column] = train[column].fillna(fillvalue).astype(int)
test[column] = test[column].fillna(fillvalue).astype(int)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# GarageArea: Size of garage in square feet
column = 'GarageArea'
fillvalue = 0

# Missing garage areas are imputed as 0 in both frames.
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)

# Standardize with statistics learned on train only and reuse them for test.
scaler = preprocessing.StandardScaler()
scaler.fit(train[[column]])
train[[column]] = scaler.transform(train[[column]])
test[[column]] = scaler.transform(test[[column]])

train_features = pd.concat([train_features, train[column]], axis=1)
test_features = pd.concat([test_features, test[column]], axis=1)

# Visual check of the scaled training column: distribution plot, then a
# normal probability plot drawn on a separate figure.
sns.distplot(train[column])
fig = plt.figure()
ax = fig.add_subplot(212)
stats.probplot(train[column], dist=stats.norm, plot=ax)

In [None]:
# GarageQual: Garage quality

#        Ex     Excellent
#        Gd     Good
#        TA     Typical/Average
#        Fa     Fair
#        Po     Poor
#        NA     No Garage
column = 'GarageQual'
fillvalue = 'NA'

# Missing entries become an explicit 'NA' level in both frames.
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)

# One-hot encode each frame and compare the resulting level sets.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
train_levels, test_levels = dummies_train.columns, dummies_test.columns
keep = np.intersect1d(train_levels, test_levels)
drop = np.setdiff1d(train_levels, test_levels)

# Only levels present in both frames are appended to the feature matrices.
train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Report the train-only levels that were left out.
print('Different Features', drop, sep='\n')

In [None]:
# GarageCond: Garage condition

#        Ex     Excellent
#        Gd     Good
#        TA     Typical/Average
#        Fa     Fair
#        Po     Poor
#        NA     No Garage
column = 'GarageCond'
fillvalue = 'NA'

# Missing entries become an explicit 'NA' level in both frames.
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)

# One-hot encode each frame and compare the resulting level sets.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
train_levels, test_levels = dummies_train.columns, dummies_test.columns
keep = np.intersect1d(train_levels, test_levels)
drop = np.setdiff1d(train_levels, test_levels)

# Only levels present in both frames are appended to the feature matrices.
train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Report the train-only levels that were left out.
print('Different Features', drop, sep='\n')

In [None]:
# PavedDrive: Paved driveway

#        Y      Paved
#        P      Partial Pavement
#        N      Dirt/Gravel
column = 'PavedDrive'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# WoodDeckSF: Wood deck area in square feet
column = 'WoodDeckSF'
fillvalue = 0

# Fill missing areas with 0, then collapse the area into a presence flag:
# every positive value becomes 1, other values are left unchanged.
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)
    frame.loc[frame[column] > 0, column] = 1

train_features = pd.concat([train_features, train[column]], axis=1)
test_features = pd.concat([test_features, test[column]], axis=1)

In [None]:
# OpenPorchSF: Open porch area in square feet
column = 'OpenPorchSF'
fillvalue = 0

# Fill missing areas with 0, then collapse the area into a presence flag:
# every positive value becomes 1, other values are left unchanged.
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)
    frame.loc[frame[column] > 0, column] = 1

train_features = pd.concat([train_features, train[column]], axis=1)
test_features = pd.concat([test_features, test[column]], axis=1)

In [None]:
# EnclosedPorch: Enclosed porch area in square feet
column = 'EnclosedPorch'
fillvalue = 0

# Fill missing areas with 0, then collapse the area into a presence flag:
# every positive value becomes 1, other values are left unchanged.
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)
    frame.loc[frame[column] > 0, column] = 1

train_features = pd.concat([train_features, train[column]], axis=1)
test_features = pd.concat([test_features, test[column]], axis=1)

In [None]:
# 3SsnPorch: Three season porch area in square feet
column = '3SsnPorch'
fillvalue = 0

# Fill missing areas with 0, then collapse the area into a presence flag:
# every positive value becomes 1, other values are left unchanged.
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)
    frame.loc[frame[column] > 0, column] = 1

train_features = pd.concat([train_features, train[column]], axis=1)
test_features = pd.concat([test_features, test[column]], axis=1)

In [None]:
# ScreenPorch: Screen porch area in square feet
column = 'ScreenPorch'
fillvalue = 0

# Fill missing areas with 0, then collapse the area into a presence flag:
# every positive value becomes 1, other values are left unchanged.
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)
    frame.loc[frame[column] > 0, column] = 1

train_features = pd.concat([train_features, train[column]], axis=1)
test_features = pd.concat([test_features, test[column]], axis=1)

In [None]:
# PoolQC: Pool quality

#        Ex     Excellent
#        Gd     Good
#        TA     Average/Typical
#        Fa     Fair
#        NA     No Pool
column = 'PoolQC'
fillvalue = 'NA'

# Missing entries become an explicit 'NA' level in both frames.
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)

# One-hot encode each frame and compare the resulting level sets.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
train_levels, test_levels = dummies_train.columns, dummies_test.columns
keep = np.intersect1d(train_levels, test_levels)
drop = np.setdiff1d(train_levels, test_levels)

# Only levels present in both frames are appended to the feature matrices.
train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Report the train-only levels that were left out.
print('Different Features', drop, sep='\n')

In [None]:
# Fence: Fence quality

#        GdPrv  Good Privacy
#        MnPrv  Minimum Privacy
#        GdWo   Good Wood
#        MnWw   Minimum Wood/Wire
#        NA     No Fence
column = 'Fence'
fillvalue = 'NA'

# Missing entries become an explicit 'NA' level in both frames.
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)

# One-hot encode each frame and compare the resulting level sets.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
train_levels, test_levels = dummies_train.columns, dummies_test.columns
keep = np.intersect1d(train_levels, test_levels)
drop = np.setdiff1d(train_levels, test_levels)

# Only levels present in both frames are appended to the feature matrices.
train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Report the train-only levels that were left out.
print('Different Features', drop, sep='\n')

In [None]:
# MiscFeature: Miscellaneous feature not covered in other categories

#        Elev   Elevator
#        Gar2   2nd Garage (if not described in garage section)
#        Othr   Other
#        Shed   Shed (over 100 SF)
#        TenC   Tennis Court
#        NA     None
column = 'MiscFeature'
fillvalue = 'NA'

# Missing entries become an explicit 'NA' level in both frames.
for frame in (train, test):
    frame[column] = frame[column].fillna(fillvalue)

# One-hot encode each frame and compare the resulting level sets.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
train_levels, test_levels = dummies_train.columns, dummies_test.columns
keep = np.intersect1d(train_levels, test_levels)
drop = np.setdiff1d(train_levels, test_levels)

# Only levels present in both frames are appended to the feature matrices.
train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Report the train-only levels that were left out.
print('Different Features', drop, sep='\n')

In [None]:
# SaleType: Type of sale

#        WD     Warranty Deed - Conventional
#        CWD    Warranty Deed - Cash
#        VWD    Warranty Deed - VA Loan
#        New    Home just constructed and sold
#        COD    Court Officer Deed/Estate
#        Con    Contract 15% Down payment regular terms
#        ConLw  Contract Low Down payment and low interest
#        ConLI  Contract Low Interest
#        ConLD  Contract Low Down
#        Oth    Other
column = 'SaleType'

# mode() yields a Series; passing it to fillna() would align on the index and
# leave most NaNs untouched, so impute with the scalar first mode instead.
fillvalue = train[column].mode().iloc[0]
train[column] = train[column].fillna(fillvalue)
test[column] = test[column].fillna(fillvalue)

# One-hot encode and keep only the levels common to train and test.
dummies_train = pd.get_dummies(train[column], prefix=column)
dummies_test = pd.get_dummies(test[column], prefix=column)
keep = np.intersect1d(dummies_train.columns, dummies_test.columns)
drop = np.setdiff1d(dummies_train.columns, dummies_test.columns)

train_features = pd.concat([train_features, dummies_train[keep]], axis=1)
test_features = pd.concat([test_features, dummies_test[keep]], axis=1)

# Levels seen in train but not in test (excluded from the features).
print('Different Features')
print(drop)

In [None]:
# Remove every space from the feature names of both matrices.
for features in (train_features, test_features):
    features.columns = features.columns.str.replace(' ', '')

In [None]:
# K-fold cross-validated LightGBM training. Each fold's model is saved to
# disk and scored on its hold-out split; the test predictions (still in the
# standardized Box-Cox label space) are averaged over the folds.
n_fold = 5
cv = KFold(n_splits=n_fold, shuffle=True, random_state=42)

prediction = np.zeros(len(test_features))
msle, r2 = [], []  # per-fold scores, filled inside the loop

# NOTE(review): `lgb` is optuna.integration.lightgbm, so lgb.train() is
# Optuna's tuning wrapper rather than plain lightgbm.train().
# NOTE(review): "early_stopping_round" is combined with the "dart" booster;
# LightGBM's Python early-stopping callback warns that early stopping is not
# available in dart mode -- confirm the intended behaviour.
params = {"objective": "regression",
          "metric": "l2",
          "verbosity": -1,
          "boosting_type": "dart",
          "early_stopping_round": 10,
          "num_iterations": 500,
          "n_jobs": -1}


def _to_sale_price(scaled_values):
    """Map standardized Box-Cox values back to the original SalePrice scale.

    StandardScaler.inverse_transform validates its input as a 2-D array in
    recent scikit-learn releases, so the 1-D input is reshaped to a single
    column and flattened again afterwards.
    """
    column_vector = np.asarray(scaled_values, dtype=float).reshape(-1, 1)
    unscaled = labels_scaler.inverse_transform(column_vector).ravel()
    return special.inv_boxcox(unscaled, SalePrice_lamb)


for fold_n, (train_index, valid_index) in enumerate(cv.split(train_features)):

    X_train = train_features.iloc[train_index, :]
    X_valid = train_features.iloc[valid_index, :]

    Y_train = labels.iloc[train_index]
    Y_valid = labels.iloc[valid_index]

    dtrain = lgb.Dataset(X_train, label=Y_train)
    dval = lgb.Dataset(X_valid, label=Y_valid)

    model = lgb.train(params, dtrain, valid_sets=[dval], verbose_eval=-1)
    file = 'fold' + str(fold_n) + '.pkl'
    dump(model, file)

    # Score the fold in SalePrice units rather than in the transformed space.
    y_pred = _to_sale_price(model.predict(X_valid))
    Y_valid = _to_sale_price(Y_valid)

    fold_msle = mean_squared_log_error(Y_valid, y_pred)
    fold_r2 = r2_score(Y_valid, y_pred)
    msle.append(fold_msle)
    r2.append(fold_r2)
    print('MSLE: ', fold_msle)
    print('R2: ', fold_r2)

    # Accumulate test predictions; they are divided by n_fold after the loop.
    prediction += model.predict(test_features)

prediction /= n_fold

print('Mean MSLE: ', np.mean(msle))
print('Mean R2: ', np.mean(r2))

In [None]:
# Convert the averaged predictions from the standardized Box-Cox label space
# back to SalePrice and write the submission file.
# StandardScaler.inverse_transform validates its input as a 2-D array in recent
# scikit-learn releases, so reshape the 1-D vector to one column and flatten
# the result.
# NOTE: this overwrites `prediction`; re-running this cell without re-running
# the training cell would apply the inverse transforms twice.
prediction = labels_scaler.inverse_transform(
    np.asarray(prediction, dtype=float).reshape(-1, 1)
).ravel()
prediction = special.inv_boxcox(prediction, SalePrice_lamb)
submission = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": prediction
    })

submission.to_csv('submission.csv', index=False)