In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# machine learning
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load .csv files
data = pd.read_csv('train.csv')

In [3]:
data.keys()

Index([u'Id', u'MSSubClass', u'MSZoning', u'LotFrontage', u'LotArea',
       u'Street', u'Alley', u'LotShape', u'LandContour', u'Utilities',
       u'LotConfig', u'LandSlope', u'Neighborhood', u'Condition1',
       u'Condition2', u'BldgType', u'HouseStyle', u'OverallQual',
       u'OverallCond', u'YearBuilt', u'YearRemodAdd', u'RoofStyle',
       u'RoofMatl', u'Exterior1st', u'Exterior2nd', u'MasVnrType',
       u'MasVnrArea', u'ExterQual', u'ExterCond', u'Foundation', u'BsmtQual',
       u'BsmtCond', u'BsmtExposure', u'BsmtFinType1', u'BsmtFinSF1',
       u'BsmtFinType2', u'BsmtFinSF2', u'BsmtUnfSF', u'TotalBsmtSF',
       u'Heating', u'HeatingQC', u'CentralAir', u'Electrical', u'1stFlrSF',
       u'2ndFlrSF', u'LowQualFinSF', u'GrLivArea', u'BsmtFullBath',
       u'BsmtHalfBath', u'FullBath', u'HalfBath', u'BedroomAbvGr',
       u'KitchenAbvGr', u'KitchenQual', u'TotRmsAbvGrd', u'Functional',
       u'Fireplaces', u'FireplaceQu', u'GarageType', u'GarageYrBlt',
       u'GarageFinish',

In [3]:
#missing data
total = data.isnull().sum().sort_values(ascending=False)
percent = 100*(data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
FireplaceQu,690,47.260274
LotFrontage,259,17.739726
GarageCond,81,5.547945
GarageType,81,5.547945
GarageYrBlt,81,5.547945
GarageFinish,81,5.547945


In [4]:
# There are a few houses with more than 4000 sq ft living area that are
# outliers, so we drop them from the dataset
data=data[data["GrLivArea"] < 4000]

In [5]:
data_new=data.drop(["Id","PoolQC","MiscVal","MiscFeature","Fence","FireplaceQu","LotFrontage",
                 "Alley","GarageYrBlt"], axis=1)

In [6]:
#create simple imputer and use it to fill nan values 
from sklearn.preprocessing import Imputer
median_imputer = Imputer(strategy='median')
data_new['MasVnrArea'] = median_imputer.fit_transform(data_new['MasVnrArea'].reshape(-1, 1))
#missing values of numerical columns
total = data_new.isnull().sum().sort_values(ascending=False)


In [7]:
#categorical features mapping
obj_df = data_new.select_dtypes(include=['object']).copy()
for i in obj_df:
    obj_df[i] = obj_df[i].astype('category')

for i in obj_df:
    obj_df[i] = obj_df[i].cat.codes
#export numerical features
numerical_features = data_new.select_dtypes(include=["float","int","bool"]).copy()
#concat to new dataframe
dataset=pd.concat([numerical_features,obj_df], axis=1)


In [56]:
# Splitting up a training and test (validation) set
X = dataset.drop("SalePrice", axis=1)
y= dataset["SalePrice"]
frac_test = 0.25
#frac_test_2 = 0.5
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = frac_test, random_state=5)
#x_train2, x_test, y_train2, y_test = train_test_split(x_2, y_2, test_size = frac_test_2, random_state=23)

print('Full data size:')
print(dataset.shape, data['SalePrice'].shape)
print('\nTraining data size:')
print(X_train.shape, y_train.shape)
print('\nTest data size:')
print(X_test.shape, y_test.shape)

Full data size:
((1456, 72), (1456,))

Training data size:
((1092, 71), (1092,))

Test data size:
((364, 71), (364,))


In [57]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()  # define scaler
scaler.fit(X_train)  # fit scaler ONLY on the training data

# print('mean: {}\nstd:  {}'.format(scaler.mean_ , scaler.scale_))

# transform on both sets:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# # #numpy to pd
# X_train_scaled=pd.DataFrame(X_train_scaled)
# X_train_scaled.columns=X.columns
# # X_train_scaled.shape
# X_test_scaled=pd.DataFrame(X_test_scaled)
# X_test_scaled.columns=X.columns
# # X_test_scaled.shape
# y_train=pd.DataFrame(y_train)
# y_train.columns=y_train.columns
# y_test=pd.DataFrame(y_test)
# y_test.columns=y_test.columns
# # #export to csv normalized dataset
# X_train_scaled.to_csv('X_train_scaled.csv', sep=';',index=False)
# X_test_scaled.to_csv('X_test_scaled.csv', sep=';',index=False)
# y_train.to_csv('y_train.csv', sep=';',index=False)
# y_test.to_csv('y_test.csv', sep=';',index=False)

In [58]:
#calculate correlation of features with Sales price
corr_concat = pd.concat([X_train,y_train], axis=1)
corrmat = corr_concat.corr()
corr_list = corrmat['SalePrice'].sort_values(axis=0,ascending=False).iloc[1:]
# features with correlation >0.45 and <(-0.45)
feat=corr_list[((corr_list.values >0.45)|(corr_list.values < (-0.4))) ].index.tolist()
#remove columns of the remaining ones with low correlation among them
remove_list = ['1stFlrSF','GarageArea','TotRmsAbvGrd','YearRemodAdd','OverallQual']
feat=[ x for x in feat if x not in remove_list ]


In [59]:
# Splitting up again a training and test (validation) set
X = dataset[feat]
y= dataset["SalePrice"]
frac_test = 0.25
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = frac_test, random_state=5)

print('Full data size:')
print(dataset[feat].shape, data['SalePrice'].shape)
print('\nTraining data size:')
print(X_train.shape, y_train.shape)
print('\nTest data size:')
print(X_test.shape, y_test.shape)

Full data size:
((1456, 10), (1456,))

Training data size:
((1092, 10), (1092,))

Test data size:
((364, 10), (364,))


In [15]:
import math

#A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_pred, y_test) :
   assert len(y_test) == len(y_pred)
   return np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_test))**2))

In [63]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from math import sqrt
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor

In [64]:
# Training different models.

#LR
reg =linear_model.LinearRegression()
reg.fit(X_train, y_train)
pred_LR = reg.predict(X_test)
print("LR:", rmsle(pred_LR,y_test))

# # Neural Net
nn = MLPRegressor(hidden_layer_sizes=(10,),  activation='relu', solver='adam',    alpha=0.001,batch_size='auto',
               learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
               random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9,
               nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
               epsilon=1e-08)
nn.fit(X_train, y_train)
pred_nn = nn.predict(X_test)
print("NN:", rmsle(pred_nn,y_test))

# KNN
knn = KNeighborsRegressor(n_neighbors=2)
knn.fit(X_train,y_train)
predictions_KNN = knn.predict(X_test)
print("KNN:", rmsle(predictions_KNN,y_test))


('LR:', 0.17088235037245464)
('NN:', 0.22265437368701754)
('KNN:', 0.2138175503380746)


In [65]:
# Random Forest
rf = RandomForestRegressor(n_jobs=1, random_state=0,n_estimators=500, max_features=0.01, max_depth=11)
rf.fit(X_train,y_train)
predictions_RF = rf.predict(X_test)
print("Random Forest:", rmsle(y_test,predictions_RF))


('Random Forest:', 0.1565453755921989)


In [66]:
# Decision Tree
decision_tree = DecisionTreeRegressor(random_state=21)
decision_tree.fit(X_train, y_train)
predictions_DT = decision_tree.predict(X_test)
print("Decision Tree:", rmsle(predictions_DT,y_test))


('Decision Tree:', 0.2239369096407867)


In [67]:
# SVM
clf = SVR(C=1.0, epsilon=0.2)
clf.fit(X_train, y_train)
pred_SVM = clf.predict(X_test)
print("SVR:", rmsle(y_test,pred_SVM))

('SVR:', 0.36929026623622496)


In [21]:
import datetime
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
import time
from sklearn import preprocessing
#from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import Ridge, LassoCV,LassoLarsCV, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from scipy.stats import skew


In [68]:
def model_extra_trees_regression(Xtrain,Xtest,ytrain,y_test):
    
    X_train = Xtrain
    y_train = ytrain
    
    etr = ExtraTreesRegressor(n_jobs=1, random_state=77)
    param_grid = {'max_features': [0.01,1,5]}
    model = GridSearchCV(estimator=etr, param_grid=param_grid, n_jobs=1, cv=10)
    model.fit(X_train, y_train)
    print('Extra trees regression...')
    print('Best Params:')
    print(model.best_params_)
    #print('Best CV Score:')
    #print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return  rmsle(y_pred,y_test)


In [69]:
model_extra_trees_regression(X_train,X_test,y_train,y_test)

Extra trees regression...
Best Params:
{'max_features': 0.01}


0.17131055833716277

In [70]:
#Tuning tree-specific parameters
def model_extra_trees_regression(Xtrain,Xtest,ytrain,y_test):
    
    X_train = Xtrain
    y_train = ytrain
    
    etr = ExtraTreesRegressor(n_jobs=1, random_state=77,max_features=0.01)
    param_grid = {'n_estimators': range(20,350,30)}
    model = GridSearchCV(estimator=etr, param_grid=param_grid, n_jobs=1, cv=10)
    model.fit(X_train, y_train)
    print('Extra trees regression...')
    print('Best Params:')
    print(model.best_params_)
    #print('Best CV Score:')
    #print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return  rmsle(y_pred,y_test)

In [71]:
model_extra_trees_regression(X_train,X_test,y_train,y_test)

Extra trees regression...
Best Params:
{'n_estimators': 230}


0.16048093178048187

In [72]:
#Third Tuning parameters
def model_extra_trees_regression(Xtrain,Xtest,ytrain,y_test):
    
    X_train = Xtrain
    y_train = ytrain
    
    etr = ExtraTreesRegressor(n_jobs=1, random_state=77,max_features=5,n_estimators=230)
    param_grid = {'min_samples_leaf':range(1,20,5), 'max_features':range(5,10,2)}
    model = GridSearchCV(estimator=etr, param_grid=param_grid, n_jobs=1, cv=10)
    model.fit(X_train, y_train)
    print('Extra trees regression...')
    print('Best Params:')
    print(model.best_params_)
    #print('Best CV Score:')
    #print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return  rmsle(y_pred,y_test)

In [73]:
model_extra_trees_regression(X_train,X_test,y_train,y_test)

Extra trees regression...
Best Params:
{'max_features': 5, 'min_samples_leaf': 1}


0.16201853727635424

In [28]:
#Random Forest
min_samples_split = 0.01  #This should be ~0.5-1% of total values. 
min_samples_leaf = 50 #Can be selected based on intuition. This is just used for preventing overfitting 
max_depth = 3
max_features = 'sqrt' #Its a general thumb-rule to start with square root.
def random_forest(Xtrain,Xtest,ytrain,y_test):
    
    X_train = Xtrain
    y_train = ytrain
    
    param_grid = {'n_estimators': range(20,501,10)}
    model = GridSearchCV(estimator = RandomForestRegressor(min_samples_split=min_samples_split, 
                                                              min_samples_leaf=min_samples_leaf,
                                                              max_depth=max_depth,
                                                              max_features=max_features,
                                                              random_state=5), 
                                                              param_grid = param_grid, n_jobs=4,iid=False,cv=5)
    model.fit(X_train, y_train)
    print('Random Forest...')
    print('Best Params:')
    print(model.best_params_)
#     print('Best CV Score:')
#     print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return  rmsle(y_pred,y_test)

In [29]:
random_forest(X_train,X_test,y_train,y_test)

Random Forest...
Best Params:
{'n_estimators': 270}


0.19561021054762717

In [30]:
#Tuning tree-specific parameters
def random_forest(Xtrain,Xtest,ytrain,y_test):
    
    X_train = Xtrain
    y_train = ytrain
    
    param_grid = {'max_depth': range(1,16,2), 'min_samples_split': np.arange(0.001, 0.011, 0.001)}
    model = GridSearchCV(estimator = RandomForestRegressor(min_samples_split=min_samples_split, 
                                                              min_samples_leaf=min_samples_leaf,
                                                              max_depth=max_depth,
                                                              max_features=max_features,
                                                              random_state=5,n_estimators=270), 
                                                              param_grid = param_grid, n_jobs=4,iid=False,cv=5)
    model.fit(X_train, y_train)
    print('Random Forest...')
    print('Best Params:')
    print(model.best_params_)
#     print('Best CV Score:')
#     print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return  rmsle(y_pred,y_test)

In [31]:
random_forest(X_train,X_test,y_train,y_test)

Random Forest...
Best Params:
{'min_samples_split': 0.001, 'max_depth': 7}


0.1816754848880955

In [32]:
#Third Tuning
def random_forest(Xtrain,Xtest,ytrain,y_test):
    
    X_train = Xtrain
    y_train = ytrain
    
    param_grid = {'min_samples_leaf':range(10,101,10), 'max_features':range(1,10,2)}
    model = GridSearchCV(estimator = RandomForestRegressor(min_samples_split=0.001, 
                                                              min_samples_leaf=min_samples_leaf,
                                                              max_depth=7,
                                                              max_features=max_features,
                                                              random_state=5,n_estimators=270), 
                                                              param_grid = param_grid, n_jobs=4,iid=False,cv=5)
    model.fit(X_train, y_train)
    print('Random Forest...')
    print('Best Params:')
    print(model.best_params_)
#     print('Best CV Score:')
#     print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return  rmsle(y_pred,y_test)

In [33]:
random_forest(X_train,X_test,y_train,y_test)

Random Forest...
Best Params:
{'max_features': 7, 'min_samples_leaf': 10}


0.15693177119790136

In [38]:
#Boosting
min_samples_split = 0.01  #This should be ~0.5-1% of total values. 
min_samples_leaf = 50 #Can be selected based on intuition. This is just used for preventing overfitting 
max_depth = 3
max_features = 'sqrt' #Its a general thumb-rule to start with square root.
subsample = 0.8 #This is a commonly used used start value
learning_rate=0.3

def gboosting(Xtrain,Xtest,ytrain,y_test):
    
    X_train = Xtrain
    y_train = ytrain
    
    param_test1 = {'n_estimators': range(20,501,10)}
    model = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=learning_rate, min_samples_split=min_samples_split, 
                                                              min_samples_leaf=min_samples_leaf,max_depth=max_depth,
                                                              max_features=max_features,
                                                              subsample=subsample,random_state=10), 
                                                              param_grid = param_test1, n_jobs=4,iid=False,cv=5)
    model.fit(X_train, y_train)
    print('Gradient Boosting...')
    print('Best Params:')
    print(model.best_params_)
#     print('Best CV Score:')
#     print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return  rmsle(y_pred,y_test)

In [39]:
gboosting(X_train,X_test,y_train,y_test)

Gradient Boosting...
Best Params:
{'n_estimators': 120}


0.15456727681485227

In [40]:
def gboosting(Xtrain,Xtest,ytrain,y_test):
    
    X_train = Xtrain
    y_train = ytrain
    
    param_test2 = {'max_depth': range(1,16,2), 'min_samples_split': np.arange(0.001, 0.011, 0.001)}
    model = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=learning_rate, min_samples_split=min_samples_split, 
                                                              min_samples_leaf=min_samples_leaf,max_depth=max_depth,
                                                              max_features=max_features,
                                                              subsample=subsample,random_state=10,n_estimators=120), 
                                                              param_grid = param_test2, n_jobs=4,iid=False,cv=5)
    model.fit(X_train, y_train)
    print('Gradient Boosting...')
    print('Best Params:')
    print(model.best_params_)
#     print('Best CV Score:')
#     print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return  rmsle(y_pred,y_test)

In [41]:
gboosting(X_train,X_test,y_train,y_test)

Gradient Boosting...
Best Params:
{'min_samples_split': 0.001, 'max_depth': 3}


0.15456727681485227

In [54]:
def gboosting(Xtrain,Xtest,ytrain,y_test):
    
    X_train = Xtrain
    y_train = ytrain
    
    param_test3 = {'min_samples_leaf':range(10,101,10), 'max_features':range(1,10,1)}
    model = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=learning_rate, min_samples_split=0.001, 
                                                              min_samples_leaf=min_samples_leaf,max_depth=3,
                                                              max_features=max_features,
                                                              subsample=subsample,random_state=10,n_estimators=180), 
                                                              param_grid = param_test3, n_jobs=4,iid=False,cv=5)
    model.fit(X_train, y_train)
    print('Gradient Boosting...')
    print('Best Params:')
    print(model.best_params_)
#     print('Best CV Score:')
#     print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return  rmsle(y_pred,y_test)

In [55]:
gboosting(X_train,X_test,y_train,y_test)

Gradient Boosting...
Best Params:
{'max_features': 4, 'min_samples_leaf': 20}


0.16062335166682012

In [34]:
from sklearn import linear_model
def ridge(Xtrain,Xtest,ytrain,y_test):
    
    X_train = Xtrain
    y_train = ytrain
    
    param_test3 = {'alpha': 10**np.random.uniform(-1,1, size=100)}
    model = GridSearchCV(estimator = linear_model.Ridge(),param_grid = param_test3, n_jobs=4,iid=False,cv=5)
    model.fit(X_train, y_train)
    print('ridge...')
    print('Best Params:')
    print(model.best_params_)
#     print('Best CV Score:')
#     print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return  rmsle(y_pred,y_test)


In [35]:
ridge(X_train,X_test,y_train,y_test)

ridge...
Best Params:
{'alpha': 9.765349084760425}


0.1723350550200072

In [36]:
from sklearn.linear_model import Lasso
def lasso(Xtrain,Xtest,ytrain,y_test):
    
    X_train = Xtrain
    y_train = ytrain
    
    param_test3 = {'alpha': 10**np.random.uniform(-6,1,size=100)}
    model = GridSearchCV(estimator = Lasso(),param_grid = param_test3, n_jobs=4,iid=False,cv=5)
    model.fit(X_train, y_train)
    print('lasso...')
    print('Best Params:')
    print(model.best_params_)
#     print('Best CV Score:')
#     print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return  rmsle(y_pred,y_test)

In [37]:
lasso(X_train,X_test,y_train,y_test)

lasso...
Best Params:
{'alpha': 5.905370906977447}


0.17282906835493972