In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
# Pretty display for notebooks
%matplotlib inline

In [3]:
# Read data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

#check the numbers of samples and features
print("The train data size before dropping Id feature is : {} ".format(train.shape))
print("The test data size before dropping Id feature is : {} ".format(test.shape))

#Save the 'Id' column
train_ID = train['Id']
test_ID = test['Id']

#Now drop the  'Id' colum since it's unnecessary for  the prediction process.
train.drop("Id", axis = 1, inplace = True)
test.drop("Id", axis = 1, inplace = True)

#check again the data size after dropping the 'Id' variable
print("\nThe train data size after dropping Id feature is : {} ".format(train.shape)) 
print("The test data size after dropping Id feature is : {} ".format(test.shape))

The train data size before dropping Id feature is : (1460, 81) 
The test data size before dropping Id feature is : (1459, 80) 

The train data size after dropping Id feature is : (1460, 80) 
The test data size after dropping Id feature is : (1459, 79) 


In [4]:
data = train

In [5]:
# 40% cutoff point
cutoff = len(data) * 2 // 5
cutoff

584

In [6]:
features_drop = []
for col in data.columns.values:
    if data[col].isna().sum() > cutoff:
        data.drop(col, axis=1, inplace=True)
        features_drop.append(col)

In [7]:
print("features drop:", features_drop)

features drop: ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']


In [8]:
labels = data.columns

In [9]:
numeric = []
categoric = []
for col in labels:
    if data[col].dtype == 'object':
        categoric.append(col)
    else:
        numeric.append(col)
numeric.remove('SalePrice')

# Move outliers to other dataframe

In [10]:
temp = data[numeric].copy()
mean_ = temp.mean(axis=0)
std_ = temp.std(axis=0)
temp = (temp.sub(mean_, axis=1)).div(std_, axis=1)

In [11]:
# criteria: nromalize data and pick the ones that are mroe than 3 std away from mean
dataOut = pd.DataFrame(columns=labels)
for i in temp.index.values:
    for col in temp.columns.values:
        if abs(temp.loc[i, col]) > 3:
            dataOut = dataOut.append(data.loc[i])
            data.drop(i, inplace=True)
            break

# Transform categorical features in dummy variables

In [12]:
# normal
data = data.reindex(sorted(data.columns), axis=1)
for category in categoric:
    series = data[category]
    dummies = pd.get_dummies(series, prefix=category)
    data = pd.concat([data, dummies], axis=1)
data.drop(categoric, axis=1, inplace=True)

# outlier
dataOut = dataOut.reindex(sorted(dataOut.columns), axis=1)
for category in categoric:
    series = dataOut[category]
    dummies = pd.get_dummies(series, prefix=category)
    dataOut = pd.concat([dataOut, dummies], axis=1)
dataOut.drop(categoric, axis=1, inplace=True)

# Fill na in numeric categories

In [13]:
# Normal
for num in numeric:
    if data[num].isna().sum() == 0:
        continue
    else:
        data[num].fillna(data[num].median(), inplace=True)
        
# Outlier
for num in numeric:
    if dataOut[num].isna().sum() == 0:
        continue
    else:
        dataOut[num].fillna(dataOut[num].median(), inplace=True)

In [14]:
X = data
Xout = dataOut

# Test data

In [16]:
test_data = test.copy()
test_data.drop(features_drop, axis=1, inplace=True)
test_data = test_data.reindex(sorted(test_data.columns), axis=1)

In [17]:
temp = test_data[numeric].copy()
tempTest = (temp.sub(mean_, axis=1)).div(std_, axis=1)

In [18]:
labels = labels.drop('SalePrice')

In [19]:
testOut = pd.DataFrame(columns=labels)
for i in tempTest.index.values:
    for col in tempTest.columns.values:
        if abs(tempTest.loc[i, col]) > 4:
            testOut = testOut.append(test_data.loc[i])
            test_data.drop(i, inplace=True)
            break

In [20]:
# Normal
for category in categoric:
    series = test_data[category]
    dummies = pd.get_dummies(series, prefix=category)
    test_data = pd.concat([test_data, dummies], axis=1)
    
# Outliers
for category in categoric:
    series = testOut[category]
    dummies = pd.get_dummies(series, prefix=category)
    testOut = pd.concat([testOut, dummies], axis=1)

In [21]:
for num in numeric:
    if test_data[num].isna().sum() == 0:
        continue
    else:
        test_data[num].fillna(test_data[num].median(), inplace=True)
        
for num in numeric:
    if testOut[num].isna().sum() == 0:
        continue
    else:
        testOut[num].fillna(testOut[num].median(), inplace=True)

In [22]:
y = data['SalePrice']
yOut = dataOut['SalePrice']

In [23]:
test_data.drop(categoric, axis=1, inplace=True)
testOut.drop(categoric, axis=1, inplace=True)

In [26]:
true_features = test_data.columns.values.tolist()

In [28]:
dropped = ['Condition2_Artery', 'Condition2_PosA', 'Condition2_PosN', 'RoofStyle_Shed',
           'ExterCond_Ex', 'ExterCond_Po', 'HeatingQC_Po', 'GarageType_2Types']
for f in dropped:
    true_features.remove(f)
    
X = X[true_features]
test_data.drop(dropped, axis=1, inplace=True)


In [31]:
true_features_out = testOut.columns.values.tolist()

In [33]:
dropped = ['LotConfig_FR3', 'Exterior2nd_CBlock']

for f in dropped:
    true_features_out.remove(f)
    
Xout = Xout[true_features_out]
testOut.drop(dropped, axis=1, inplace=True)


In [34]:
Xtest = test_data
XtestOut = testOut

In [35]:
OutInd = XtestOut.index
NormInd = Xtest.index

# GBR

In [36]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

In [37]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [38]:
best_trees = 0
score = 0
for i in [250, 300, 350, 400, 450, 500]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = i, max_depth = 3, min_samples_split = 2,
                                             learning_rate = 0.1, loss = 'ls')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for trees " + str(i) +" is: " + str(temp))
    if temp > score:
        best_trees = i
        score = temp

score for trees 250 is: 0.8875065929802199
score for trees 300 is: 0.8877090180717583
score for trees 350 is: 0.8872412886610966
score for trees 400 is: 0.8868678944179371
score for trees 450 is: 0.8886270967033363
score for trees 500 is: 0.8889393915681604


In [39]:
best_depth = 0
score = 0
for i in [2, 3, 4, 5, 6]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = i, min_samples_split = 2,
                                             learning_rate = 0.1, loss = 'ls')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for depth " + str(i) +" is: " + str(temp))
    if temp > score:
        best_depth = i
        score = temp

score for depth 2 is: 0.8918914337090351
score for depth 3 is: 0.8882955361645992
score for depth 4 is: 0.8913518790072203
score for depth 5 is: 0.8880782642958602
score for depth 6 is: 0.8820401490579778


In [40]:
best_sample_split = 0
score = 0
for i in [2, 3, 4, 5, 6, 7]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = best_depth, min_samples_split = i,
                                             learning_rate = 0.1, loss = 'ls')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for min sample split " + str(i) +" is: " + str(temp))
    if temp > score:
        best_sample_split = i
        score = temp

score for min sample split 2 is: 0.8918049122829474
score for min sample split 3 is: 0.8913629951084051
score for min sample split 4 is: 0.888311996396773
score for min sample split 5 is: 0.8904064226957366
score for min sample split 6 is: 0.890855821000386
score for min sample split 7 is: 0.8896438336254056


In [41]:
print("Best number of estimators:", best_trees)
print("Best depth:", best_depth)
print("Best min sample split:", best_sample_split)

Best number of estimators: 500
Best depth: 2
Best min sample split: 2


In [42]:
clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = best_depth, 
                                         min_samples_split = best_sample_split,
                                         learning_rate = 0.1, loss = 'ls')
clf.fit(X, y)
y_pred = clf.predict(Xtest)
y_predS = pd.Series(y_pred, index=NormInd)

# RIDGE

In [43]:
from sklearn import linear_model

In [50]:
lcv = linear_model.LassoCV(alphas=
                           (27,28,29,30,31,32,33,35), cv=5, max_iter=15000, normalize = True)
lcv.fit(X_train, y_train)
print ("Alpha Value: ", (lcv.alpha_))
lcv.score(X_val, y_val)

Alpha Value:  30


0.9101807859142227

In [51]:
rcv = linear_model.RidgeCV(alphas=(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18),
                           cv=5)
rcv.fit(X_train, y_train)
print ("Alpha Value: ", (rcv.alpha_))
rcv.score(X_val, y_val)

Alpha Value:  15


0.9063779589096186

In [56]:
lcv = linear_model.LassoCV(alphas=(14,18,19,21,22,20,25,27,28,29,30,31,32,33),
                           cv=5,normalize = True)
lcv.fit(X, y)
print ("Alpha Value: ", (lcv.alpha_))

Alpha Value:  19


In [57]:
y_predR = lcv.predict(Xtest)
y_predRS = pd.Series(y_predR, index=NormInd)

# Outliers

# GBR

In [58]:
X_train, X_val, y_train, y_val = train_test_split(Xout, yOut, test_size=0.2)

In [59]:
best_trees = 0
score = 0
for i in [250, 300, 350, 400, 450, 500]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = i, max_depth = 3, min_samples_split = 2,
                                             learning_rate = 0.1, loss = 'ls')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for trees " + str(i) +" is: " + str(temp))
    if temp > score:
        best_trees = i
        score = temp

score for trees 250 is: 0.9095468247227835
score for trees 300 is: 0.9195711969288526
score for trees 350 is: 0.9187735038875045
score for trees 400 is: 0.9038613843456837
score for trees 450 is: 0.9218866928812638
score for trees 500 is: 0.9170296732617242


In [60]:
best_depth = 0
score = 0
for i in [2, 3, 4, 5, 6]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = i, min_samples_split = 2,
                                             learning_rate = 0.1, loss = 'ls')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for depth " + str(i) +" is: " + str(temp))
    if temp > score:
        best_depth = i
        score = temp

score for depth 2 is: 0.9092072719327745
score for depth 3 is: 0.9203562279339116
score for depth 4 is: 0.9073840654182316
score for depth 5 is: 0.9046124696555673
score for depth 6 is: 0.8964740007494987


In [61]:
best_sample_split = 0
score = 0
for i in [2, 3, 4, 5, 6, 7]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = best_depth, min_samples_split = i,
                                             learning_rate = 0.1, loss = 'ls')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for min sample split " + str(i) +" is: " + str(temp))
    if temp > score:
        best_sample_split = i
        score = temp

score for min sample split 2 is: 0.9148792513798673
score for min sample split 3 is: 0.9262278296276024
score for min sample split 4 is: 0.9285791267934222
score for min sample split 5 is: 0.9252843296215632
score for min sample split 6 is: 0.9267992064246494
score for min sample split 7 is: 0.9181407047931781


In [62]:
print("Best number of estimators:", best_trees)
print("Best depth:", best_depth)
print("Best min sample split:", best_sample_split)

Best number of estimators: 450
Best depth: 3
Best min sample split: 4


In [63]:
clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = best_depth, 
                                         min_samples_split = best_sample_split,
                                         learning_rate = 0.1, loss = 'ls')
clf.fit(Xout, yOut)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=4, min_weight_fraction_leaf=0.0,
             n_estimators=450, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [64]:
y_pred_out = clf.predict(XtestOut)

In [65]:
y_pred_outS = pd.Series(y_pred_out, index=OutInd)

# RIDGE

In [67]:
from sklearn import linear_model

In [68]:
lcv = linear_model.LassoCV(alphas=
                           (10,11,12,13,14,15,16,), cv=5, max_iter=15000, normalize = True)
lcv.fit(X_train, y_train)
print ("Alpha Value: ", (lcv.alpha_))
lcv.score(X_val, y_val)

Alpha Value:  16


0.7835499037788914

In [69]:
rcv = linear_model.RidgeCV(alphas=(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18),
                           cv=5)
rcv.fit(X_train, y_train)
print ("Alpha Value: ", (rcv.alpha_))
rcv.score(X_val, y_val)

Alpha Value:  5




0.7861113953864316

In [64]:
rcv = linear_model.RidgeCV(alphas=(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18),
                           cv=5)
rcv.fit(Xout, yOut)
print ("Alpha Value: ", (rcv.alpha_))

Alpha Value:  10


In [65]:
y_predR_out = rcv.predict(XtestOut)
y_predR_outS = pd.Series(y_predR_out, index=OutInd)

In [70]:
yPredCombOut = y_pred_outS # * 1/2 + 1/2 * y_predR_outS

In [71]:
yPredComb = 1/2 * y_predS + 1/2 * y_predRS

In [72]:
yPred = yPredComb.append(yPredCombOut)

# SUBMIT

In [73]:
# create submission file
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = yPred
sub.to_csv('submission.csv',index=False)