In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
# Pretty display for notebooks
%matplotlib inline

In [2]:
# Read data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

#check the numbers of samples and features
print("The train data size before dropping Id feature is : {} ".format(train.shape))
print("The test data size before dropping Id feature is : {} ".format(test.shape))

#Save the 'Id' column
train_ID = train['ID']
test_ID = test['ID']

#Now drop the  'Id' colum since it's unnecessary for  the prediction process.
train.drop("ID", axis = 1, inplace = True)
test.drop("ID", axis = 1, inplace = True)

#check again the data size after dropping the 'Id' variable
print("\nThe train data size after dropping Id feature is : {} ".format(train.shape)) 
print("The test data size after dropping Id feature is : {} ".format(test.shape))

The train data size before dropping Id feature is : (2051, 81) 
The test data size before dropping Id feature is : (879, 80) 

The train data size after dropping Id feature is : (2051, 80) 
The test data size after dropping Id feature is : (879, 79) 


In [3]:
dataDum = train.copy()

In [4]:
# 40% cutoff point
cutoff = len(dataDum) * 2 // 5
cutoff

820

In [5]:
features_drop = []
features_filled_std = {}
for col in dataDum.columns.values:
    if dataDum[col].isna().sum() > cutoff:
        dataDum.drop(col, axis=1, inplace=True)
        features_drop.append(col)

In [6]:
print("features drop:", features_drop)

features drop: ['FenceQuality', 'Misc Feature', 'FireplaceQuality', 'PoolQuality', 'TypeOfAlleyAccess']


In [7]:
labels = dataDum.columns

In [8]:
numeric = []
categoric = []
for col in labels:
    if dataDum[col].dtype == 'object':
        categoric.append(col)
    else:
        numeric.append(col)
numeric.remove('SalePrice')

# Move outliers to other dataframe

In [9]:
temp = dataDum[numeric].copy()
mean_ = temp.mean(axis=0)
std_ = temp.std(axis=0)
temp = (temp.sub(mean_, axis=1)).div(std_, axis=1)

In [10]:
dataOut = pd.DataFrame(columns=labels)
for i in temp.index.values:
    for col in temp.columns.values:
        if abs(temp.loc[i, col]) > 3:
            dataOut = dataOut.append(dataDum.loc[i])
            dataDum.drop(i, inplace=True)
            break

# Transform categorical features in dummy variables

In [11]:
# normal
dataDum = dataDum.reindex(sorted(dataDum.columns), axis=1)
for category in categoric:
    series = dataDum[category]
    dummies = pd.get_dummies(series, prefix=category)
    dataDum = pd.concat([dataDum, dummies], axis=1)
dataDum.drop(categoric, axis=1, inplace=True)

# outlier
dataOut = dataOut.reindex(sorted(dataOut.columns), axis=1)
for category in categoric:
    series = dataOut[category]
    dummies = pd.get_dummies(series, prefix=category)
    dataOut = pd.concat([dataOut, dummies], axis=1)
dataOut.drop(categoric, axis=1, inplace=True)

# Fill na in numeric categories

In [12]:
# Normal
for num in numeric:
    if dataDum[num].isna().sum() == 0:
        continue
    else:
        dataDum[num].fillna(dataDum[num].median(), inplace=True)
        
# Outlier
for num in numeric:
    if dataOut[num].isna().sum() == 0:
        continue
    else:
        dataOut[num].fillna(dataOut[num].median(), inplace=True)

In [13]:
# check dataset
dataDum.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1451 entries, 1 to 2050
Columns: 258 entries, 1stFloorArea to GarageType_Detchd
dtypes: float64(11), int64(26), uint8(221)
memory usage: 743.9 KB


In [14]:
dataOut.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 0 to 2048
Columns: 266 entries, 1stFloorArea to GarageType_Detchd
dtypes: float64(11), object(26), uint8(229)
memory usage: 312.3+ KB


In [15]:
X = dataDum
Xout = dataOut

# Test data

In [16]:
testDum = test.copy()
testDum.drop(features_drop, axis=1, inplace=True)
testDum = testDum.reindex(sorted(testDum.columns), axis=1)

In [17]:
temp = testDum[numeric].copy()
tempTest = (temp.sub(mean_, axis=1)).div(std_, axis=1)

In [18]:
labels = labels.drop('SalePrice')

In [19]:
testOut = pd.DataFrame(columns=labels)
for i in tempTest.index.values:
    for col in tempTest.columns.values:
        if abs(tempTest.loc[i, col]) > 4:
            testOut = testOut.append(testDum.loc[i])
            testDum.drop(i, inplace=True)
            break

In [20]:
# Normal
for category in categoric:
    series = testDum[category]
    dummies = pd.get_dummies(series, prefix=category)
    testDum = pd.concat([testDum, dummies], axis=1)
    
# Outliers
for category in categoric:
    series = testOut[category]
    dummies = pd.get_dummies(series, prefix=category)
    testOut = pd.concat([testOut, dummies], axis=1)

In [21]:
for num in numeric:
    if testDum[num].isna().sum() == 0:
        continue
    else:
        testDum[num].fillna(testDum[num].median(), inplace=True)
        
for num in numeric:
    if testOut[num].isna().sum() == 0:
        continue
    else:
        testOut[num].fillna(testOut[num].median(), inplace=True)

In [22]:
y = dataDum['SalePrice']
yOut = dataOut['SalePrice']

In [23]:
testDum.drop(categoric, axis=1, inplace=True)
testOut.drop(categoric, axis=1, inplace=True)

In [33]:
true_features = testDum.columns.values.tolist()

In [34]:
dropped = ['KitchenQuality_Po', 'Neighborhood_Landmrk', 'Exterior1_PreCast', 'Exterior2_PreCast',
           'Exterior2_Stone',
           'ProximityToMainRoad2_PosN', 'HeatingQuality_Po', 'HeatingType_Wall']
for f in dropped:
    true_features.remove(f)
    
X = X[true_features]
testDum.drop(dropped, axis=1, inplace=True)


In [35]:
true_features_out = testOut.columns.values.tolist()

In [36]:
dropped = ['ProximityToMainRoad2_Artery', 'ProximityToMainRoad2_PosA',
           'ProximityToMainRoad2_RRAe', 'SaleType_Con', 'SaleType_VWD',
           'RoofMaterial_Roll', 'Foundation_Wood', 'HouseStyle_1.5Unf',
           'Functional_Sal', 'ExteriorCond_Po', 'GarageQuality_Po', 'Exterior2_CBlock']

for f in dropped:
    true_features_out.remove(f)
    
Xout = Xout[true_features_out]
testOut.drop(dropped, axis=1, inplace=True)


In [37]:
Xtest = testDum
XtestOut = testOut

In [38]:
OutInd = XtestOut.index
NormInd = Xtest.index

# GBR

In [39]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

In [40]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [41]:
best_trees = 0
score = 0
for i in [250, 300, 350, 400, 450, 500]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = i, max_depth = 3, min_samples_split = 2,
                                             learning_rate = 0.1, loss = 'ls')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for trees " + str(i) +" is: " + str(temp))
    if temp > score:
        best_trees = i
        score = temp

score for trees 250 is: 0.9316952001726577
score for trees 300 is: 0.9320355860355257
score for trees 350 is: 0.9332977650969084
score for trees 400 is: 0.932439864478728
score for trees 450 is: 0.9321421667950491
score for trees 500 is: 0.9323002339052194


In [42]:
best_depth = 0
score = 0
for i in [2, 3, 4, 5, 6]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = i, min_samples_split = 2,
                                             learning_rate = 0.1, loss = 'ls')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for depth " + str(i) +" is: " + str(temp))
    if temp > score:
        best_depth = i
        score = temp

score for depth 2 is: 0.92706857100626
score for depth 3 is: 0.9326548749815072
score for depth 4 is: 0.9188135548680502
score for depth 5 is: 0.9208962466849677
score for depth 6 is: 0.9176621483289653


In [43]:
best_sample_split = 0
score = 0
for i in [2, 3, 4, 5, 6, 7]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = best_depth, min_samples_split = i,
                                             learning_rate = 0.1, loss = 'ls')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for min sample split " + str(i) +" is: " + str(temp))
    if temp > score:
        best_sample_split = i
        score = temp

score for min sample split 2 is: 0.9326637377662864
score for min sample split 3 is: 0.9316223264370375
score for min sample split 4 is: 0.9304996279509723
score for min sample split 5 is: 0.9296725380151528
score for min sample split 6 is: 0.930580156621935
score for min sample split 7 is: 0.9309016468283435


In [44]:
print("Best number of estimators:", best_trees)
print("Best depth:", best_depth)
print("Best min sample split:", best_sample_split)

Best number of estimators: 350
Best depth: 3
Best min sample split: 2


In [45]:
clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = best_depth, 
                                         min_samples_split = best_sample_split,
                                         learning_rate = 0.1, loss = 'ls')
clf.fit(X, y)
y_pred = clf.predict(Xtest)
y_predS = pd.Series(y_pred, index=NormInd)

# RIDGE

In [46]:
from sklearn import linear_model

In [47]:
lcv = linear_model.LassoCV(alphas=
                           (10,11,12,13,14,15,16,), cv=5, max_iter=15000, normalize = True)
lcv.fit(X_train, y_train)
print ("Alpha Value: ", (lcv.alpha_))
lcv.score(X_val, y_val)

Alpha Value:  15


0.9324735407284117

In [48]:
rcv = linear_model.RidgeCV(alphas=(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18),
                           cv=5)
rcv.fit(X_train, y_train)
print ("Alpha Value: ", (rcv.alpha_))
rcv.score(X_val, y_val)

Alpha Value:  9


0.9332068873472752

In [49]:
rcv = linear_model.RidgeCV(alphas=(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18),
                           cv=5)
rcv.fit(X, y)
print ("Alpha Value: ", (rcv.alpha_))

Alpha Value:  8


In [50]:
y_predR = rcv.predict(Xtest)
y_predRS = pd.Series(y_predR, index=NormInd)

In [51]:
y_predL = lcv.predict(Xtest)

# Outliers

# GBR

In [52]:
X_train, X_val, y_train, y_val = train_test_split(Xout, yOut, test_size=0.2)

In [53]:
best_trees = 0
score = 0
for i in [250, 300, 350, 400, 450, 500]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = i, max_depth = 3, min_samples_split = 2,
                                             learning_rate = 0.1, loss = 'ls')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for trees " + str(i) +" is: " + str(temp))
    if temp > score:
        best_trees = i
        score = temp

score for trees 250 is: 0.866190970887135
score for trees 300 is: 0.8587037838815303
score for trees 350 is: 0.858681544319467
score for trees 400 is: 0.8630746369126107
score for trees 450 is: 0.8598427539293414
score for trees 500 is: 0.8597404079054817


In [54]:
best_depth = 0
score = 0
for i in [2, 3, 4, 5, 6]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = i, min_samples_split = 2,
                                             learning_rate = 0.1, loss = 'ls')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for depth " + str(i) +" is: " + str(temp))
    if temp > score:
        best_depth = i
        score = temp

score for depth 2 is: 0.8723962248280464
score for depth 3 is: 0.8597613280585542
score for depth 4 is: 0.8198999125583875
score for depth 5 is: 0.8362219754192045
score for depth 6 is: 0.8460751751635959


In [55]:
best_sample_split = 0
score = 0
for i in [2, 3, 4, 5, 6, 7]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = best_depth, min_samples_split = i,
                                             learning_rate = 0.1, loss = 'ls')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for min sample split " + str(i) +" is: " + str(temp))
    if temp > score:
        best_sample_split = i
        score = temp

score for min sample split 2 is: 0.8659851694145917
score for min sample split 3 is: 0.8718053168498116
score for min sample split 4 is: 0.8725249601852895
score for min sample split 5 is: 0.864851713511851
score for min sample split 6 is: 0.8721617153756607
score for min sample split 7 is: 0.8684403878356588


In [56]:
print("Best number of estimators:", best_trees)
print("Best depth:", best_depth)
print("Best min sample split:", best_sample_split)

Best number of estimators: 250
Best depth: 2
Best min sample split: 4


In [57]:
clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = best_depth, 
                                         min_samples_split = best_sample_split,
                                         learning_rate = 0.1, loss = 'ls')
clf.fit(Xout, yOut)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=4, min_weight_fraction_leaf=0.0,
             n_estimators=250, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [58]:
y_pred_out = clf.predict(XtestOut)

In [59]:
y_pred_outS = pd.Series(y_pred_out, index=OutInd)

In [60]:
XtestOut.columns[36]

'Paved Drive_N'

# RIDGE

In [61]:
from sklearn import linear_model

In [62]:
lcv = linear_model.LassoCV(alphas=
                           (10,11,12,13,14,15,16,), cv=5, max_iter=15000, normalize = True)
lcv.fit(X_train, y_train)
print ("Alpha Value: ", (lcv.alpha_))
lcv.score(X_val, y_val)

Alpha Value:  16


0.8220111587866262

In [63]:
rcv = linear_model.RidgeCV(alphas=(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18),
                           cv=5)
rcv.fit(X_train, y_train)
print ("Alpha Value: ", (rcv.alpha_))
rcv.score(X_val, y_val)

Alpha Value:  4


0.8267227494918039

In [64]:
rcv = linear_model.RidgeCV(alphas=(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18),
                           cv=5)
rcv.fit(Xout, yOut)
print ("Alpha Value: ", (rcv.alpha_))

Alpha Value:  10


In [65]:
y_predR_out = rcv.predict(XtestOut)
y_predR_outS = pd.Series(y_predR_out, index=OutInd)

In [72]:
yPredCombOut = 1/2 * y_pred_outS + 1/2 * y_predR_outS

In [73]:
yPredComb = 1/2 * y_predS + 1/2 * y_predRS

In [74]:
yPred = yPredComb.append(yPredCombOut)

# SUBMIT

In [75]:
# create submission file
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = yPred
sub.to_csv('submission.csv',index=False)