In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
# Pretty display for notebooks
%matplotlib inline

In [2]:
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
from scipy import stats
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn

In [3]:
# Read data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

#check the numbers of samples and features
print("The train data size before dropping Id feature is : {} ".format(train.shape))
print("The test data size before dropping Id feature is : {} ".format(test.shape))

#Save the 'Id' column
train_ID = train['Id']
test_ID = test['Id']

#Now drop the  'Id' colum since it's unnecessary for  the prediction process.
train.drop("Id", axis = 1, inplace = True)
test.drop("Id", axis = 1, inplace = True)

#check again the data size after dropping the 'Id' variable
print("\nThe train data size after dropping Id feature is : {} ".format(train.shape)) 
print("The test data size after dropping Id feature is : {} ".format(test.shape))

The train data size before dropping Id feature is : (1460, 81) 
The test data size before dropping Id feature is : (1459, 80) 

The train data size after dropping Id feature is : (1460, 80) 
The test data size after dropping Id feature is : (1459, 79) 


In [4]:
y = train["SalePrice"]

In [5]:
data = train

In [6]:
len(data) 

1460

In [7]:
len(test)

1459

In [8]:
# 50% cutoff point
cutoff = len(data) // 3
cutoff

486

In [9]:
features_drop = []
features_filled_std = {}
for col in data.columns.values:
    if data[col].isna().sum() > cutoff:
        data.drop(col, axis=1, inplace=True)
        features_drop.append(col)

In [10]:
print("features drop:", features_drop)

features drop: ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']


In [11]:
features = data.columns.values
numeric = []
categoric = []
for col in features:
    if data[col].dtype == 'object':
        categoric.append(col)
    else:
        numeric.append(col)
numeric.remove('SalePrice')

In [12]:
# Transform categorical features in dummy variables

In [13]:
data = data.reindex(sorted(data.columns), axis=1)
for category in categoric:
    series = data[category]
    dummies = pd.get_dummies(series, prefix=category)
    data = pd.concat([data, dummies], axis=1)
data.drop(categoric, axis=1, inplace=True)

In [14]:
# Fill na in numeric categories

In [15]:
for num in numeric:
    if data[num].isna().sum() == 0:
        continue
    else:
        data[num].fillna(data[num].median(), inplace=True)

In [16]:
# check dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 271 entries, 1stFlrSF to SaleCondition_Partial
dtypes: float64(3), int64(34), uint8(234)
memory usage: 755.7 KB


In [17]:
X = data.copy()

In [18]:
# Test data

In [19]:
testDum = test.copy()
testDum = testDum.reindex(sorted(testDum.columns), axis=1)
testDum.drop(features_drop, axis=1, inplace=True)
for category in categoric:
    series = testDum[category]
    dummies = pd.get_dummies(series, prefix=category)
    testDum = pd.concat([testDum, dummies], axis=1)

for num in numeric:
    if testDum[num].isna().sum() == 0:
        continue
    else:
        testDum[num].fillna(testDum[num].median(), inplace=True)

In [20]:
# testDum.drop(dropped_corr, axis=1, inplace=True)
temp = data[numeric]
std_devs = temp.std(axis=0)
means = temp.mean(axis=0)
norm = (testDum[numeric].sub(means,axis=1)).div(std_devs, axis=1)

In [21]:
outlie = []
for col in norm.columns.values:
    if sum(norm[col] > 5) > 5:
        outlie.append(col)
        print(col)

MasVnrArea
BsmtFinSF2
LowQualFinSF
EnclosedPorch
3SsnPorch
MiscVal


In [22]:
testDum.drop(outlie, axis=1, inplace=True)

In [23]:
Xtest = testDum.drop(categoric, axis=1)

In [24]:
true_features = Xtest.columns.values.tolist()

In [25]:
X = X[true_features]

# GBR

In [26]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

In [27]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [28]:
best_trees = 0
score = 0
for i in [250, 300, 350, 400, 450, 500]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = i, max_depth = 3, min_samples_split = 2,
                                             learning_rate = 0.1, loss = 'huber')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for trees " + str(i) +" is: " + str(temp))
    if temp > score:
        best_trees = i
        score = temp

score for trees 250 is: 0.9070798256174872
score for trees 300 is: 0.9135624822980495
score for trees 350 is: 0.919113111467827
score for trees 400 is: 0.9152713390050683
score for trees 450 is: 0.9153292456673144
score for trees 500 is: 0.9137634724190284


In [29]:
best_depth = 0
score = 0
for i in [2, 3, 4, 5, 6]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = i, min_samples_split = 2,
                                             learning_rate = 0.1, loss = 'huber')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for depth " + str(i) +" is: " + str(temp))
    if temp > score:
        best_depth = i
        score = temp

score for depth 2 is: 0.9310828772894992
score for depth 3 is: 0.9066226340874908
score for depth 4 is: 0.9108830306978162
score for depth 5 is: 0.9090833955433468
score for depth 6 is: 0.8861720169538837


In [30]:
best_sample_split = 0
score = 0
for i in [2, 3, 4, 5, 6, 7]:
    clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = best_depth, min_samples_split = i,
                                             learning_rate = 0.1, loss = 'huber')
    clf.fit(X_train, y_train)
    temp = clf.score(X_val, y_val)
    print("score for min sample split " + str(i) +" is: " + str(temp))
    if temp > score:
        best_sample_split = i
        score = temp

score for min sample split 2 is: 0.9245000374842152
score for min sample split 3 is: 0.9330010809079379
score for min sample split 4 is: 0.9313051245240195
score for min sample split 5 is: 0.9296100703353748
score for min sample split 6 is: 0.9311054726462186
score for min sample split 7 is: 0.9279808535683033


In [31]:
print("Best number of estimators:", best_trees)
print("Best depth:", best_depth)
print("Best min sample split:", best_sample_split)

Best number of estimators: 350
Best depth: 2
Best min sample split: 3


In [32]:
clf = ensemble.GradientBoostingRegressor(n_estimators = best_trees, max_depth = best_depth, 
                                         min_samples_split = best_sample_split,
                                         learning_rate = 0.1, loss = 'ls')
clf.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=3, min_weight_fraction_leaf=0.0,
             n_estimators=350, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [33]:
y_pred = clf.predict(Xtest)

In [34]:
import pickle

In [35]:
filename = 'GBR_best.sav'
pickle.dump(clf, open(filename, 'wb'))

# RIDGE

In [36]:
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

In [37]:
rcv = linear_model.RidgeCV(alphas=(12,13,14,15,16,17,18),
                           cv=5)
rcv.fit(X, y)
print ("Alpha Value: ", (rcv.alpha_))

Alpha Value:  15


In [38]:
ecv = make_pipeline(RobustScaler(), linear_model.ElasticNetCV(alphas=(0.001,0.0009,0.0008,0.0007, 0.0006, 0.0005),
                           cv=5))
ecv.fit(X, y)

Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('elasticnetcv', ElasticNetCV(alphas=(0.001, 0.0009, 0.0008, 0.0007, 0.0006, 0.0005),
       copy_X=True, cv=5, eps=0.001, fit_intercept=True, l1_ratio=0.5,
       max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
       positive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=0.0001, verbose=0))])

In [39]:
y_predR = rcv.predict(Xtest)

In [40]:
y_predE = ecv.predict(Xtest)

In [41]:
filename = 'Ridge_best.sav'
pickle.dump(rcv, open(filename, 'wb'))

filename = 'ENet_best.sav'
pickle.dump(ecv, open(filename, 'wb'))

# SUBMIT

In [42]:
y_comb = 1/2 * y_pred + 1/4 * y_predR + 1/4* y_predE

In [43]:
# create submission file
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = y_comb
sub.to_csv('submission.csv',index=False)

In [44]:
y_comb

array([111094.60291707, 159413.13451673, 177492.22590747, ...,
       160585.9437055 , 113074.09831543, 232392.54925399])