In [57]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
#models:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor,\
GradientBoostingRegressor, VotingRegressor, BaggingRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC
from xgboost import XGBRegressor
##
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
import seaborn as sns
import pickle
%matplotlib inline

In [58]:
# Read the train and test datasets
df_train = pd.read_csv("dataset/train_poly.csv")
df_test = pd.read_csv("dataset/test_poly.csv")

cols_train = df_train.columns.tolist()
cols_test = df_test.columns.tolist()

# Train the model only with columns that exist in both the train and the test
# set, keeping the train-set column order. 'Id' is excluded in the same pass,
# so a file without an 'Id' column no longer raises ValueError the way
# list.remove('Id') would.
shared_cols = set(cols_test)  # set lookup instead of scanning the list per column
cols_to_train = [col for col in cols_train if col in shared_cols and col != 'Id']

In [59]:
# Feature frames: the same column selection applied to the train and test data
X_train_i, X_test_i = (frame[cols_to_train] for frame in (df_train, df_test))

# Regression target (train only) and the test-row identifiers
Y_train = df_train['SalePrice']
X_Id = df_test['Id']

In [60]:
# Learn the scaling statistics from the training features only, then apply
# the same fitted scaler to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train_i)
X_train, X_test = scaler.transform(X_train_i), scaler.transform(X_test_i)

In [61]:
# Report the dimensions of the scaled feature matrices and of the target
print("\n".join(
    template.format(data.shape)
    for template, data in (
        ("Training(+validation) set shape : {}", X_train),
        ("Y_train shape : {}", Y_train),
        ("Test set shape : {}", X_test),
    )
))

Training(+validation) set shape : (1448, 244)
Y_train shape : (1448,)
Test set shape : (1459, 244)


In [50]:
# Estimators to evaluate. Every entry must expose the scikit-learn estimator
# interface used by the cells that consume this list (get_params, fit, predict).
# Append further regressors here to compare them against the XGBoost model.
MLA = [
    XGBRegressor(learning_rate=0.01, n_estimators=3460,
                 max_depth=3, min_child_weight=0,
                 gamma=0, subsample=0.7,
                 colsample_bytree=0.7,
                 objective='reg:squarederror',
                 n_jobs=-1,          # was `nthread=-1`; `nthread` is deprecated in the sklearn wrapper
                 scale_pos_weight=1,
                 random_state=27,    # was `seed=27`; `seed` is deprecated in the sklearn wrapper
                 reg_alpha=0.00006)
]

In [66]:
# A leftover interactive help lookup for XGBRegressor was removed from this
# cell so that "Restart & Run All" does not stop to open the help pager.
# Parameter reference: https://xgboost.readthedocs.io/en/stable/python/python_api.html

In [51]:
# Cross-validation splitter (an alternative to a single train_test_split):
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
# 10 independent random splits; each one trains on 80% of the rows and
# validates on the remaining 20%.
cv_split = ShuffleSplit(n_splits=10, test_size=.20, train_size=.80, random_state=0)

# Columns of the model-comparison table. The "Error" columns hold
# 'neg_mean_absolute_error' scores, i.e. the MAE multiplied by -1, so values
# closer to zero are better.
MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Error Mean', 'MLA Test Error Mean',
               'MLA Test Error 3*STD', 'MLA Time']
MLA_rows = []  # one record per evaluated estimator; turned into a DataFrame once at the end

# Table of in-sample predictions: the target plus one column per model.
# Y_train is a Series, so it is converted to a DataFrame first; assigning an
# array to a new key of a Series would not add a prediction column.
MLA_predict = Y_train.copy().to_frame()

# Evaluate every estimator and record its performance
for alg in MLA:

    # name and parameters of the estimator
    MLA_name = alg.__class__.__name__
    MLA_params = str(alg.get_params())

    print("Running the classification on %s" %(MLA_name))

    # score model with cross validation:
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html
    cv_results = cross_validate(alg, X_train, Y_train, cv=cv_split,
                                return_train_score=True,
                                scoring='neg_mean_absolute_error')

    MLA_rows.append({
        'MLA Name': MLA_name,
        'MLA Parameters': MLA_params,
        'MLA Train Error Mean': cv_results['train_score'].mean(),
        'MLA Test Error Mean': cv_results['test_score'].mean(),
        # If the splits are an unbiased random sample and the scores are roughly
        # normally distributed, +/-3 standard deviations around the mean cover
        # about 99.7% of the splits - a "worst case" spread.
        'MLA Test Error 3*STD': cv_results['test_score'].std() * 3,
        'MLA Time': cv_results['fit_time'].mean(),
    })

    # Refit on the full training data and keep the in-sample predictions
    alg.fit(X_train, Y_train)
    MLA_predict[MLA_name] = alg.predict(X_train)


# Build the table in one go and sort it, best (least negative) test score first:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
MLA_compare = (
    pd.DataFrame(MLA_rows, columns=MLA_columns)
    .sort_values(by=['MLA Test Error Mean'], ascending=False)
)
MLA_compare

Running the classification on XGBRegressor


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Error Mean,MLA Test Error Mean,MLA Test Error 3*STD,MLA Time
0,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",-5750.52,-14051.8,1958.99,6.38973


In [62]:
# Fit the first estimator of MLA on all training rows
model = MLA[0]
model.fit(X_train, Y_train)

# Predictions for the training rows and for the test rows (in that order)
preds_train, preds_test = (model.predict(features) for features in (X_train, X_test))

# Result table: test-set ids next to the predicted sale prices, written to CSV
result = pd.DataFrame(
    {
        'Id': X_Id,
        'SalePrice': preds_test,
    }
)
result.to_csv("dataset/result_xgb.csv", index=False)

In [63]:
# Hold out part of the training data for validation (train_test_split's
# default test share is used). random_state is pinned so the split, and any
# metric computed on it, is reproducible across kernel restarts.
X_trainn, X_valid, Y_trainn, Y_valid = train_test_split(X_train, Y_train, random_state=0)

In [64]:
def get_mae_valid(model,X_train, X_valid, Y_train, Y_valid):
    """Fit ``model`` on the training split and score it on the validation split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place, so after the call it holds the parameters
        learned from ``X_train`` / ``Y_train``.
    X_train, Y_train : features and targets used for fitting.
    X_valid, Y_valid : features and targets used for scoring.

    Returns
    -------
    The value of ``mean_absolute_error(Y_valid, predictions)``.
    """
    model.fit(X_train, Y_train)
    valid_predictions = model.predict(X_valid)
    validation_mae = mean_absolute_error(Y_valid, valid_predictions)
    return validation_mae

# Validation MAE of `model`; keyword arguments make the argument order explicit.
# get_mae_valid calls model.fit, so `model` is left fitted on X_trainn / Y_trainn.
print(get_mae_valid(model,
                    X_train=X_trainn, X_valid=X_valid,
                    Y_train=Y_trainn, Y_valid=Y_valid))

14784.280732044199


In [119]:
# TODO:
#     1. Polynomial features
#     2. Grid Search
#     3. Improve ordinal variables
#     4. Feature Selection
#     5. Remove outliers

In [121]:
# A leftover interactive help lookup for GradientBoostingRegressor was removed
# from this cell so that "Restart & Run All" does not stop to open the help pager.
# Parameter reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html