In [None]:
#Steps to follow when working with real world data
#1. Load data
#2. Look at data to remove irrelevant data
#3. Finding missing values 
#4. Understand categories
#5. Pick a model

# Lets look at a real world problem such as home price prediction

In [None]:
#pip install catboost
#pip install xgboost

In [350]:
##Load data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
import catboost as cb

In [351]:
# Load the data
df = pd.read_csv('train.csv')
df2 = pd.read_csv('test.csv')

In [352]:
# Select output target
y = df.SalePrice
x = df.drop(['SalePrice'], axis=1)

In [330]:
#============================DATA HANDLING ============================##

In [331]:
## Handling missing values

In [353]:
from collections import Counter
#num_col = x.loc[:,'MSSubClass':'SaleCondition'].select_dtypes(exclude=['object']).columns
# Outlier detection 

def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.7 * IQR ## increased to 1.7
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers   

# detect outliers 
Outliers_to_drop = detect_outliers(x,2, x.select_dtypes(exclude=['object']))
x.loc[Outliers_to_drop] # Show the outliers rows
x = x.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)
y = y.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)
x.shape

(1356, 80)

In [333]:
y.shape

(1356,)

In [354]:
# before tuning
def basic_details(df):
    b = pd.DataFrame()
    b['Missing value'] = df.isnull().sum()
    b['N unique value'] = df.nunique()
    b['dtype'] = df.dtypes
    return b
basic_details(x)



Unnamed: 0,Missing value,N unique value,dtype
Id,0,1356,int64
MSSubClass,0,15,int64
MSZoning,0,5,object
LotFrontage,230,107,float64
LotArea,0,997,int64
...,...,...,...
MiscVal,0,16,int64
MoSold,0,12,int64
YrSold,0,5,int64
SaleType,0,9,object


In [355]:
df = x

df['MSZoning'].fillna('N')
df['LotFrontage'].fillna(df['LotFrontage'].median(), inplace = True)
df['Alley'].fillna('N',inplace=True)
df['Exterior1st'].fillna('N')
df['Exterior2nd'].fillna('N')
df['Utilities'].fillna('N')
df['MasVnrType'].fillna('N',inplace=True)
df['BsmtFullBath'].fillna(0)
df['BsmtHalfBath'].fillna(0)
df['FullBath'].fillna(0)
df['HalfBath'].fillna(0)
df['KitchenQual'].fillna('N')
df['Functional'].fillna('N')
df['FireplaceQu'].fillna('N',inplace=True)
df['GarageType'].fillna('N',inplace=True)
df['GarageYrBlt'].fillna(0,inplace=True)
df['GarageFinish'].fillna('N',inplace=True)
df['GarageCars'].fillna(0)
df['GarageArea'].fillna(0,inplace=True)
df['GarageQual'].fillna('N',inplace=True)
df['GarageCond'].fillna('N',inplace=True)
df['BsmtFinSF2'].fillna(0,inplace=True)
df['MasVnrArea'].fillna(0,inplace=True)
df['BsmtFinSF1'].fillna(0,inplace=True)
df['SaleType'].fillna('N')
df['BsmtUnfSF'].fillna(0,inplace=True)
df['TotalBsmtSF'].fillna(0,inplace=True)
df['PoolQC'].fillna('N',inplace=True)
df['Fence'].fillna('N',inplace=True)
df['MiscFeature'].fillna('N',inplace=True)
df['BsmtQual'].fillna('N',inplace=True)
df['BsmtCond'].fillna('N',inplace=True)
df['BsmtExposure'].fillna('N',inplace=True)
df['BsmtFinType1'].fillna('N',inplace=True)
df['BsmtFinType2'].fillna('N',inplace=True)
df['Electrical'].fillna('N',inplace=True)
df["AllSF"] = df["GrLivArea"] + df["TotalBsmtSF"]
df['Area'] = df['LotArea']*df['LotFrontage']
df['Area_log'] = np.log1p(df['Area'])

# Number of missing values in each column of training data
missing_val_count_by_column = (df.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])
x= df

Series([], dtype: int64)


In [356]:
df = df2
df['MSZoning'].fillna('N',inplace=True)
df['LotFrontage'].fillna(df['LotFrontage'].median(), inplace = True)
df['Alley'].fillna('N',inplace=True)
df['Exterior1st'].fillna('N',inplace=True)
df['Exterior2nd'].fillna('N',inplace=True)
df['Utilities'].fillna('N',inplace=True)
df['MasVnrType'].fillna('N',inplace=True)
df['BsmtFullBath'].fillna(0,inplace=True)
df['BsmtHalfBath'].fillna(0,inplace=True)
df['FullBath'].fillna(0)
df['HalfBath'].fillna(0)
df['KitchenQual'].fillna('N',inplace=True)
df['Functional'].fillna('N',inplace=True)
df['FireplaceQu'].fillna('N',inplace=True)
df['GarageType'].fillna('N',inplace=True)
df['GarageYrBlt'].fillna(0,inplace=True)
df['GarageFinish'].fillna('N',inplace=True)
df['GarageCars'].fillna(0,inplace=True)
df['GarageArea'].fillna(0,inplace=True)
df['GarageQual'].fillna('N',inplace=True)
df['GarageCond'].fillna('N',inplace=True)
df['BsmtFinSF2'].fillna(0,inplace=True)
df['MasVnrArea'].fillna(0,inplace=True)
df['BsmtFinSF1'].fillna(0,inplace=True)
df['SaleType'].fillna('N',inplace=True)
df['BsmtUnfSF'].fillna(0,inplace=True)
df['TotalBsmtSF'].fillna(0,inplace=True)
df['PoolQC'].fillna('N',inplace=True)
df['Fence'].fillna('N',inplace=True)
df['MiscFeature'].fillna('N',inplace=True)
df['BsmtQual'].fillna('N',inplace=True)
df['BsmtCond'].fillna('N',inplace=True)
df['BsmtExposure'].fillna('N',inplace=True)
df['BsmtFinType1'].fillna('N',inplace=True)
df['BsmtFinType2'].fillna('N',inplace=True)
df['Electrical'].fillna('N',inplace=True)
df["AllSF"] = df["GrLivArea"] + df["TotalBsmtSF"]
df['Area'] = df['LotArea']*df['LotFrontage']
df['Area_log'] = np.log1p(df['Area'])

x2=df2
missing_val_count_by_column = (x2.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

Series([], dtype: int64)


In [337]:
def descrictive_stat_feat(df):
    df = pd.DataFrame(df)
    dcol= [c for c in df.columns if df[c].nunique()>=10]
    d_median = df[dcol].median(axis=0)
    d_mean = df[dcol].mean(axis=0)
    q1 = df[dcol].apply(np.int64).quantile(0.25)
    q3 = df[dcol].apply(np.int64).quantile(0.75)
    
    #Add mean and median column to data set having more then 10 categories
    for c in dcol:
        df[c+str('_median_range')] = (df[c].astype(np.float32).values > d_median[c]).astype(np.int8)
        df[c+str('_mean_range')] = (df[c].astype(np.float32).values > d_mean[c]).astype(np.int8)
        df[c+str('_q1')] = (df[c].astype(np.float32).values < q1[c]).astype(np.int8)
        df[c+str('_q3')] = (df[c].astype(np.float32).values > q3[c]).astype(np.int8)
    return df

df = x
print(df.shape)
numericCols = (df.iloc[:, (np.where((df.dtypes == np.int64) | (df.dtypes == np.float64)))[0]].columns)
numericCols = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'AllSF', 'Area', 'Area_log']

x = descrictive_stat_feat(df[numericCols])
#print(newCols)
#df = df.assign(newCols)
print(x.shape)

df = x2
print(df.shape)
numericCols = (df.iloc[:, (np.where((df.dtypes == np.int64) | (df.dtypes == np.float64)))[0]].columns)
numericCols = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'AllSF', 'Area', 'Area_log']
x2 = descrictive_stat_feat(df[numericCols])
print(x2.shape)



(1356, 83)
(1356, 151)
(1459, 83)
(1459, 151)


  df[c+str('_q1')] = (df[c].astype(np.float32).values < q1[c]).astype(np.int8)
  df[c+str('_q3')] = (df[c].astype(np.float32).values > q3[c]).astype(np.int8)
  df[c+str('_median_range')] = (df[c].astype(np.float32).values > d_median[c]).astype(np.int8)
  df[c+str('_mean_range')] = (df[c].astype(np.float32).values > d_mean[c]).astype(np.int8)
  df[c+str('_q1')] = (df[c].astype(np.float32).values < q1[c]).astype(np.int8)
  df[c+str('_q3')] = (df[c].astype(np.float32).values > q3[c]).astype(np.int8)
  df[c+str('_median_range')] = (df[c].astype(np.float32).values > d_median[c]).astype(np.int8)
  df[c+str('_mean_range')] = (df[c].astype(np.float32).values > d_mean[c]).astype(np.int8)
  df[c+str('_q1')] = (df[c].astype(np.float32).values < q1[c]).astype(np.int8)
  df[c+str('_q3')] = (df[c].astype(np.float32).values > q3[c]).astype(np.int8)
  df[c+str('_median_range')] = (df[c].astype(np.float32).values > d_median[c]).astype(np.int8)
  df[c+str('_mean_range')] = (df[c].astype(np.float32).valu

In [357]:
x.shape

(1356, 83)

In [358]:
# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(x, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [41]:
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]
cols_with_missing
# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Imputation
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer( strategy='constant', fill_value="None")
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

X_train = imputed_X_train_plus
X_valid = imputed_X_valid_plus

In [None]:
## Handling categorial values with ordinal and one-hot encoding

In [359]:
# Categorical columns in the training data
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely ordinal encoded
good_label_cols = [col for col in object_cols if 
                   set(X_valid[col]).issubset(set(X_train[col]))]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))
        
print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)


from sklearn.preprocessing import OrdinalEncoder

# Drop categorical columns that will not be encoded
oe_X_train = X_train.drop(bad_label_cols, axis=1)
oe_X_valid = X_valid.drop(bad_label_cols, axis=1)
oe_x2_test = x2.drop(bad_label_cols, axis=1)

# Apply ordinal encoder 
ordinal_encoder = OrdinalEncoder()
oe_X_train[good_label_cols] = ordinal_encoder.fit_transform(oe_X_train[good_label_cols])
oe_X_valid[good_label_cols] = ordinal_encoder.transform(oe_X_valid[good_label_cols])
oe_x2_test[good_label_cols] = ordinal_encoder.fit_transform(oe_x2_test[good_label_cols])
# Categorical columns in the test data
#object_cols = [col for col in oe_x2_test.columns if oe_x2_test[col].dtype == "object"]
#oe_x2_test[object_cols] = ordinal_encoder.fit_transform(oe_x2_test[object_cols])

Categorical columns that will be ordinal encoded: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'SaleType', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Exterior1st', 'MiscFeature', 'RoofStyle', 'Heating', 'Condition2', 'RoofMatl']


In [360]:
oe_X_train.shape

(1084, 77)

In [361]:
oe_x2_test.shape

(1459, 77)

In [362]:

##One - hot encoding

# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

## Identifying cardinality
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() < 20 and 
                        X_train[cname].dtype == "object"]
low_cardinality_cols



# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 20]

# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

#print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
#print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)

from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(x2[low_cardinality_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
OH_cols_test.index = x2.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
num_X_test = x2.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)


In [398]:
#Model 1 CatBoost
from sklearn.metrics import mean_absolute_error
import catboost as cb
from sklearn.model_selection import GridSearchCV


# Function for comparing different approaches
def cbf(X_train, X_valid, y_train, y_valid, test):
    model = cb.CatBoostRegressor(loss_function='RMSE', silent=True)
    
    
    params = {'iterations': [500],
          'depth': [4, 5, 6],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [10],
#           'eval_metric': ['Accuracy'],
#           'use_best_model': ['True'],
          'random_seed': [42]
         }
    
   # Grid_CBC = GridSearchCV(estimator=model, param_grid = params, cv = 2, n_jobs=-1)
  #  Grid_CBC.fit(X_train, y_train)
    
 #    print("\n The best estimator across ALL searched params:\n",Grid_CBC.best_estimator_)
#    print("\n The best score across ALL searched params:\n",Grid_CBC.best_score_)
#    print("\n The best parameters across ALL searched params:\n",Grid_CBC.best_params_)
    
    model = cb.CatBoostRegressor(loss_function='RMSE', silent=True,
                                iterations=500,
                           depth=4,
                           l2_leaf_reg=1e-20,
                           eval_metric='Accuracy',
                           leaf_estimation_iterations=10,
                           random_seed=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_valid)

    test_preds = model.predict(test)


    return (mean_absolute_error(y_valid, preds), r2_score(y_valid,preds), test_preds)

In [364]:
#Model 2 XGBoost
import xgboost as xgb

def xg(x_train, x_valid, y_train, y_valid, test):

    d_train = xgb.DMatrix(x_train, label=y_train)
    d_valid = xgb.DMatrix(x_valid, label=y_valid)
    d_test = xgb.DMatrix(test)
    params = {
        'objective':'reg:linear',
#         'n_estimators': 50,
        'booster':'gbtree',
        'max_depth':2,
        'eval_metric':'rmse',
        'learning_rate':0.1, 
        'min_child_weight':1,
        'subsample':0.80,
        'colsample_bytree':0.81,
        'seed':45,
        'reg_alpha':1,#1e-03,
        'reg_lambda':0,
        'gamma':0,
        'nthread':-1

    }

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    clf = xgb.train(params, d_train, 2000,  watchlist, early_stopping_rounds=300, maximize=False, verbose_eval=10)
    preds = clf.predict(d_valid)
    test_preds = clf.predict(d_test)
    return (mean_absolute_error(y_valid, preds), r2_score(y_valid,preds), test_preds)

In [399]:
print("MAE from Approach 1 (OE):") 
a,b, oe = cbf(oe_X_train, oe_X_valid, y_train, y_valid, oe_x2_test)
print(a,b,oe)

MAE from Approach 1 (OE):


CatBoostError: catboost/private/libs/target/target_converter.cpp:379: Target with classes must contain only 2 unique values for binary classification

In [393]:
#Deleting uncommon columns
a = OH_X_train.columns.intersection(OH_X_test.columns) 
print(type(OH_X_test.columns))

c = [ element for element in OH_X_train.columns if element not in OH_X_test.columns] 
d = [ element for element in OH_X_test.columns if element not in OH_X_train.columns] 
print(c)
OH_X_test = OH_X_test.reindex(columns=OH_X_train.columns)

print("MAE from Approach 1 (OH):") 
a,b, OH = cbf(OH_X_train, OH_X_valid, y_train, y_valid, OH_X_test)
print(a,b,OH)



<class 'pandas.core.indexes.base.Index'>
[]
MAE from Approach 1 (OH):
 Results from Grid Search 

 The best estimator across ALL searched params:
 <catboost.core.CatBoostRegressor object at 0x284269220>

 The best score across ALL searched params:
 0.8990520580428699

 The best parameters across ALL searched params:
 {'depth': 4, 'iterations': 500, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 10, 'random_seed': 42}
13656.178898672426 0.9280996016805295 [127779.43740156 164239.04434433 187817.00157925 ... 178184.84615299
 117657.39765796 233055.4372881 ]


In [367]:
print("MAE from Approach 1 (OE):") 
print( xg(oe_X_train, oe_X_valid, y_train, y_valid, oe_x2_test))

print("MAE from Approach 1 (OE):") 
print(xg(OH_X_train, OH_X_valid, y_train, y_valid, OH_X_test))

MAE from Approach 1 (OE):
[0]	train-rmse:172248.92719	valid-rmse:180236.17487
[10]	train-rmse:67299.78653	valid-rmse:72052.10328
[20]	train-rmse:34121.78279	valid-rmse:38317.44767
[30]	train-rmse:24486.95169	valid-rmse:29432.92648
[40]	train-rmse:21290.14306	valid-rmse:26772.89963
[50]	train-rmse:19657.25702	valid-rmse:25858.11881
[60]	train-rmse:18564.76885	valid-rmse:25232.49609
[70]	train-rmse:17705.37334	valid-rmse:24717.77149
[80]	train-rmse:17084.35546	valid-rmse:24444.88362
[90]	train-rmse:16572.50259	valid-rmse:24141.12435
[100]	train-rmse:16047.75333	valid-rmse:23777.37514
[110]	train-rmse:15619.82576	valid-rmse:23676.70320
[120]	train-rmse:15138.33283	valid-rmse:23429.04089
[130]	train-rmse:14801.22122	valid-rmse:23435.74399




[140]	train-rmse:14440.84537	valid-rmse:23235.20309
[150]	train-rmse:14139.71990	valid-rmse:23089.31826
[160]	train-rmse:13904.64635	valid-rmse:23009.35740
[170]	train-rmse:13674.93463	valid-rmse:22902.63028
[180]	train-rmse:13440.65882	valid-rmse:22798.05267
[190]	train-rmse:13229.27021	valid-rmse:22868.35105
[200]	train-rmse:13004.53606	valid-rmse:22789.18748
[210]	train-rmse:12815.40017	valid-rmse:22798.48955
[220]	train-rmse:12623.56237	valid-rmse:22797.75444
[230]	train-rmse:12418.67460	valid-rmse:22690.13044
[240]	train-rmse:12231.43755	valid-rmse:22630.57117
[250]	train-rmse:12018.91891	valid-rmse:22588.66131
[260]	train-rmse:11858.89556	valid-rmse:22542.14932
[270]	train-rmse:11735.77633	valid-rmse:22552.80823
[280]	train-rmse:11576.36169	valid-rmse:22484.62621
[290]	train-rmse:11407.59172	valid-rmse:22470.51602
[300]	train-rmse:11272.17972	valid-rmse:22427.42714
[310]	train-rmse:11122.26951	valid-rmse:22444.05566
[320]	train-rmse:10966.50057	valid-rmse:22388.66245
[330]	train-

[1730]	train-rmse:3511.10896	valid-rmse:21328.30255
[1740]	train-rmse:3486.12742	valid-rmse:21330.78431
[1750]	train-rmse:3466.94563	valid-rmse:21345.60339
[1760]	train-rmse:3442.24622	valid-rmse:21351.62813
[1770]	train-rmse:3423.59283	valid-rmse:21345.55541
[1780]	train-rmse:3400.67721	valid-rmse:21344.95492
[1790]	train-rmse:3380.54623	valid-rmse:21347.84769
[1800]	train-rmse:3360.41301	valid-rmse:21334.08541
[1810]	train-rmse:3338.23958	valid-rmse:21347.30748
[1820]	train-rmse:3317.42668	valid-rmse:21331.37899
[1830]	train-rmse:3290.11859	valid-rmse:21343.82060
[1840]	train-rmse:3270.52087	valid-rmse:21341.43758
[1849]	train-rmse:3252.71661	valid-rmse:21340.57331
(14094.901640050552, 0.9262188927320376, array([130762.04, 164400.8 , 190541.88, ..., 176782.62, 123190.79,
       222714.34], dtype=float32))
MAE from Approach 1 (OE):
[0]	train-rmse:172248.92719	valid-rmse:180236.17487
[10]	train-rmse:67448.79424	valid-rmse:72680.75035
[20]	train-rmse:34138.94363	valid-rmse:38764.54910
[



[100]	train-rmse:15791.08464	valid-rmse:23528.61373
[110]	train-rmse:15336.67062	valid-rmse:23281.94037
[120]	train-rmse:14931.61472	valid-rmse:23206.14722
[130]	train-rmse:14619.13490	valid-rmse:23094.87331
[140]	train-rmse:14284.91413	valid-rmse:22955.45410
[150]	train-rmse:13988.11845	valid-rmse:22883.24823
[160]	train-rmse:13781.16420	valid-rmse:22780.54534
[170]	train-rmse:13535.99009	valid-rmse:22699.52227
[180]	train-rmse:13332.03342	valid-rmse:22713.07109
[190]	train-rmse:13157.96851	valid-rmse:22643.21445
[200]	train-rmse:12954.94097	valid-rmse:22632.56334
[210]	train-rmse:12764.04849	valid-rmse:22608.00666
[220]	train-rmse:12566.68327	valid-rmse:22556.61613
[230]	train-rmse:12411.19316	valid-rmse:22548.66560
[240]	train-rmse:12253.39909	valid-rmse:22569.84701
[250]	train-rmse:12065.57803	valid-rmse:22506.83030
[260]	train-rmse:11889.53005	valid-rmse:22524.30199
[270]	train-rmse:11731.94100	valid-rmse:22573.01860
[280]	train-rmse:11569.33022	valid-rmse:22470.42228
[290]	train-

In [368]:
id2= oe_x2_test.Id
idcol = id2.to_numpy()
print(len(idcol))
print(len(oe))
import numpy as np
print (np.concatenate((idcol,oe),axis=0))
sub = pd.DataFrame({'Id':idcol, 'SalePrice':oe})
print(sub)
sub.to_csv("submission.csv")

1459
1459
[  1461.           1462.           1463.         ... 166824.75855992
 115810.93905396 235515.12760543]
        Id      SalePrice
0     1461  126753.768159
1     1462  162570.546116
2     1463  180942.599352
3     1464  190389.439780
4     1465  184378.625096
...    ...            ...
1454  2915   81739.063200
1455  2916   83960.371853
1456  2917  166824.758560
1457  2918  115810.939054
1458  2919  235515.127605

[1459 rows x 2 columns]


In [None]:
print("MAE from Approach 1 (OE):") 
print(rf(oe_X_train, oe_X_valid, y_train, y_valid))
print("MAE from Approach 3 (One-Hot Encoding):") 
print(rf(OH_X_train, OH_X_valid, y_train, y_valid))

In [None]:
print("MAE from Approach 1 (OE):") 
print(cbf(oe_X_train, oe_X_valid, y_train, y_valid))

In [None]:
print("MAE from Approach 3 (One-Hot Encoding):") 
print(cbf(OH_X_train, OH_X_valid, y_train, y_valid))

In [None]:
#Model 3 XGBoost

def myAutoXG(colsample, lr, md, al, ests):
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = colsample, learning_rate = lr,
                max_depth = md, alpha = al, n_estimators = ests)
    xg_reg.fit(oe_X_train,y_train)

    y_pred = xg_reg.predict(oe_X_valid)

    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    #print("Depth - 500, est : " + str(ests) + " lr = ",  lr ," RMSE: %f" % (rmse) + "R2 Score = " + str(r2_score(y_valid,y_pred)))
    return r2_score(y_valid,y_pred)

In [None]:
bestScore = 0
for colsample in np.arange(0.1, 0.99, 0.1):
    for lr in np.arange (0.44,1, 0.1):
         for md in range (100,1000, 10):
            for al in range (1, 100,10):
                for ests in range (100, 500, 10):
                        newScore = myAutoXG(colsample, lr, md, al, ests)
                        if bestScore + 0.1 < newScore:
                            bestScore = newScore
                            print (colsample, lr, md, al, ests, newScore)
print("\n")

In [None]:
eval_set = [(x_train, y_train), (x_test, y_test)]
eval_metric = ["error", "rmse"]
%time xg_reg.fit(x_train, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=True)
