In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/housing-prices-competition-for-kaggle-learn-users/train.csv
/kaggle/input/housing-prices-competition-for-kaggle-learn-users/test.csv


In [2]:
import pandas as pd

# Read the data
X_train = pd.read_csv('../input/housing-prices-competition-for-kaggle-learn-users/train.csv', index_col='Id') 
X_test = pd.read_csv('../input/housing-prices-competition-for-kaggle-learn-users/test.csv', index_col='Id')

print(X_train.shape)
print(X_test.shape)

#Remove rows with missing target, separate target from predictors
X_train.dropna(axis=0, subset=['SalePrice'], inplace=True)
print(X_train.shape)

#Separate target from predictors (i.e. input features)
y_train = X_train.SalePrice
X_train.drop(['SalePrice'], axis=1, inplace=True)

#Perform ordinal encoding to categorical feature columns 
from sklearn.preprocessing import OrdinalEncoder

# Get the columns that contain strings and treat them as categorical
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely ordinal encoded
good_label_cols = [col for col in object_cols if 
                   set(X_test[col]).issubset(set(X_train[col]))]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))

print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

# Drop categorical columns that will not be encoded
X_train_ordinal = X_train.drop(bad_label_cols, axis=1)
X_test_ordinal = X_test.drop(bad_label_cols, axis=1)

# Apply ordinal encoder 
ordinal_encoder = OrdinalEncoder() # Your code here
X_train_ordinal[good_label_cols] = ordinal_encoder.fit_transform(X_train_ordinal[good_label_cols])
X_test_ordinal[good_label_cols] = ordinal_encoder.transform(X_test_ordinal[good_label_cols])

#fill missing values in train and test sets using IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer 

final_imputer = IterativeImputer(max_iter=3, random_state=0) 
X_train_imputed = final_imputer.fit_transform(X_train_ordinal)
X_test_imputed = final_imputer.transform(X_test_ordinal)

(1460, 80)
(1459, 79)
(1460, 80)
Categorical columns that will be ordinal encoded: ['Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['KitchenQual', 'Exterior2nd', 'Functional', 'SaleType', 'Exterior1st', 'MSZoning', 'Utilities']


In [3]:
#Question 1a
import xgboost as xgb

model = xgb.XGBRegressor(n_jobs=-1, random_state=0)

print(model.n_estimators)

100


default value of n_estimators=100      default value of learning_rate=0.1

In [4]:
#Question 1b

import xgboost as xgb
import pandas as pd

model = xgb.XGBRegressor(n_jobs=-1, random_state=0)

model.fit(X_train_imputed, y_train)

predictions = model.predict(X_test_imputed)

submission = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})

submission.to_csv('submission_default.csv', index=False)

Score: 16346.66286

In [5]:
#Question 2
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

n_estimators_values = [100, 500, 1000]
learning_rate_values = [0.01, 0.05, 0.1]

parameters = {'n_estimators': n_estimators_values, 'learning_rate': learning_rate_values}

model = xgb.XGBRegressor(n_jobs=-1, random_state=0)

opt = GridSearchCV(model, parameters)

opt.fit(X_train_imputed, y_train)

print("Optimum n_estimators:", opt.best_params_['n_estimators'])
print("Optimum learning_rate:", opt.best_params_['learning_rate'])

predictions = opt.predict(X_test_imputed)

submission = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})

submission.to_csv('submission_grid_search.csv', index=False)

Optimum n_estimators: 1000
Optimum learning_rate: 0.01


Optimum n_estimators: 1000
Optimum learning_rate: 0.01

Score: 14916.67171

In [6]:
#Question 3
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, loguniform

distributions = {
    'n_estimators': randint(100, 1001),
    'learning_rate': loguniform(0.01, 0.1)
}

model = xgb.XGBRegressor(n_jobs=-1, random_state=0)

opt_rand = RandomizedSearchCV(model, distributions, n_iter=9, n_jobs=-1, random_state=0)

opt_rand.fit(X_train_imputed, y_train)

print("Optimum n_estimators:", opt_rand.best_params_['n_estimators'])
print("Optimum learning_rate:", opt_rand.best_params_['learning_rate'])

predictions = opt_rand.predict(X_test_imputed)

submission = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})

submission.to_csv('submission_randomized_search.csv', index=False)

Optimum n_estimators: 586
Optimum learning_rate: 0.024179177243329457


Optimum n_estimators: 586
Optimum learning_rate: 0.024179177243329457

Score: 15011.71971

In [7]:
#Question 4
import xgboost as xgb
import pandas as pd

model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=-1, random_state=0)

model.fit(X_train_imputed, y_train)

predictions = model.predict(X_test_imputed)

submission = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})

submission.to_csv('submission_selected.csv', index=False)

Score: 14615.44802

In [8]:
#Question 5
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train_set, y_train_set, n_estimators, learning_rate):
    
    model = xgb.XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate, n_jobs=-1, random_state=0)
    
    scores = -1 * cross_val_score(model, X_train_set, y_train_set, cv=5, scoring='neg_mean_absolute_error')
    average_mae = scores.mean()
    
    return average_mae

In [9]:
# Question 6
default_mae = score_dataset(X_train_imputed, y_train, 100, 0.1)  # q1
grid_search_mae = score_dataset(X_train_imputed, y_train, opt.best_params_['n_estimators'], opt.best_params_['learning_rate'])  # q2
randomized_search_mae = score_dataset(X_train_imputed, y_train, opt_rand.best_params_['n_estimators'], opt_rand.best_params_['learning_rate'])  # q3
selected_mae = score_dataset(X_train_imputed, y_train, 1000, 0.05)  # q4

print("Average MAE with default hyper-parameters:", default_mae)
print("Average MAE with grid search hyper-parameters:", grid_search_mae)
print("Average MAE with randomized search hyper-parameters:", randomized_search_mae)
print("Average MAE with selected hyper-parameters:", selected_mae)

Average MAE with default hyper-parameters: 16590.95635166952
Average MAE with grid search hyper-parameters: 16130.95675032106
Average MAE with randomized search hyper-parameters: 16162.390344071062
Average MAE with selected hyper-parameters: 16379.611817744008


Average MAE with default hyper-parameters: 16590.95635166952
Average MAE with grid search hyper-parameters: 16162.390344071062
Average MAE with randomized search hyper-parameters: 16162.390344071062
Average MAE with selected hyper-parameters: 16379.611817744008