In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# from collections import Counter
from sklearn.pipeline import Pipeline # for making pipleine 
from sklearn.impute import SimpleImputer # for handling missing variables either categorical or numerical
from sklearn.preprocessing import OneHotEncoder # for one hot encoding categorical variables
from sklearn.metrics import mean_absolute_error # for Mean absolute error
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor # for modelling
# from xgboost.xgbclassifier import XGBClassifier

In [2]:
cols_to_del = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage',
       'GarageCond', 'GarageQual', 'GarageYrBlt', 'GarageFinish', 'GarageType',
       'BsmtCond', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtHalfBath',
        'MasVnrType', 'MasVnrArea', 'Exterior2nd', 'BsmtFinSF2',
 'BsmtFinSF1']

In [3]:
# Read the data
X_full = pd.read_csv('train.csv')
X_test_full = pd.read_csv('test.csv')

# Remove rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

# Remove columns that are not needed for the model
for col in cols_to_del:
    X_full = X_full.drop([col], axis=1)

# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, 
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 20 and 
                    X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]


# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [4]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# handle unknown is set to ignore because sometimes in test set we have variables that were not present in Training set
# and hence were not encoded while training 
# but if we use these variables while testing we will get error hence to ignore these errors we use this argument

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [5]:
# Define the model
# xgb = xgbr(random_state=42,n_estimators=2000,learning_rate=0.055) # Your code here
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)


xgb = XGBRegressor(n_estimators=2000, learning_rate=0.05, n_jobs=4)


# gb_param_grid = {'n_estimators' : [100,200,300],
#               'learning_rate': [0.1, 0.05, 0.01],
#               'max_depth': [4, 8],
#               'min_samples_leaf': [100,150],
#               'max_features': [0.3, 0.1] 
#               }
gb_param_grid_best = {'n_estimators' : [2000],
              'learning_rate': [0.05],
              'max_depth': [4],
              'min_samples_leaf': [100],
              'max_features': [0.3] 
              }

Grid_model = GridSearchCV(xgb, param_grid = gb_param_grid_best,scoring="max_error", n_jobs= 4, verbose = 1)

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model2', Grid_model)
                     ])

# Preprocessing of training data, fit model 
clf.fit(X_full[my_cols], y)
# print(Grid_model.best_params_)

# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, preds))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   44.8s finished


Parameters: { max_features, min_samples_leaf } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


MAE: 1325.466743364726


# Submission

In [6]:
preds_1 = clf.predict(X_test)
output = pd.DataFrame({'Id': X_test["Id"], 'SalePrice': preds_1})
output.to_csv('my_submission_1.csv', index=False)
result = pd.read_csv('my_submission_1.csv')
result.shape

(1459, 2)