In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import xgboost
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

In [42]:
# Read the training and test CSVs from ./input with a single unpacking
data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./input/train.csv', './input/test.csv')
)

#### Training and Validation Data

Separate the target from the features, then split the data into training and validation sets

In [43]:
# Target vector: the sale price column
y = data.SalePrice
# Feature frame: every column except the target
X = data.drop(['SalePrice'], axis=1)

# Hold out a validation set. random_state is pinned so that the split, and
# therefore the validation score reported further down, is reproducible when
# the notebook is re-run from a fresh kernel.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, random_state=0)

# Categorical features: object-dtype columns with fewer than 10 distinct values
# in the training split (keeps the one-hot encoded matrix small)
categorical_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].dtype == "object" and X_train_full[cname].nunique() < 10
]

# Numerical features: int64 and float64 columns.
# NOTE(review): if the data carries a row-identifier column (the submission
# cell reads test_data.Id), it is numeric and will be picked up here as a
# feature -- confirm whether that is intended.
numerical_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].dtype in ['int64', 'float64']
]

# Restrict both splits to the selected columns; .copy() keeps the *_full
# frames untouched by anything done to X_train / X_valid later
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

Pipeline for preprocessing data

In [44]:
# Numerical columns: replace missing values with the column mean
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from failing
# on categories that were not present when the encoder was fitted.
# The former fill_value="No Data" argument was removed: SimpleImputer only
# uses fill_value when strategy='constant', so it had no effect here. Switch
# to strategy='constant' if an explicit "No Data" category is what is wanted.
# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse`
# argument to `sparse_output` -- update this call when upgrading.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Route each column group to its transformer; columns listed in neither
# group are not passed to any transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

Model and Predictions

In [45]:
# Wrap the preprocessor in a pipeline so the same fitted steps can be reused
# on new data later
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the preprocessing on the training split only, then apply it to the
# validation split. The results get their own names instead of overwriting
# X_train / X_valid: the preprocessor selects columns by name, so it needs the
# original DataFrames, and overwriting them would break a re-run of this cell.
X_train_prepared = pipeline.fit_transform(X_train)
X_valid_prepared = pipeline.transform(X_valid)

model = XGBRegressor(n_estimators=1000, learning_rate=0.01)

# Stop adding trees once the score on eval_set has not improved for 5 rounds
model.set_params(early_stopping_rounds=5)
model.fit(X_train_prepared, y_train, eval_set=[(X_valid_prepared, y_valid)], verbose=False)

# NOTE: the same validation split drives early stopping and is scored below,
# so this MAE is likely somewhat optimistic compared with truly unseen data.
preds = model.predict(X_valid_prepared)

mae = mean_absolute_error(y_valid, preds)
print("Validation MAE: {:.6f}".format(mae))

Validation MAE: 16994.460595


#### Submit Predictions

In [25]:
# Apply the already-fitted preprocessing pipeline to the test data.
# Only transform() is called here, so nothing is re-fitted on the test rows.
X_test = pipeline.transform(test_data)

# Predict with the trained model on the transformed test features
test_preds = model.predict(X_test)

In [26]:
# Pair each test Id with its predicted sale price and write the table to
# submission.csv without the DataFrame index column
output = pd.DataFrame(
    data={'Id': test_data['Id'], 'SalePrice': test_preds},
)
output.to_csv(path_or_buf='submission.csv', index=False)

Comment on anything I can improve on!