In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split,cross_val_predict
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
train=pd.read_csv("../data/train.csv")
train['HouseAge']=train.YrSold-train.YearBuilt
train['AvgQuality']=(train.OverallQual+train.OverallCond)/2
train['TotalSF']=train.TotalBsmtSF+train['1stFlrSF']+train['2ndFlrSF']

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,HouseAge,AvgQuality,TotalSF
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,0,2,2008,WD,Normal,208500,5,6.0,2566
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,0,5,2007,WD,Normal,181500,31,7.0,2524
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,0,9,2008,WD,Normal,223500,7,6.0,2706
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,0,2,2006,WD,Abnorml,140000,91,6.0,2473
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,0,12,2008,WD,Normal,250000,8,6.5,3343


In [4]:
y=train.SalePrice
X=train.drop(columns=['SalePrice'])



categorical=train.select_dtypes(include=['object'])
numeric=train.select_dtypes(include=['number']).drop(columns=['SalePrice'])
numeric_column=numeric.columns
categorical_column=categorical.columns

In [5]:
categorical_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore'))
])
numeric_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
])

In [6]:
preprocessor=ColumnTransformer(transformers=[
    ('num',numeric_transformer,numeric_column),
    ('cat',categorical_transformer,categorical_column)
    
])
pipeline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('model',XGBRegressor())
])

In [7]:
pipeline.set_params(
    model__random_state=42,
    model__n_jobs=-1,
    model__n_estimators=1000,
    model__max_depth=5,
    model__subsample=0.9,
    model__colsample_bytree=0.7,
    model__reg_alpha=1,
    model__reg_lambda=0.1,
    model__learning_rate=0.05
)

In [8]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42)
pipeline.fit(X_train,y_train)
pred=pipeline.predict(X_test)
print("error",mean_absolute_error(y_test,pred))

error 15205.1259765625


In [9]:
test=pd.read_csv("../data/test.csv")
test['HouseAge']=test.YrSold-test.YearBuilt
test['AvgQuality']=(test.OverallQual+test.OverallCond)/2
test['TotalSF']=test.TotalBsmtSF+test['1stFlrSF']+test['2ndFlrSF']

In [10]:
test_pred=pipeline.predict(test)

output=pd.DataFrame({"Id": test.Id,"SalePrice": test_pred})
output.to_csv("../predictions/predictions.csv",index=False)
output

Unnamed: 0,Id,SalePrice
0,1461,132090.734375
1,1462,164711.234375
2,1463,189116.484375
3,1464,189915.062500
4,1465,204741.453125
...,...,...
1454,2915,83149.492188
1455,2916,80142.929688
1456,2917,167164.296875
1457,2918,121567.578125
