In [55]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, precision_score, recall_score, \
    fbeta_score, roc_auc_score, precision_recall_curve, auc

In [56]:
def fill_null_with_median(df: pd.DataFrame, columns: list | tuple):
    for col in columns:
        df[col] = df[col].fillna(df[col].median())

    return df


def fill_null_with_mode(df: pd.DataFrame, columns: list | tuple):
    for col in columns:
        df[col] = df[col].fillna(df[col].mode()[0])

    return df

In [57]:
df = pd.read_csv('data/train.csv', na_values='?')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [58]:
df.shape

(1460, 81)

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [60]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [61]:
df = fill_null_with_median(df, ['LotFrontage', 'GarageYrBlt', 'MasVnrArea'])
df = fill_null_with_mode(df, ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical',
                              'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'])

y = df['SalePrice']
X = df.drop(columns=['Id', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'SalePrice'])

X.isnull().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 74, dtype: int64

In [62]:
X = pd.get_dummies(X, drop_first=True)
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,0,1,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,0,1,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,0,1,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,0,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,0,1,0,0,0,1,0


In [63]:
X = pd.get_dummies(X, columns=['MSSubClass'], drop_first=True)
X.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190
0,65.0,8450,7,5,2003,2003,196.0,706,0,150,...,1,0,0,0,0,0,0,0,0,0
1,80.0,9600,6,8,1976,1976,0.0,978,0,284,...,0,0,0,0,0,0,0,0,0,0
2,68.0,11250,7,5,2001,2002,162.0,486,0,434,...,1,0,0,0,0,0,0,0,0,0
3,60.0,9550,7,5,1915,1970,0.0,216,0,540,...,0,1,0,0,0,0,0,0,0,0
4,84.0,14260,8,5,2000,2000,350.0,655,0,490,...,1,0,0,0,0,0,0,0,0,0


In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
X_train

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190
1280,67.0,9808,7,5,2002,2002,110.0,788,0,785,...,0,0,0,0,0,0,0,0,0,0
34,60.0,7313,9,5,2005,2005,246.0,1153,0,408,...,0,0,0,0,0,0,1,0,0,0
617,59.0,7227,6,6,1954,1954,0.0,0,0,832,...,0,0,0,0,0,0,0,0,0,0
1020,60.0,7024,4,5,2005,2005,0.0,1024,0,108,...,0,0,0,0,0,0,0,0,0,0
1187,89.0,12461,8,5,1994,1995,0.0,1456,0,168,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,69.0,13837,7,5,1988,1988,178.0,1002,202,0,...,0,0,0,0,0,0,0,0,0,0
461,60.0,7200,7,9,1936,2007,0.0,350,210,0,...,0,1,0,0,0,0,0,0,0,0
1142,77.0,9965,8,5,2006,2007,340.0,1150,0,316,...,1,0,0,0,0,0,0,0,0,0
30,50.0,8500,4,4,1920,1950,0.0,0,0,649,...,0,1,0,0,0,0,0,0,0,0


# Gradient Boosting

In [65]:
gb = CatBoostRegressor(task_type='GPU')
gb.fit(X_train, y_train)
gb_predicted = gb.predict(X_test)

Learning rate set to 0.040509
0:	learn: 79497.1334584	total: 9.56ms	remaining: 9.55s
1:	learn: 77332.7297671	total: 24.7ms	remaining: 12.3s
2:	learn: 75250.3189770	total: 58.6ms	remaining: 19.5s
3:	learn: 73311.5054844	total: 66.7ms	remaining: 16.6s
4:	learn: 71417.9422131	total: 74ms	remaining: 14.7s
5:	learn: 69550.5728572	total: 81.4ms	remaining: 13.5s
6:	learn: 67763.2492178	total: 88.5ms	remaining: 12.6s
7:	learn: 66143.1368093	total: 95.8ms	remaining: 11.9s
8:	learn: 64462.1910657	total: 103ms	remaining: 11.3s
9:	learn: 62954.5907319	total: 110ms	remaining: 10.9s
10:	learn: 61447.3931625	total: 117ms	remaining: 10.5s
11:	learn: 59960.6373165	total: 124ms	remaining: 10.2s
12:	learn: 58561.3781537	total: 133ms	remaining: 10.1s
13:	learn: 57218.5667566	total: 142ms	remaining: 10s
14:	learn: 55961.4354973	total: 150ms	remaining: 9.87s
15:	learn: 54814.8576702	total: 162ms	remaining: 9.95s
16:	learn: 53623.7113940	total: 172ms	remaining: 9.94s
17:	learn: 52543.1012363	total: 180ms	rem

In [66]:
gb.score(X_test, y_test)

0.8975487442502126

# Random Forest

In [67]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
rf_predicted = rf.predict(X_test)

In [68]:
rf.score(X_test, y_test)

0.8687763948438765

# Final model

In [110]:
X.insert(loc=242, column='MSSubClass_150', value=0)

In [111]:
final_model = CatBoostRegressor(task_type='GPU')
final_model.fit(X, y)

Learning rate set to 0.042447
0:	learn: 77104.2314624	total: 8.2ms	remaining: 8.2s
1:	learn: 74844.5427537	total: 16ms	remaining: 7.98s
2:	learn: 72675.5222637	total: 23.3ms	remaining: 7.75s
3:	learn: 70614.7850410	total: 31.7ms	remaining: 7.9s
4:	learn: 68676.6318754	total: 39.6ms	remaining: 7.88s
5:	learn: 66820.5345651	total: 47ms	remaining: 7.79s
6:	learn: 64937.9672899	total: 54.7ms	remaining: 7.76s
7:	learn: 63250.0044691	total: 62.1ms	remaining: 7.71s
8:	learn: 61586.2660797	total: 69.2ms	remaining: 7.61s
9:	learn: 60086.1541528	total: 76ms	remaining: 7.53s
10:	learn: 58619.3582179	total: 83.1ms	remaining: 7.47s
11:	learn: 57108.1270251	total: 90.5ms	remaining: 7.45s
12:	learn: 55698.7367247	total: 98.3ms	remaining: 7.46s
13:	learn: 54411.5387859	total: 105ms	remaining: 7.42s
14:	learn: 53219.7320119	total: 113ms	remaining: 7.39s
15:	learn: 52079.1450099	total: 120ms	remaining: 7.36s
16:	learn: 50891.5694126	total: 127ms	remaining: 7.32s
17:	learn: 49769.5885331	total: 134ms	rem

<catboost.core.CatBoostRegressor at 0x2b036237790>

# Predict

In [124]:
df_test = pd.read_csv('data/test.csv')
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [126]:
df_test.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [128]:
X_ans = pd.DataFrame()
X_ans['Id'] = df_test['Id']

X_ans

Unnamed: 0,Id
0,1461
1,1462
2,1463
3,1464
4,1465
...,...
1454,2915
1455,2916
1456,2917
1457,2918


In [129]:
df_test = fill_null_with_median(df_test,
                                ['LotFrontage', 'GarageYrBlt', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                                 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea'])
df_test = fill_null_with_mode(df_test, ['Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'KitchenQual', 'Functional',
                                        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'SaleType',
                                        'MSZoning'])

X_predict = df_test.drop(columns=['Id', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'])

X_predict.isnull().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 74, dtype: int64

In [130]:
X_predict = pd.get_dummies(X_predict, drop_first=True)
X_predict.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,0,0,0,0,1,0,0,0,1,0
1,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,0,0,0,0,1,0,0,0,1,0
2,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,0,0,0,0,1,0,0,0,1,0
3,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,...,0,0,0,0,1,0,0,0,1,0
4,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,...,0,0,0,0,1,0,0,0,1,0


In [131]:
X_predict = pd.get_dummies(X_predict, columns=['MSSubClass'], drop_first=True)
X_predict.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190
0,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,...,0,0,0,0,0,0,0,0,0,0
1,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,...,0,0,0,0,0,0,0,0,0,0
2,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,...,0,0,0,0,0,0,0,0,0,0
3,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,324.0,...,0,0,0,0,0,0,0,0,0,0
4,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,1017.0,...,0,0,0,0,0,1,0,0,0,0


In [132]:
X

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190
0,65.0,8450,7,5,2003,2003,196.0,706,0,150,...,0,0,0,0,0,0,0,0,0,0
1,80.0,9600,6,8,1976,1976,0.0,978,0,284,...,0,0,0,0,0,0,0,0,0,0
2,68.0,11250,7,5,2001,2002,162.0,486,0,434,...,0,0,0,0,0,0,0,0,0,0
3,60.0,9550,7,5,1915,1970,0.0,216,0,540,...,1,0,0,0,0,0,0,0,0,0
4,84.0,14260,8,5,2000,2000,350.0,655,0,490,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,62.0,7917,6,5,1999,2000,0.0,0,0,953,...,0,0,0,0,0,0,0,0,0,0
1456,85.0,13175,6,6,1978,1988,119.0,790,163,589,...,0,0,0,0,0,0,0,0,0,0
1457,66.0,9042,7,9,1941,2006,0.0,275,0,877,...,1,0,0,0,0,0,0,0,0,0
1458,68.0,9717,5,6,1950,1996,0.0,49,1029,0,...,0,0,0,0,0,0,0,0,0,0


In [134]:
for i, col in enumerate(X.columns):
    if col not in X_predict.columns:
        X_predict.insert(loc=i, column=col, value=0)

In [136]:
assert (X_predict.shape[1] == X.shape[1])

In [137]:
X_ans['SalePrice'] = final_model.predict(X_predict)
X_ans

Unnamed: 0,Id,SalePrice
0,1461,120846.443462
1,1462,157330.119698
2,1463,175423.335022
3,1464,177916.057008
4,1465,191354.109689
...,...,...
1454,2915,82302.501802
1455,2916,89349.259365
1456,2917,174275.575768
1457,2918,122974.907217


In [138]:
X_ans.to_csv('predictions.csv', index=False)