In [1]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import skew
from scipy.special import boxcox1p
from pycaret.regression import setup, compare_models
import optuna

In [2]:
train_df = pd.read_csv(filepath_or_buffer="/home/gabriel/Documents/house_project/train.csv", sep=",")
test_df = pd.read_csv(filepath_or_buffer="/home/gabriel/Documents/house_project/test.csv", sep=",")

In [3]:
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
train_df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# Cleaning

## Proper data type

### According to the data description, the MSSubClass feature is categorical, but the info indicates that it is an integer data type. Therefore, there seems to be a discrepancy between the information provided in the data description and the information provided in the info. So:

In [6]:
train_df["MSSubClass"] = train_df["MSSubClass"].astype(dtype=str)

### Test set

In [7]:
test_df["MSSubClass"] = test_df["MSSubClass"].astype(dtype=str)

## Handling missing values (Categorical)

### If you look at the data description, some features have missing values indicated as NA. Therefore, for those features, I will fill the missing values with NA. For all other features, I will fill missing values with the mode.

In [8]:
na_objects = train_df.select_dtypes(include=[object]).loc[:, train_df.isna().sum() != 0].columns

In [9]:
for feature in na_objects:
    mode = train_df[feature].mode()[0]
    if feature not in ['MasVnrType', 'Electrical']:
        train_df[feature] = train_df[feature].fillna("NA")
    else:
        train_df[feature] = train_df[feature].fillna(mode)

### Test Set

In [10]:
na_objects_test = test_df.select_dtypes(include=[object]).loc[:, test_df.isna().sum() != 0].columns

In [11]:
for feature in na_objects_test:
    mode = test_df[feature].mode()[0]
    if feature not in ['MasVnrType', 'Electrical']:
        test_df[feature] = test_df[feature].fillna("NA")
    else:
        test_df[feature] = test_df[feature].fillna(mode)

## Handling missing values (Numerical)

### I will be using KNNImputer to fill missing values in numerical features.

In [12]:
knn = KNNImputer()

In [13]:
na_number = train_df.select_dtypes(include=[np.number]).loc[:, train_df.isna().sum() != 0].columns

In [14]:
train_df[na_number] = pd.DataFrame(
    data=knn.fit_transform(train_df[na_number]),
    columns=na_number
                                  )

In [15]:
train_df.isna().sum().sum()

0

### Test Set

In [16]:
na_number_test = test_df.select_dtypes(include=[np.number]).loc[:, test_df.isna().sum() != 0].columns

In [17]:
test_df[na_number_test] = pd.DataFrame(
    data=knn.fit_transform(test_df[na_number_test]),
    columns=na_number_test
                                  )

In [18]:
test_df.isna().sum().sum()

0

# Feature Engineering

In [19]:
train_df["SqFtPerRoom"] = train_df["GrLivArea"] / (train_df["TotRmsAbvGrd"] +
                                                   train_df["FullBath"] +
                                                   train_df["HalfBath"] +
                                                   train_df["KitchenAbvGr"])

train_df['Total_Home_Quality'] = train_df['OverallQual'] + train_df['OverallCond']

train_df['Total_Bathrooms'] = (train_df['FullBath'] + (0.5 * train_df['HalfBath']) +
                               train_df['BsmtFullBath'] + (0.5 * train_df['BsmtHalfBath']))

train_df["HighQualSF"] = train_df["1stFlrSF"] + train_df["2ndFlrSF"]

### Test Set

In [20]:
test_df["SqFtPerRoom"] = test_df["GrLivArea"] / (test_df["TotRmsAbvGrd"] +
                                                 test_df["FullBath"] +
                                                 test_df["HalfBath"] +
                                                 test_df["KitchenAbvGr"])

test_df['Total_Home_Quality'] = test_df['OverallQual'] + test_df['OverallCond']

test_df['Total_Bathrooms'] = (test_df['FullBath'] + (0.5 * test_df['HalfBath']) +
                              test_df['BsmtFullBath'] + (0.5 * test_df['BsmtHalfBath']))

test_df["HighQualSF"] = test_df["1stFlrSF"] + test_df["2ndFlrSF"]

# Feature Transformations

### Log Transformation

In [21]:
train_df = train_df.drop('Id', axis=1)

In [22]:
skewness = train_df.select_dtypes(include=[np.number]).apply(lambda x: skew(x)).sort_values(ascending=False)
skew_df = pd.DataFrame({
    'Skewness': skewness
})
skew_df

Unnamed: 0,Skewness
MiscVal,24.45164
PoolArea,14.813135
LotArea,12.195142
3SsnPorch,10.293752
LowQualFinSF,9.00208
KitchenAbvGr,4.483784
BsmtFinSF2,4.250888
ScreenPorch,4.117977
BsmtHalfBath,4.099186
EnclosedPorch,3.086696


In [23]:
skew_05 = skew_df[np.abs(skew_df["Skewness"]) > 0.5].index

In [24]:
train_df[skew_05] = np.log1p(train_df[skew_05])

### Test Set

In [25]:
test_Id = test_df["Id"]
test_df = test_df.drop('Id', axis=1)

In [26]:
skewness_test = test_df.select_dtypes(include=[np.number]).apply(lambda x: skew(x)).sort_values(ascending=False)
skew_df_test = pd.DataFrame({
    'Skewness': skewness_test
})
skew_df_test

Unnamed: 0,Skewness
PoolArea,20.176117
MiscVal,20.054543
LowQualFinSF,16.150628
3SsnPorch,12.511336
EnclosedPorch,4.664371
KitchenAbvGr,4.07486
BsmtFinSF2,4.038796
ScreenPorch,3.784349
BsmtHalfBath,3.779085
LotArea,3.112013


In [27]:
skew_05_test = skew_df_test[np.abs(skew_df_test["Skewness"]) > 0.5].index

In [28]:
test_df[skew_05_test] = np.log1p(test_df[skew_05_test])

In [29]:
np.log1p(train_df[skew_05])

Unnamed: 0,MiscVal,PoolArea,LotArea,3SsnPorch,LowQualFinSF,KitchenAbvGr,BsmtFinSF2,ScreenPorch,BsmtHalfBath,EnclosedPorch,...,BsmtUnfSF,2ndFlrSF,OverallCond,TotRmsAbvGrd,HalfBath,Fireplaces,BsmtFullBath,YearRemodAdd,GarageYrBlt,YearBuilt
0,0.000000,0.0,2.306780,0.0,0.0,0.526589,0.000000,0.0,0.000000,0.000000,...,1.794635,2.047835,1.026672,1.162283,0.526589,0.000000,0.526589,2.152099,2.152099,2.152099
1,0.000000,0.0,2.319405,0.0,0.0,0.526589,0.000000,0.0,0.526589,0.000000,...,1.894991,0.000000,1.162283,1.080418,0.000000,0.526589,0.000000,2.150521,2.150521,2.150521
2,0.000000,0.0,2.334879,0.0,0.0,0.526589,0.000000,0.0,0.000000,0.000000,...,1.956616,2.049631,1.026672,1.080418,0.526589,0.526589,0.526589,2.152041,2.151983,2.151983
3,0.000000,0.0,2.318892,0.0,0.0,0.526589,0.000000,0.0,0.000000,1.888504,...,1.986972,2.032004,1.026672,1.124748,0.000000,0.526589,0.526589,2.150167,2.151809,2.146866
4,0.000000,0.0,2.357574,0.0,0.0,0.526589,0.000000,0.0,0.000000,0.000000,...,1.973587,2.074473,1.026672,1.194706,0.526589,0.526589,0.526589,2.151925,2.151925,2.151925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.000000,0.0,2.300272,0.0,0.0,0.526589,0.000000,0.0,0.000000,0.000000,...,2.061871,2.020741,1.026672,1.124748,0.526589,0.526589,0.000000,2.151925,2.151867,2.151867
1456,0.000000,0.0,2.350056,0.0,0.0,0.526589,1.808267,0.0,0.000000,0.000000,...,1.998790,0.000000,1.080418,1.124748,0.000000,0.741276,0.526589,2.151226,2.150639,2.150639
1457,2.177526,0.0,2.313500,0.0,0.0,0.526589,0.000000,0.0,0.000000,0.000000,...,2.051254,2.085687,1.194706,1.194706,0.000000,0.741276,0.000000,2.152273,2.148440,2.148440
1458,0.000000,0.0,2.320595,0.0,0.0,0.526589,2.071575,0.0,0.000000,1.745260,...,0.000000,0.000000,1.080418,1.026672,0.000000,0.000000,0.526589,2.151693,2.148979,2.148979


### Cyclical Feature

In [30]:
train_df["MoSold"] = - np.cos(0.5236 * train_df["MoSold"])

### Test Set

In [31]:
test_df["MoSold"] = np.cos(0.5236 * test_df["MoSold"])

# Handling Categorial Features

### Ordinal Encoder

In [32]:
ordinal_enc = OrdinalEncoder()
object_features = train_df.select_dtypes(include=[object]).columns
train_df[object_features] = pd.DataFrame(
    ordinal_enc.fit_transform(train_df[object_features]),
    index=train_df.index,
    columns=object_features)

### Test Set

In [33]:
object_features_test = test_df.select_dtypes(include=[object]).columns
test_df[object_features_test] = pd.DataFrame(
    ordinal_enc.fit_transform(test_df[object_features_test]),
    index=test_df.index,
    columns=object_features_test)

# Scaling

In [34]:
standard = StandardScaler()
col_scale = train_df.columns[train_df.columns != "SalePrice"]
train_df[col_scale] = pd.DataFrame(
    standard.fit_transform(train_df[col_scale]),
    index=train_df.index,
    columns=col_scale)

### Test Set

In [35]:
col_scale_test = test_df.columns
test_df[col_scale_test] = pd.DataFrame(
    standard.fit_transform(test_df[col_scale_test]),
    index=test_df.index,
    columns=col_scale_test)

# Model Selection

In [36]:
# _ = setup(data=train_df, target="SalePrice")

In [37]:
# compare_models()

# Hyperparameter Optimization

In [38]:
# def objective(trial):
#     learning_rate = trial.suggest_loguniform(name="learning_rate", low=1e-3, high=10.0)
#     n_estimators = trial.suggest_int(name="n_estimators", low=10, high=500)
#     tol = trial.suggest_loguniform(name="tol", low=1e-7, high=10.0)
#     max_depth = trial.suggest_int(name="max_depth", low=1, high=100)
#     max_leaf_nodes = trial.suggest_int(name="max_leaf_nodes", low=2, high=100)
#     min_samples_leaf = trial.suggest_int(name="min_samples_leaf", low=1, high=100)
#     min_samples_split = trial.suggest_int(name="min_samples_split", low=2, high=100)
    
#     model = GradientBoostingRegressor(
#         learning_rate=learning_rate,
#         n_estimators=n_estimators,
#         tol=tol,
#         max_depth=max_depth,
#         max_leaf_nodes=max_leaf_nodes,
#         min_samples_leaf=min_samples_leaf,
#         min_samples_split=min_samples_split
#     )
    
#     model.fit(X=train_df.drop("SalePrice", inplace=False, axis=1), y=train_df["SalePrice"])
#     cv_scores = np.exp(np.sqrt(-cross_val_score(
#         estimator=model,
#         X=train_df.drop("SalePrice", inplace=False, axis=1),
#         y=train_df["SalePrice"],
#         scoring="neg_mean_squared_error",
#         cv=KFold(n_splits=10))))
    
#     return np.mean(cv_scores)

In [39]:
# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=100)

### Gradient Boosting Regressor

In [40]:
#best hyperparameters
gradient_boosting_regressor_params  = {'learning_rate': 0.1463418400663449,
                                       'n_estimators': 396,
                                       'tol': 4.765131744581483,
                                       'max_depth': 65,
                                       'max_leaf_nodes': 4,
                                       'min_samples_leaf': 8,
                                       'min_samples_split': 16}

In [41]:
gradient_regressor = GradientBoostingRegressor(**gradient_boosting_regressor_params, verbose=0)
gradient_regressor.fit(X=train_df.drop("SalePrice", axis=1), y=train_df["SalePrice"])

In [42]:
#score on the training set
gradient_regressor.score(train_df.drop("SalePrice", axis=1), train_df["SalePrice"])

0.9763884006113305

In [43]:
predictions = pd.DataFrame({"Id": test_Id,
                            "SalePrice": np.exp(gradient_regressor.predict(test_df)) - 1})
predictions

Unnamed: 0,Id,SalePrice
0,1461,124747.536661
1,1462,158914.235161
2,1463,180034.849163
3,1464,190794.334649
4,1465,175109.545622
...,...,...
1454,2915,85290.827766
1455,2916,76675.518641
1456,2917,160499.777509
1457,2918,119775.529790


In [44]:
# Submission

predictions.to_csv("/home/gabriel/Documents/house_project/submission.csv", index=False)