In [None]:
pip install lazypredict

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
import missingno
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, PowerTransformer, RobustScaler
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import Ridge,Lasso, LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import ExtraTreesRegressor, StackingRegressor, GradientBoostingRegressor

#from category_encoders import CatBoostEncoder
from lazypredict.Supervised import LazyRegressor

import optuna
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm



# Display All Dataset Columns

In [None]:
# No cap on the number of columns pandas renders when displaying a frame.
pd.options.display.max_columns = None

# Import Dataset

In [None]:
# Load the training split from the Kaggle input directory and preview the first rows.
train_dataset = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')
train_dataset.head()


In [None]:
# (rows, columns) of the training frame.
print(f"The shape of the training dataset is: {train_dataset.shape}")

In [None]:
# Load the test split from the Kaggle input directory.
test_dataset = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')

# Keep the Id column aside: 'Id' is dropped from the features later on, but it
# is needed again when the submission file is written.
test_Id = test_dataset['Id']

test_dataset.head()

In [None]:
# (rows, columns) of the test frame.
print(f"The shape of the test dataset is: {test_dataset.shape}")

In [None]:
# Stack the train rows on top of the test rows so that the preprocessing
# below is applied to both sets at once.
combined_data = pd.concat([train_dataset, test_dataset], axis=0)
combined_data.shape

# EDA

In [None]:
# Column dtypes and non-null counts of the training set.
train_dataset.info()

In [None]:
# Column dtypes and non-null counts of the test set.
test_dataset.info()

## Check Null Values

In [None]:
# Null counts per column. The target is removed from the training counts so
# that both Series carry the same labels before they are compared.
train_null = train_dataset.isna().sum().drop(labels='SalePrice')
test_null = test_dataset.isna().sum()

# Compare the null values in both sets ('self' = train, 'other' = test)
train_null.compare(test_null).sort_values(by='self', ascending=False)

## Sorting Missing Values By Percentage

In [None]:
# Missing values per training column, as an absolute count and as a percentage
# of the rows. The row count is taken from the frame itself instead of being
# hard-coded, so the percentages stay correct if the data changes.
missing_value = train_dataset.isna().sum().to_frame(name='Missing')
missing_value['Percentage'] = missing_value['Missing'] / len(train_dataset) * 100
missing_value = missing_value.sort_values(by='Percentage', ascending=False)

missing_value.head(20)

## Visualize Missing Values

In [None]:
# Missing-value heatmaps for the train and the test set; the trailing tuple
# is what the cell displays as its output.
train_heatmap = missingno.heatmap(df=train_dataset)
test_heatmap = missingno.heatmap(df=test_dataset)
train_heatmap, test_heatmap

## Checking Distributions

In [None]:
# Split the training columns by dtype. 'Id' and the target 'SalePrice' are
# not treated as features.
categorical_columns = list(train_dataset.select_dtypes(include='object').columns)
numerical_columns = [
    col
    for col in train_dataset.select_dtypes(include=['int64', 'float']).columns
    if col not in ('Id', 'SalePrice')
]

# A numerical column with fewer than 25 distinct values counts as discrete,
# every other numerical column as continuous.
discrete_columns = []
continuous_columns = []
for col in numerical_columns:
    is_discrete = len(train_dataset[col].unique()) < 25
    (discrete_columns if is_discrete else continuous_columns).append(col)

print(f"Number of categorical features: {len(categorical_columns)}")
print(f"Number of numerical features:   {len(numerical_columns)}")
print(f"Number of discrete features:    {len(discrete_columns)}")
print(f"Number of continuous features:  {len(continuous_columns)}")

## The Distribution (Categorical Columns)

In [None]:
# One panel per categorical column, with train (red) and test (blue) drawn on
# the same axes. The 9x5 grid is filled column by column.
fig, axes = plt.subplots(nrows=9, ncols=5, figsize=(30,30))
for position, feature in enumerate(categorical_columns):
    grid_col, grid_row = divmod(position, 9)
    panel = axes[grid_row, grid_col]
    sns.histplot(data=train_dataset, x=feature, ax=panel, color = 'red')
    sns.histplot(data=test_dataset, x=feature, ax=panel, color = 'blue')

### Continuous column

In [None]:
# One panel per continuous column, with train (dark blue) and test (gold)
# drawn on the same axes. The 4x5 grid is filled column by column.
fig, axes = plt.subplots(nrows=4, ncols=5, figsize=(25,15))
for position, feature in enumerate(continuous_columns):
    grid_col, grid_row = divmod(position, 4)
    panel = axes[grid_row, grid_col]
    sns.histplot(data=train_dataset, x=feature, ax=panel, color = 'darkblue')
    sns.histplot(data=test_dataset, x=feature, ax=panel, color = 'gold')

## Discrete Column

In [None]:
# One panel per discrete column, with train (dark blue) and test (gold)
# drawn on the same axes. The 4x5 grid is filled column by column.
fig, axes = plt.subplots(nrows=4, ncols=5, figsize=(25,15))
for position, feature in enumerate(discrete_columns):
    grid_col, grid_row = divmod(position, 4)
    panel = axes[grid_row, grid_col]
    sns.histplot(data=train_dataset, x=feature, ax=panel, color = 'darkblue')
    sns.histplot(data=test_dataset, x=feature, ax=panel, color = 'gold')

## Checking Skewness

In [None]:
# Skewness of every numerical column, computed separately on train and test.
skew_trainset = train_dataset[numerical_columns].skew().sort_values(ascending=False)
skew_testset = test_dataset[numerical_columns].skew().sort_values(ascending=False)

# Per-column mean of the two skewness values, most skewed first.
avg_skewness = skew_trainset.add(skew_testset).div(2).sort_values(ascending=False)

print(avg_skewness)

## Label Distribution

In [None]:
# Histogram of the target column in the training set.
sns.histplot(data=train_dataset, x='SalePrice')

## Correlation Comparison

In [None]:
# Independent copy of the numerical training features, so that columns can be
# added to it without touching train_dataset.
numeric_data = train_dataset[numerical_columns].copy()

# Pairwise Pearson correlation between the numerical features.
corr_data = numeric_data.corr(method='pearson')

plt.figure(figsize=(30,30))
sns.heatmap(data= corr_data, cmap='coolwarm', annot=True, fmt='.2g')

In [None]:
# CORRELATION WITH SALEPRICE

# Add the target to the numerical features, then keep only its column of the
# Pearson correlation matrix.
numeric_data['SalePrice'] = train_dataset['SalePrice']
corr_data = numeric_data.corr(method='pearson').loc[:, ['SalePrice']]

plt.figure(figsize=(7,10))
sns.heatmap(data=corr_data, cmap='coolwarm', annot=True, fmt='.2g')

In [None]:
# LINEARITY USING SCATTER PLOT

# One regression scatter plot of SalePrice against each numerical feature;
# the 7x6 grid is filled column by column.
fig , axes = plt.subplots(nrows=7, ncols=6, figsize=(37,25))

for position, feature in enumerate(numerical_columns):
    grid_col, grid_row = divmod(position, 7)
    sns.regplot(data= train_dataset, x= feature, y= 'SalePrice', ax= axes[grid_row, grid_col])

plt.show()

In [None]:
# RELATION BETWEEN YRSOLD AND SALEPRICE
# Median sale price for each year of sale, drawn as a line plot.
median_price_by_year = combined_data.groupby('YrSold')['SalePrice'].median()
year_ax = median_price_by_year.plot()
year_ax.set_xlabel('Year Sold')
year_ax.set_ylabel('House Price')
year_ax.set_title('House price vs YearSold')

In [None]:
# Three groups of columns are removed from the combined frame:
#   1. columns with many null values
#   2. columns with many zero values
#   3. columns dominated by a single label (plus the 'Id' column)
cols_with_many_null = ['PoolQC', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage']
cols_with_many_zero = ['LowQualFinSF', 'MiscVal', '3SsnPorch', 'PoolArea']
cols_with_dominant_label = ['Id','RoofMatl', 'Street', 'Condition2', 'Utilities', 'Heating']

for unwanted in (cols_with_many_null, cols_with_many_zero, cols_with_dominant_label):
    combined_data.drop(columns=unwanted, inplace=True)

combined_data.columns , len(combined_data.columns)

In [None]:
# FIND ALL DATE FEATURE
year_feature = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']

# Replace each year with the number of years between it and the year of sale.
# The columns are overwritten in place, so running this cell a second time
# would apply the subtraction again.
year_sold = combined_data['YrSold']
for feature in year_feature:
    combined_data[feature] = year_sold - combined_data[feature]

combined_data[year_feature].head(5)

In [None]:
# FILL NUMERICAL MISSING VALUES WITH ZERO VALUES
# 'SalePrice' is left out on purpose: it is missing for every test row, and
# filling it with 0 would inject fake prices into any later analysis that
# reads SalePrice from combined_data.
numerical_columns = [
    col
    for col in combined_data.select_dtypes(include=['int64', 'float64']).columns
    if col != 'SalePrice'
]
for feature in numerical_columns:
    combined_data[feature] = combined_data[feature].fillna(0)

# FILL CATEGORICAL MISSING VALUES
# Columns listed in mode_feature are filled with their most frequent value;
# every other remaining categorical column gets the placeholder 'Unknown'.
# The names must match the column names exactly (note 'MSZoning').
dropped_cols = cols_with_dominant_label + cols_with_many_null
mode_feature = ['Electrical', 'MSZoning', 'Functional', 'KitchenQual', 'Exterior2nd', 'Exterior1st', 'Utilities', 'SaleType']


for feature in categorical_columns:
    if feature in dropped_cols:
        # Already removed from combined_data, nothing to fill.
        continue
    if feature in mode_feature:
        fill_value = combined_data[feature].mode()[0]
    else:
        fill_value = 'Unknown'
    combined_data[feature] = combined_data[feature].fillna(fill_value)

combined_data.info()

In [None]:
# ADD NEW FEATURE 'GarageEfficiency': garage area divided by (car capacity + 1)
garage_divisor = combined_data['GarageCars'] + 1   # +1 TO AVOID DIVISION BY ZERO
combined_data['GarageEfficiency'] = combined_data['GarageArea'] / garage_divisor

# ADD NEW FEATURE (disabled)
#combined_data['LivabilityScore'] = combined_data['GrLivArea'] + (combined_data['FullBath'] * 2) + (combined_data['HalfBath'])

# ADD NEW FEATURE TotalArea: above-ground living area plus total basement area
combined_data['TotalArea'] = combined_data['GrLivArea'].add(combined_data['TotalBsmtSF'])

In [None]:
# CHECK CORRELATION OF NEW FEATURE
# Spearman correlation between the target, the two engineered features and
# the raw columns they were built from.
corr_columns = ['SalePrice','GarageEfficiency','GarageArea','GarageCars','TotalArea','GrLivArea','TotalBsmtSF']
new_feature = combined_data[corr_columns]

corr_new_feature = new_feature.corr(method='spearman')

sns.heatmap(data=corr_new_feature, cmap='coolwarm', annot=True, fmt='.2g')

## Dealing with Multicollinearity

This is usually done by first identifying the groups of features that are multicollinear.
Each group is then standardized (in our case with `StandardScaler`) and passed through PCA, which replaces the multicollinear features with a principal component, since multicollinearity may affect the performance of the model.


In [None]:
def first_principal_component(features):
    """Standardise `features` and project them onto their first principal component.

    Parameters
    ----------
    features : pandas.DataFrame
        The columns that should be collapsed into a single component.

    Returns
    -------
    tuple
        ``(pca, component)``: the fitted ``PCA(n_components=1)`` object and the
        projected values as a 2-D array with a single column.
    """
    standardised = StandardScaler().fit_transform(features)
    pca = PCA(n_components=1)
    return pca, pca.fit_transform(standardised)


# CHOOSE PC1 of each feature pair treated as multicollinear.
# Every result is bound to the correctly spelled multi_colinear_* name so that
# the column written below really is the principal component and not the
# standardised input.
pca_1, multi_colinear_1 = first_principal_component(combined_data[['GarageArea', 'GarageCars']])
pca_2, multi_colinear_2 = first_principal_component(combined_data[['1stFlrSF', 'TotalBsmtSF']])
pca_3, multi_colinear_3 = first_principal_component(combined_data[['GrLivArea', 'TotRmsAbvGrd']])

# ADD PC1 TO DATAFRAME
combined_data['multi_colinear_1'] = multi_colinear_1[:, 0]
combined_data['multi_colinear_2'] = multi_colinear_2[:, 0]
combined_data['multi_colinear_3'] = multi_colinear_3[:, 0]



In [None]:
# Preview the combined frame, which now includes the multi_colinear_* columns.
combined_data.head()


In [None]:
# DROP MULTICOLINEARITY FEATURE
# The raw columns behind the multi_colinear_* components are removed from
# the combined frame.
multi_corr = ['GarageCars','GarageArea','1stFlrSF', 'TotalBsmtSF','GrLivArea', 'TotRmsAbvGrd']

combined_data.drop(columns=multi_corr, inplace=True)

combined_data.head(5)

## Checking Skewness For Transformation

We have to redefine our numerical variables because some of the columns were dropped.

In [None]:
# SELECT ALL NUMERICAL DATA
numeric_frame = combined_data.select_dtypes(include=['int64', 'float64'])

# SELECT ONLY CONTINUOUS FEATURE (more than 25 distinct values)
continuous_feature = [col for col in numeric_frame if len(numeric_frame[col].unique()) > 25]

# The target is not a feature to transform. It is removed with a
# non-mutating drop, so no in-place operation runs on a slice of
# combined_data.
transform_data = combined_data[continuous_feature].drop(columns=['SalePrice'])

# CHECK SKEWNESS
skewness = transform_data.skew().sort_values(ascending=False)
print(skewness)

In [None]:
# Columns that get a Yeo-Johnson power transform.
columns_to_transform = [
    'LotArea', 'BsmtFinSF2', 'EnclosedPorch', 'ScreenPorch',
    'MasVnrArea', 'WoodDeckSF', 'multi_colinear_2',
    'TotalArea', 'BsmtFinSF1',
]

# A fresh transformer is fitted for every column; the standardised result
# overwrites the column in place.
for col in columns_to_transform:
    single_column = combined_data[[col]]
    yeo_johnson = PowerTransformer(method='yeo-johnson', standardize=True, copy=True)
    combined_data[[col]] = yeo_johnson.fit_transform(single_column)

## Visualize the columns after transformation

In [None]:
# VISUALIZE AFTER TRANSFORMATION
# One histogram per transformed column, laid out in a single row of 9 panels.
fig , axes = plt.subplots(nrows=1, ncols=9, figsize=(14,4))

for position, feature in enumerate(columns_to_transform):
    panel = axes[position % 9]
    sns.histplot(data=combined_data, x=feature, ax=panel)

plt.show()

## One Hot Encoding Categorical Columns

In [None]:
# One-hot encode the combined frame, then renumber the rows 0..n-1 and
# discard the old index.
encoded_data = pd.get_dummies(combined_data)
combined_data = encoded_data.reset_index(drop=True)

combined_data

In [None]:
# The first len(train_dataset) rows of the combined frame are the training
# rows; everything after them is the test set.
n_train_rows = len(train_dataset)
new_train_data = combined_data.iloc[:n_train_rows]
new_test_data  = combined_data.iloc[n_train_rows:]

# Feature matrices: everything except the target column.
x_train = new_train_data.drop(columns='SalePrice')
x_test = new_test_data.drop(columns='SalePrice')

# LOG TRANSFORMATION FOR TARGET FEATURE (SalePrice)
y_train = np.log1p(new_train_data['SalePrice'])

x_train.shape , y_train.shape, x_test.shape

In [None]:
# SELECT ALL NUMERICAL FEATURE
numerical_feature = [
    'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '2ndFlrSF', 'BsmtFullBath',
    'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces',
    'GarageYrBlt', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch',
    'MoSold', 'YrSold', 'GarageEfficiency', 'TotalArea',
    'multi_colinear_1', 'multi_colinear_2', 'multi_colinear_3',
]

# Two 5x6 figures over the training features: histograms first (distribution),
# then boxplots (outliers). Each grid is filled column by column.
for draw_panel in (sns.histplot, sns.boxplot):
    fig, axes = plt.subplots(nrows=5, ncols=6, figsize=(25,20))
    for position, feature in enumerate(numerical_feature):
        grid_col, grid_row = divmod(position, 5)
        draw_panel(data=x_train, x=feature, ax=axes[grid_row, grid_col])

plt.show()

We will perform robust scaling on features that have many outliers and are not normally distributed.


In [None]:
# CHOOSE COLUMNS TO NORMALIZE
cols_to_robust = ['MSSubClass','YearRemodAdd', '2ndFlrSF','BedroomAbvGr','OpenPorchSF','MasVnrArea','EnclosedPorch','BsmtFinSF1','ScreenPorch','BsmtFinSF2','GarageYrBlt','YearBuilt','WoodDeckSF']
cols_to_zscore = ['GarageEfficiency','LotArea','TotalArea','OverallQual','multi_colinear_1','OverallCond','multi_colinear_2','BsmtUnfSF','multi_colinear_3']

# ROBUST SCALING NORM / ZSCORE NORM
# Each scaler is fitted on the training rows only and then applied to both
# the training and the test features.
robust = RobustScaler()
zscore = StandardScaler()

for fitted_on_train, cols in ((robust, cols_to_robust), (zscore, cols_to_zscore)):
    x_train[cols] = fitted_on_train.fit_transform(x_train[cols])
    x_test[cols]  = fitted_on_train.transform(x_test[cols])

x_train.shape , x_test.shape

# Feature Selection

# Defining Hyperparameters

In [None]:
def objective(trial):
    """Score one Optuna trial by cross-validating an XGBoost and a CatBoost regressor.

    Hyperparameters for both models are sampled from ``trial`` under the
    ``xgb_*`` and ``cat_*`` names. Each model is evaluated on the notebook-level
    ``x_train`` / ``y_train`` with shuffled 5-fold cross-validation, scored as
    negative mean squared error.

    Returns the smaller of the two mean scores, i.e. the score of the model
    that did worse in this trial.
    """
    xgb = XGBRegressor(
        objective='reg:squarederror',
        learning_rate=trial.suggest_float("xgb_learning_rate", 0.0001, 0.1, log=True),
        max_depth=trial.suggest_int("xgb_max_depth", 3, 12),
        subsample=trial.suggest_float("xgb_subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("xgb_colsample_bytree", 0.5, 1.0),
        n_estimators=trial.suggest_int("xgb_n_estimators", 50, 300),
    )

    cat = CatBoostRegressor(
        loss_function='RMSE',
        verbose=0,
        learning_rate=trial.suggest_float("cat_learning_rate", 0.0001, 0.1, log=True),
        depth=trial.suggest_int("cat_depth", 3, 10),
        iterations=trial.suggest_int("cat_iterations", 100, 500),
        l2_leaf_reg=trial.suggest_float("cat_l2_leaf_reg", 0.0001, 0.1, log=True),
        subsample=trial.suggest_float("cat_subsample", 0.5, 1.0),
        random_strength=trial.suggest_float("cat_random_strength", 0.0001, 0.1),
    )

    # Both models are evaluated on the same fixed folds.
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    mean_scores = [
        cross_val_score(model, x_train, y_train, cv=folds, scoring='neg_mean_squared_error').mean()
        for model in (xgb, cat)
    ]

    return min(mean_scores)

# Running Optuna

In [None]:
# The objective returns a negative mean squared error, so the study maximises
# it (values closer to zero are better). 30 trials are run.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# Parameter values of the best trial, keyed by the names used in `objective`.
best_params = study.best_params
print(best_params)

# Training The Model

In [None]:
def _best_params_with_prefix(prefix):
    """Return the best-trial parameters whose name starts with `prefix`, with the prefix stripped."""
    return {
        name[len(prefix):]: value
        for name, value in study.best_params.items()
        if name.startswith(prefix)
    }


# The study stores the tuned values as 'xgb_<param>' / 'cat_<param>'.
# Stripping the prefix gives the keyword arguments of each model.
best_xgb_params = _best_params_with_prefix('xgb_')
best_cat_params = _best_params_with_prefix('cat_')

xgb_2 = XGBRegressor(objective='reg:squarederror', **best_xgb_params)
cat_2 = CatBoostRegressor(loss_function='RMSE', verbose=0, **best_cat_params)

# The final model averages the predictions of the two tuned regressors.
voting_regressor = VotingRegressor(estimators=[('cat', cat_2), ('xgb', xgb_2)])

pipeline = Pipeline(steps=[('voting_regressor', voting_regressor)])

pipeline.fit(x_train, y_train)


# Making Predictions

In [None]:
# The model was trained on np.log1p(SalePrice), so its raw predictions are on
# the log scale. np.expm1 is the exact inverse of np.log1p and brings them
# back to prices before they are written out.
y_pred = np.expm1(pipeline.predict(x_test))

# The test set has no SalePrice labels, so no error metrics can be computed
# here. The predictions are written out in the Id / SalePrice layout.
result = pd.DataFrame({'Id': test_Id, 'SalePrice': y_pred})
result.to_csv('submission.csv', index=False)