In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split

<br>
<br>
<br>

### Data collection and inspection

In [2]:
# read the preprocessed training feature matrix and preview its first five rows
train_features_path = "./data/x_train_preprocessed.csv"
X_train = pd.read_csv(train_features_path)
X_train.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedIncCat,AveOccupCat,LatitudeCat,LongitudeCat,AveHouses,RatioBedsRms,AvePeopBeds
0,-0.538619,-0.555556,-0.827041,0.839954,-0.113938,-0.352808,0.108753,-0.441799,0.0,0.0,0.5,-0.5,0.055483,1.234035,-0.482183
1,0.096882,-0.444444,0.62939,-0.004795,0.13385,1.461727,1.193634,-0.732804,0.0,0.0,1.0,-0.5,-0.281656,-0.49729,1.297043
2,0.600458,0.111111,0.977295,0.709701,-0.356195,-0.383441,0.137931,-0.529101,0.0,0.0,0.5,-0.5,-0.180543,-0.595182,-0.485019
3,-0.525317,-0.5,-1.144784,-1.300533,-0.141593,-0.896998,-0.405836,0.359788,0.0,0.0,-0.5,0.0,0.296617,1.197841,-0.575233
4,0.111757,1.5,-0.52243,0.214548,0.45354,-1.559947,0.95756,-1.047619,0.0,0.0,1.0,-1.0,1.745007,0.618013,-1.429912


In [3]:
# report the frame's dimensions, then whether any single cell is missing
print(X_train.shape)
train_has_missing = X_train.isna().to_numpy().any()
print(train_has_missing)

(37137, 15)
False


<br>

In [4]:
# read the preprocessed test feature matrix and preview its first five rows
test_features_path = "./data/x_test_preprocessed.csv"
X_test = pd.read_csv(test_features_path)
X_test.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedIncCat,AveOccupCat,LatitudeCat,LongitudeCat,AveHouses,RatioBedsRms,AvePeopBeds
0,-0.862687,0.555556,-0.068113,0.612862,-0.071903,0.136263,1.474801,-0.899471,0.0,0.0,1.5,-1.0,-0.088386,0.185828,-0.021245
1,-1.014303,-0.166667,-0.587296,0.637519,1.009956,0.595758,-0.06366,0.042328,0.0,0.0,0.0,0.0,0.63665,0.809149,0.367648
2,2.004434,-0.222222,1.3729,-1.390376,0.168142,0.197123,-0.153846,0.169312,0.0,0.0,0.0,0.0,0.102214,-1.10951,0.516822
3,0.55545,1.333333,-0.199133,-0.088449,-0.747788,-1.370024,-0.005305,0.029101,0.0,0.0,0.0,0.0,-0.250664,0.171246,-1.221021
4,-0.18461,0.0,-0.868247,0.395314,3.686947,-1.013934,-0.005305,0.042328,0.0,0.0,0.0,0.0,5.211012,1.184199,-0.976649


In [5]:
# report the frame's dimensions, then whether any single cell is missing
print(X_test.shape)
test_has_missing = X_test.isna().to_numpy().any()
print(test_has_missing)

(24759, 15)
False


<br>

In [6]:
# read the preprocessed regression target and preview its first five rows
target_path = "./data/y_preprocessed.csv"
y = pd.read_csv(target_path)
y.head(n=5)

Unnamed: 0,MedHouseVal
0,0.98
1,0.946
2,1.576
3,1.336
4,4.5


In [7]:
# report the frame's dimensions, then whether any single cell is missing
print(y.shape)
target_has_missing = y.isna().to_numpy().any()
print(target_has_missing)

(37137, 1)
False


<br>

Creating a validation set (a 10% hold-out taken from the training data)

In [8]:
# hold out 10% of the labelled rows as a validation split; the fixed seed
# keeps the shuffled partition reproducible across runs
x_train, x_val, y_train, y_val = train_test_split(
    X_train,
    y,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

In [9]:
# print the dimensions of each piece of the split, in the order:
# training features, validation features, training target, validation target
for split_part in (x_train, x_val, y_train, y_val):
    print(split_part.shape)

(33423, 15)
(3714, 15)
(33423, 1)
(3714, 1)


<br>
<br>
<br>

### Model Selection

In [100]:
from lightgbm import LGBMRegressor

from sklearn.model_selection import GridSearchCV

In [102]:
# base LightGBM regressor: seeded for reproducibility, n_jobs=-1 for threading
regressor = LGBMRegressor(
    n_jobs=-1,
    random_state=42,
)

In [103]:
# hyper-parameter grid explored by the search (4 x 3 x 3 = 36 combinations)
parameters = dict(
    n_estimators=list(range(100, 401, 100)),
    max_depth=[8, 10, 12],
    learning_rate=[0.05, 0.1, 0.15],
)

In [104]:
# exhaustive search over `parameters`, scored by negative MSE with 5-fold CV
# NOTE(review): `regressor` was itself created with n_jobs=-1; combined with
# n_jobs=-1 here this may oversubscribe CPU cores — confirm before scaling up
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)

In [105]:
# run the search on the training split; the single-column target frame is
# flattened to a 1-D array before being passed as the label vector
train_target = y_train.to_numpy().ravel()
grid_search.fit(x_train, train_target)

In [106]:
# RMSE implied by the best cross-validated score: the search maximises
# negative MSE, so negate it back to an MSE and take the square root
best_cv_mse = -grid_search.best_score_
np.sqrt(best_cv_mse)

0.5653126711559375

In [107]:
# parameter combination from `parameters` that achieved the best search score
grid_search.best_params_

{'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 400}

<br>
<br>
<br>

### Modelling

In [109]:
# keep a handle on the winning estimator from the grid search
# NOTE: this is an alias of grid_search.best_estimator_, not a copy, so any
# later call to best_estimator.fit(...) retrains that same object in place
best_estimator = grid_search.best_estimator_

In [112]:
# retrain the selected model on every labelled row (i.e. the full data that
# was earlier divided into the training and validation splits)
full_target = y.to_numpy().ravel()
best_estimator.fit(X_train, full_target)

In [113]:
# predict the target for every row of the test features with the retrained model
predictions = best_estimator.predict(X_test)

In [120]:
# inspecting the first 10 predictions (in test-row order, not sorted by value)
predictions[:10]

array([0.71761865, 1.01103629, 3.98384589, 3.43360796, 2.52148774,
       3.15696417, 1.40190591, 1.91462287, 0.85577604, 0.72102273])

In [115]:
# checking the shape of the prediction array (one value per test row expected)
predictions.shape

(24759,)

<br>
<br>
<br>

### Making submission

In [116]:
# start the submission from the provided sample file (id + placeholder target)
# and preview its first five rows
sample_submission_path = "./data/sample_submission.csv"
submission_df = pd.read_csv(sample_submission_path)
submission_df.head(n=5)

Unnamed: 0,id,MedHouseVal
0,37137,2.079751
1,37138,2.079751
2,37139,2.079751
3,37140,2.079751
4,37141,2.079751


In [117]:
# verifying shape: the row count should equal the number of predictions
submission_df.shape

(24759, 2)

In [118]:
# replace the placeholder values with the actual predictions.
# Bracket assignment is used instead of attribute assignment
# (`submission_df.MedHouseVal = ...`): attribute assignment only updates a
# column when that column already exists and otherwise silently creates a plain
# Python attribute, so it is not a reliable way to write a column.
assert len(predictions) == len(submission_df), (
    "number of predictions must match the number of submission rows"
)
submission_df["MedHouseVal"] = predictions

In [119]:
# inspecting the final dataframe: MedHouseVal should now hold the predictions
submission_df.head()

Unnamed: 0,id,MedHouseVal
0,37137,0.717619
1,37138,1.011036
2,37139,3.983846
3,37140,3.433608
4,37141,2.521488


In [121]:
# save the submission as a CSV file without the row index.
# `index` is documented as a bool; the previous `index=None` only worked
# because None is falsy, so the intent is now stated explicitly.
submission_df.to_csv("./data/submission_1.csv", index=False)