In [1]:
#!pip install catboost

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# Load the monthly London housing table.
# NOTE(review): this is an absolute path on one specific Windows machine; the
# notebook will not run elsewhere until it points at a local copy of the CSV.
csv_path = "C:/Users/Ashraf/Documents/Extra-folder/housing_in_london_monthly_variables.csv"
data = pd.read_csv(csv_path)

In [4]:
# Display the raw frame to eyeball its columns and a few sample rows.
data

Unnamed: 0,date,area,average_price,code,houses_sold,no_of_crimes,borough_flag
0,1995-01-01,city of london,91449,E09000001,17.0,,1
1,1995-02-01,city of london,82203,E09000001,7.0,,1
2,1995-03-01,city of london,79121,E09000001,14.0,,1
3,1995-04-01,city of london,77101,E09000001,7.0,,1
4,1995-05-01,city of london,84409,E09000001,10.0,,1
...,...,...,...,...,...,...,...
13544,2019-09-01,england,249942,E92000001,64605.0,,0
13545,2019-10-01,england,249376,E92000001,68677.0,,0
13546,2019-11-01,england,248515,E92000001,67814.0,,0
13547,2019-12-01,england,250410,E92000001,,,0


In [5]:
# Summarise column dtypes and non-null counts to spot incomplete columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13549 entries, 0 to 13548
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           13549 non-null  object 
 1   area           13549 non-null  object 
 2   average_price  13549 non-null  int64  
 3   code           13549 non-null  object 
 4   houses_sold    13455 non-null  float64
 5   no_of_crimes   7439 non-null   float64
 6   borough_flag   13549 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 741.1+ KB


In [6]:
# Absolute number of missing values per column.
data.isna().sum()

date                0
area                0
average_price       0
code                0
houses_sold        94
no_of_crimes     6110
borough_flag        0
dtype: int64

In [7]:
# Fraction of missing values per column (mean of the boolean NaN mask).
data.isna().mean()

date             0.000000
area             0.000000
average_price    0.000000
code             0.000000
houses_sold      0.006938
no_of_crimes     0.450956
borough_flag     0.000000
dtype: float64

## Preprocessing

In [8]:
def preprocess_inputs(df, train_size=0.7, random_state=42):
    """Turn the raw monthly housing frame into scaled train/test splits.

    The target is ``houses_sold``; every other remaining column becomes a
    feature. The input frame is not modified.

    Processing steps:
      1. Drop ``code`` and ``no_of_crimes``.
      2. Discard rows whose target ``houses_sold`` is missing.
      3. Replace ``date`` with numeric ``year`` and ``month`` columns.
      4. One-hot encode ``area`` into ``area_<name>`` columns.
      5. Shuffle and split into train/test sets.
      6. Standardise the features with a ``StandardScaler`` fitted on the
         training split only, so no test-set statistics leak into training.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns ``date``, ``area``, ``code``,
        ``no_of_crimes`` and ``houses_sold``.
    train_size : float, default 0.7
        Proportion of rows assigned to the training split.
    random_state : int, default 42
        Seed for the shuffled split.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardised feature matrices that keep their row index and column
        names.
    y_train, y_test : pandas.Series
        The matching ``houses_sold`` targets (unscaled).
    """
    # `drop` returns a new frame, so the caller's data is left untouched.
    # `code` is dropped as redundant; `no_of_crimes` has too many missing values.
    df = df.drop(columns=['code', 'no_of_crimes'])

    # Rows without a target cannot be used for supervised training.
    df = df.dropna(subset=['houses_sold']).reset_index(drop=True)

    # Extract date features with the vectorised datetime accessor.
    dates = pd.to_datetime(df['date'])
    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    df = df.drop(columns='date')

    # One-hot encode the area column; the dummy columns are appended after the
    # remaining columns and the original `area` column is removed.
    df = pd.get_dummies(df, columns=['area'], prefix='area')

    # Split df into X and y
    y = df['houses_sold']
    X = df.drop(columns='houses_sold')

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Scale X: fit on the training split only, then apply to both splits.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [9]:
# Build the scaled train/test feature matrices and their targets from the raw frame.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [10]:
#X[X['houses_sold'].isna()].index

In [11]:
# Inspect the training features returned by preprocess_inputs.
X_train

Unnamed: 0,average_price,borough_flag,year,month,area_barking and dagenham,area_barnet,area_bexley,area_brent,area_bromley,area_camden,...,area_south east,area_south west,area_southwark,area_sutton,area_tower hamlets,area_waltham forest,area_wandsworth,area_west midlands,area_westminster,area_yorks and the humber
2185,-0.530151,0.607695,-0.699471,0.734296,-0.149540,-0.153210,-0.149168,-0.148796,-0.148423,-0.153572,...,-0.155374,-0.15028,-0.146921,-0.148049,-0.146165,-0.152117,-0.150649,-0.15028,-0.14954,-0.14991
381,-0.892681,0.607695,-0.838260,1.314999,6.687185,-0.153210,-0.149168,-0.148796,-0.148423,-0.153572,...,-0.155374,-0.15028,-0.146921,-0.148049,-0.146165,-0.152117,-0.150649,-0.15028,-0.14954,-0.14991
1890,0.314365,0.607695,-0.560682,-1.588518,-0.149540,-0.153210,-0.149168,-0.148796,-0.148423,6.511599,...,-0.155374,-0.15028,-0.146921,-0.148049,-0.146165,-0.152117,-0.150649,-0.15028,-0.14954,-0.14991
1155,0.329653,0.607695,1.243575,0.153592,-0.149540,-0.153210,6.703839,-0.148796,-0.148423,-0.153572,...,-0.155374,-0.15028,-0.146921,-0.148049,-0.146165,-0.152117,-0.150649,-0.15028,-0.14954,-0.14991
4905,-0.219596,0.607695,-0.421893,1.605351,-0.149540,-0.153210,-0.149168,-0.148796,-0.148423,-0.153572,...,-0.155374,-0.15028,-0.146921,-0.148049,-0.146165,-0.152117,-0.150649,-0.15028,-0.14954,-0.14991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,-0.323344,0.607695,-0.560682,1.314999,-0.149540,-0.153210,-0.149168,-0.148796,-0.148423,-0.153572,...,-0.155374,-0.15028,-0.146921,-0.148049,-0.146165,-0.152117,-0.150649,-0.15028,-0.14954,-0.14991
13418,-0.172826,-1.645562,1.243575,1.314999,-0.149540,-0.153210,-0.149168,-0.148796,-0.148423,-0.153572,...,-0.155374,-0.15028,-0.146921,-0.148049,-0.146165,-0.152117,-0.150649,-0.15028,-0.14954,-0.14991
5390,-0.876391,0.607695,-1.670994,0.153592,-0.149540,-0.153210,-0.149168,-0.148796,-0.148423,-0.153572,...,-0.155374,-0.15028,-0.146921,-0.148049,-0.146165,-0.152117,-0.150649,-0.15028,-0.14954,-0.14991
860,1.400585,0.607695,1.243575,1.314999,-0.149540,6.527009,-0.149168,-0.148796,-0.148423,-0.153572,...,-0.155374,-0.15028,-0.146921,-0.148049,-0.146165,-0.152117,-0.150649,-0.15028,-0.14954,-0.14991


In [12]:
# Inspect the training targets (houses_sold).
y_train

2185       694.0
381        282.0
1890       265.0
1155       303.0
4905       349.0
          ...   
5191       478.0
13418    74097.0
5390       186.0
860        376.0
7270       435.0
Name: houses_sold, Length: 9418, dtype: float64

## Training

In [13]:
# Seed for the scikit-learn estimators whose fitting involves randomness, so
# that a fresh Restart & Run All reproduces the same scores.
SEED = 42

# Candidate regressors, all with default hyperparameters.
# Keys are left-padded to a common width so the per-model report lines align.
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(random_state=SEED),
    "Support Vector Machine (Linear Kernel)": LinearSVR(random_state=SEED),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(random_state=SEED),
    "                         Random Forest": RandomForestRegressor(random_state=SEED),
    "                     Gradient Boosting": GradientBoostingRegressor(random_state=SEED),
    "                               XGBoost": XGBRegressor(),
    "                              LightGBM": LGBMRegressor(),
    "                              CatBoost": CatBoostRegressor(verbose=0)
}

# Fit every candidate on the same training split and report progress.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                     Linear Regressiontrained.
 Linear Regression (L2 Regularization)trained.
 Linear Regression (L1 Regularization)trained.
                   K-Nearest Neighborstrained.
                        Neural Networktrained.
Support Vector Machine (Linear Kernel)trained.
   Support Vector Machine (RBF Kernel)trained.
                         Decisoin Treetrained.
                         Random Foresttrained.
                     Gradient Boostingtrained.
                               XGBoosttrained.
                              LightGBMtrained.
                              CatBoosttrained.


## Results

In [15]:
# Root-mean-squared error of each fitted model on the held-out test split
# (same units as the target; lower is better).
for name, model in models.items():
    y_pred = model.predict(X_test)
    rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
    # Separate the model name from the metric so the line is readable.
    print(name + " RMSE: {:.4f}".format(rmse))


                     Linear RegressionRMSE:3392.0044
 Linear Regression (L2 Regularization)RMSE:3392.1777
 Linear Regression (L1 Regularization)RMSE:3392.1794
                   K-Nearest NeighborsRMSE:1830.6562
                        Neural NetworkRMSE:3472.7495
Support Vector Machine (Linear Kernel)RMSE:10421.1116
   Support Vector Machine (RBF Kernel)RMSE:12499.4230
                         Decisoin TreeRMSE:1891.4476
                         Random ForestRMSE:1314.5273
                     Gradient BoostingRMSE:1505.7169
                               XGBoostRMSE:1452.2522
                              LightGBMRMSE:1240.3776
                              CatBoostRMSE:1248.5564


In [16]:
# Score of each fitted model on the held-out test split, labelled R^2.
for name, model in models.items():
    # Separate the model name from the metric so the line is readable.
    print(name + " R^2: {:.2f}".format(model.score(X_test, y_test)))

                     Linear RegressionR^2:0.92
 Linear Regression (L2 Regularization)R^2:0.92
 Linear Regression (L1 Regularization)R^2:0.92
                   K-Nearest NeighborsR^2:0.98
                        Neural NetworkR^2:0.92
Support Vector Machine (Linear Kernel)R^2:0.26
   Support Vector Machine (RBF Kernel)R^2:-0.06
                         Decisoin TreeR^2:0.98
                         Random ForestR^2:0.99
                     Gradient BoostingR^2:0.98
                               XGBoostR^2:0.99
                              LightGBMR^2:0.99
                              CatBoostR^2:0.99
