# Merging new dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor

import numbers
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer

from sklearn import set_config
set_config(transform_output="pandas")

In [2]:
df = pd.read_csv("flood_tool/resources/postcodes_labelled.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,postcode,easting,northing,soilType,elevation,localAuthority,riskLabel,medianPrice,historicallyFlooded
0,0,OL9 7NS,390978,403269,Unsurveyed/Urban,130,Oldham,1,119100.0,False
1,1,WV13 2LR,396607,298083,Unsurveyed/Urban,130,Walsall,1,84200.0,False
2,2,LS12 1LZ,427859,432937,Unsurveyed/Urban,60,Leeds,1,134900.0,False
3,3,SK15 1TS,395560,397900,Unsurveyed/Urban,120,Tameside,1,170200.0,False
4,4,TS17 9NN,445771,515362,Unsurveyed/Urban,20,Stockton-on-Tees,1,190600.0,False


In [4]:
df.duplicated().sum()

0

In [5]:
X = df.drop(columns = 'riskLabel')

In [6]:
y = df['riskLabel']

In [7]:
num_cols = X.select_dtypes(include = np.number).columns

In [8]:
cat_cols = X.select_dtypes(exclude = np.number).columns

In [9]:
cat_cols = cat_cols.drop(['localAuthority', 'historicallyFlooded', 'postcode'])

In [10]:
num_cols = num_cols.drop('medianPrice') #We will keep same features as unlabelled data

In [11]:
num_pipe = make_pipeline(SimpleImputer(), StandardScaler())

cat_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), 
                         OneHotEncoder(handle_unknown='ignore', sparse_output = False))

preproc_pipe = ColumnTransformer([('num_cols', num_pipe, num_cols),
                                 ('cat_cols', cat_pipe, cat_cols)])

In [12]:
SCORES = np.array(
    [
        [100, 80, 60, 60, 30, 0, -30, -600, -1800, -2400],
        [80, 100, 80, 90, 60, 30, 0, -300, -1200, -1800],
        [60, 80, 100, 120, 90, 60, 30, 0, -600, -1200],
        [40, 60, 80, 150, 120, 90, 60, 300, 0, -600],
        [20, 40, 60, 120, 150, 120, 90, 600, 600, 0],
        [0, 20, 40, 90, 120, 150, 120, 900, 1200, 600],
        [-20, 0, 20, 60, 90, 120, 150, 1200, 1800, 1200],
        [-40, -20, 0, 30, 60, 90, 120, 1500, 2400, 1800],
        [-60, -40, -20, 0, 30, 60, 90, 1200, 3000, 2400],
        [-80, -60, -40, -30, 0, 30, 60, 900, 2400, 3000],
    ]
)

In [13]:
def score_fn(y_true: np.ndarray, y_pred: np.ndarray) -> numbers.Real:
     # Adjust for zero-based indexing
    y_pred_adjusted = np.round(y_pred, 0).astype(int) - 1
    y_true_adjusted = y_true - 1

    # Calculate the score
    score = sum(
        SCORES[pred, true] for pred, true in zip(y_pred_adjusted, y_true_adjusted)
    )
    return score

In [14]:
custom_scorer = make_scorer(score_fn, greater_is_better=True)

## Random Forest SMOTED

In [15]:
rf_smote_pipeline = ImbPipeline([
    ('preprocessor', preproc_pipe),
    ('smote', SMOTE()),
    ('classifier', RandomForestClassifier())])

In [16]:
rf_smote_pipeline.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('num_cols',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    Index(['Unnamed: 0', 'easting', 'northing', 'elevation'], dtype='object')),
                                   ('cat_cols',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse_output=False))]),
                                    Index(

In [17]:
param_grid = {
    'classifier__n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],  # Number of trees
    'classifier__max_features': ['auto', 'sqrt'],  # Number of features to consider at every split
    'classifier__max_depth': [int(x) for x in np.linspace(10, 50, num=5)] + [None],  # Maximum number of levels in tree
    'classifier__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'classifier__min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
    'classifier__bootstrap': [True, False]  # Method of selecting samples for training each tree
}



In [18]:
random_search_rf_smooted = RandomizedSearchCV(estimator = rf_smote_pipeline, param_distributions = param_grid, n_iter = 100, cv = 3, verbose=2, scoring = custom_scorer, random_state=42, n_jobs = -1)


In [19]:
random_search_rf_smooted.fit(X, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END classifier__bootstrap=False, classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=400; total time=14.0min
[CV] END classifier__bootstrap=False, classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=400; total time=14.7min


  warn(
  warn(
  warn(


[CV] END classifier__bootstrap=False, classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=400; total time=14.1min
[CV] END classifier__bootstrap=False, classifier__max_depth=10, classifier__max_features=auto, classifier__min_samples_leaf=2, classifier__min_samples_split=2, classifier__n_estimators=400; total time= 8.3min
[CV] END classifier__bootstrap=True, classifier__max_depth=50, classifier__max_features=auto, classifier__min_samples_leaf=4, classifier__min_samples_split=2, classifier__n_estimators=900; total time=19.2min


  warn(
  warn(


[CV] END classifier__bootstrap=True, classifier__max_depth=40, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=2, classifier__n_estimators=900; total time=19.5min
[CV] END classifier__bootstrap=False, classifier__max_depth=10, classifier__max_features=auto, classifier__min_samples_leaf=2, classifier__min_samples_split=2, classifier__n_estimators=400; total time= 8.1min
[CV] END classifier__bootstrap=False, classifier__max_depth=40, classifier__max_features=sqrt, classifier__min_samples_leaf=4, classifier__min_samples_split=5, classifier__n_estimators=100; total time= 3.3min
[CV] END classifier__bootstrap=False, classifier__max_depth=40, classifier__max_features=sqrt, classifier__min_samples_leaf=4, classifier__min_samples_split=5, classifier__n_estimators=100; total time= 3.0min
[CV] END classifier__bootstrap=False, classifier__max_depth=40, classifier__max_features=auto, classifier__min_samples_leaf=2, classifier__min_samples_split=10, clas

  warn(


[CV] END classifier__bootstrap=True, classifier__max_depth=40, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=2, classifier__n_estimators=900; total time=19.0min
[CV] END classifier__bootstrap=False, classifier__max_depth=10, classifier__max_features=auto, classifier__min_samples_leaf=2, classifier__min_samples_split=2, classifier__n_estimators=400; total time= 8.6min
[CV] END classifier__bootstrap=True, classifier__max_depth=50, classifier__max_features=auto, classifier__min_samples_leaf=4, classifier__min_samples_split=2, classifier__n_estimators=900; total time=18.4min
[CV] END classifier__bootstrap=False, classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=1000; total time=35.9min
[CV] END classifier__bootstrap=False, classifier__max_depth=40, classifier__max_features=auto, classifier__min_samples_leaf=2, classifier__min_samples_split=10, cl

KeyboardInterrupt: 

Takes too long so in the class, we did the hyperparameter tuning in a subset.