In [48]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score, confusion_matrix
import numpy as np
import pandas as pd
import os
import sys
sys.path.append('..')
import flood_tool as ft
import seaborn as sns

In [49]:
# Read the labelled postcode table from the flood_tool data directory.
labelled_csv = os.path.join(ft._data_dir, 'postcodes_labelled.csv')
df = pd.read_csv(labelled_csv)

In [50]:
def add_postcode_features(df):
    """Return a copy of ``df`` with the postcode hierarchy added as columns.

    The postcode is parsed structurally rather than by string length: all
    whitespace is removed, the last three characters are taken as the inward
    code and everything before them as the outward code. This handles every
    standard layout (``A9 9AA``, ``A99 9AA``, ``A9A 9AA``, ``AA9 9AA``,
    ``AA99 9AA``, ``AA9A 9AA``) as well as unspaced or padded variants.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``'postcode'``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with four extra columns:

        * ``postcode_area``     - leading letters of the outward code ("OL")
        * ``postcode_district`` - the outward code ("OL9")
        * ``postcode_sector``   - outward code + first inward digit ("OL97")
        * ``postcode_unit``     - the full postcode without spaces ("OL97NS")

        Missing postcodes yield missing values in all four columns.
    """
    # Whitespace-free form, e.g. "OL9 7NS" -> "OL97NS".
    compact = df['postcode'].str.replace(r'\s+', '', regex=True)
    # The inward code is always the final three characters.
    outward = compact.str[:-3]

    return df.assign(
        # The area is the alphabetic prefix, which may be one OR two letters.
        postcode_area=outward.str.extract(r'^([A-Za-z]+)', expand=False),
        postcode_district=outward,
        # Dropping the last two characters leaves outward code + sector digit.
        postcode_sector=compact.str[:-2],
        postcode_unit=compact,
    )

# Attach the postcode_area / district / sector / unit columns and display the result.
df = add_postcode_features(df)
df

Unnamed: 0,postcode,easting,northing,soilType,elevation,localAuthority,riskLabel,medianPrice,historicallyFlooded,postcode_area,postcode_district,postcode_sector,postcode_unit
0,OL9 7NS,390978,403269,Unsurveyed/Urban,130,Oldham,1,119100.0,False,OL,OL9,OL97,OL97NS
1,WV13 2LR,396607,298083,Unsurveyed/Urban,130,Walsall,1,84200.0,False,WV,WV13,WV132,WV132LR
2,LS12 1LZ,427859,432937,Unsurveyed/Urban,60,Leeds,1,134900.0,False,LS,LS12,LS121,LS121LZ
3,SK15 1TS,395560,397900,Unsurveyed/Urban,120,Tameside,1,170200.0,False,SK,SK15,SK151,SK151TS
4,TS17 9NN,445771,515362,Unsurveyed/Urban,20,Stockton-on-Tees,1,190600.0,False,TS,TS17,TS179,TS179NN
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,LS16 0BP,425977,438923,Unsurveyed/Urban,160,Leeds,1,,False,LS,LS16,LS160,LS160BP
29996,SK8 4PG,384808,387982,Unsurveyed/Urban,40,Stockport,1,328700.0,False,SK,SK8,SK84,SK84PG
29997,HD7 4PA,409215,416819,Cambisols,310,Kirklees,1,214500.0,False,HD,HD7,HD74,HD74PA
29998,NE16 5YT,419672,560517,Unsurveyed/Urban,130,Gateshead,1,273100.0,False,NE,NE16,NE165,NE165YT


In [51]:
class OrdinalTransformer(TransformerMixin, BaseEstimator):
    """Encode categorical columns as integers using a caller-supplied order.

    Parameters
    ----------
    category_mapping : dict
        Maps each column name to the sequence of its categories. A category is
        encoded as its position in that sequence.
    unknown : {'ignore', 'use_max'}, default='ignore'
        What to do with a value that is not listed for its column:
        ``'ignore'`` passes the raw value through unchanged, ``'use_max'``
        encodes it as one more than the largest known code.
    """

    _UNKNOWN_MODES = ('ignore', 'use_max')

    def __init__(self, category_mapping, unknown='ignore'):
        # Only store the constructor arguments; derived state is computed on
        # demand so it can never go stale after set_params().
        self.category_mapping = category_mapping
        self.unknown = unknown

    @property
    def category_dicts(self):
        """Per-column ``{category: integer code}`` lookup built from ``category_mapping``."""
        return {
            col: {cat: idx for idx, cat in enumerate(categories)}
            for col, categories in self.category_mapping.items()
        }

    def fit(self, X=None, y=None):
        """No-op: the encoding is fully determined by ``category_mapping``."""
        return self

    def transform(self, X, y=None):
        """Return a DataFrame with each mapped column replaced by its codes.

        ``X`` may be a DataFrame or an array-like whose columns correspond, in
        order, to the keys of ``category_mapping``.

        Raises
        ------
        ValueError
            If ``unknown`` is not a supported mode.
        """
        if self.unknown not in self._UNKNOWN_MODES:
            raise ValueError(f"Unknown handling mode '{self.unknown}' not supported.")

        lookups = self.category_dicts
        # Label the columns from the mapping keys instead of a hard-coded name,
        # so the transformer works for any column(s) it was configured with.
        X = pd.DataFrame(X, columns=list(lookups))
        X_transformed = X.copy()

        for col, lookup in lookups.items():
            # Bind col/lookup as defaults so the helper does not depend on the
            # loop variables, and only consult handle_unknown for real misses.
            def encode(value, col=col, lookup=lookup):
                if value in lookup:
                    return lookup[value]
                return self.handle_unknown(col, value)

            X_transformed[col] = X[col].apply(encode)
        return X_transformed

    def handle_unknown(self, column, value):
        """Return the replacement for ``value``, which is not a known category of ``column``."""
        if self.unknown == 'ignore':
            return value
        elif self.unknown == 'use_max':
            # Known codes run 0..n-1, so n is one past the largest known code
            # (and 0 when no categories were supplied).
            return len(self.category_mapping[column])
        else:
            raise ValueError(f"Unknown handling mode '{self.unknown}' not supported.")

In [52]:
# Features: everything except the target and the riskLabel, medianPrice and localAuthority columns.
X = df.drop(columns= ['riskLabel', 'medianPrice', 'historicallyFlooded', 'localAuthority'])
# Target: the historicallyFlooded flag.
y = df['historicallyFlooded']
X

Unnamed: 0,postcode,easting,northing,soilType,elevation,postcode_area,postcode_district,postcode_sector,postcode_unit
0,OL9 7NS,390978,403269,Unsurveyed/Urban,130,OL,OL9,OL97,OL97NS
1,WV13 2LR,396607,298083,Unsurveyed/Urban,130,WV,WV13,WV132,WV132LR
2,LS12 1LZ,427859,432937,Unsurveyed/Urban,60,LS,LS12,LS121,LS121LZ
3,SK15 1TS,395560,397900,Unsurveyed/Urban,120,SK,SK15,SK151,SK151TS
4,TS17 9NN,445771,515362,Unsurveyed/Urban,20,TS,TS17,TS179,TS179NN
...,...,...,...,...,...,...,...,...,...
29995,LS16 0BP,425977,438923,Unsurveyed/Urban,160,LS,LS16,LS160,LS160BP
29996,SK8 4PG,384808,387982,Unsurveyed/Urban,40,SK,SK8,SK84,SK84PG
29997,HD7 4PA,409215,416819,Cambisols,310,HD,HD7,HD74,HD74PA
29998,NE16 5YT,419672,560517,Unsurveyed/Urban,130,NE,NE16,NE165,NE165YT


In [53]:
# Names of every numeric column in X, as a plain Python list.
num_features = X.select_dtypes(include=np.number).columns.tolist()
num_features

['easting', 'northing', 'elevation']

In [54]:
# Column passed to the 'ord_transformer' branch of the preprocessors below.
ord_features = ['soilType']
# Column passed to the one-hot 'cat_transformer' branch of the first preprocessor below.
cat_features = ['postcode_district']

In [55]:
# Keep only the columns the preprocessors reference; `columns` is reused when predicting.
columns = num_features + ord_features + cat_features
X = X[columns]

In [56]:
# Category order handed to OrdinalTransformer: each soil type is encoded as its
# position in this list (Luvisols -> 0, ..., Unsurveyed/Urban -> 9).
# NOTE(review): the rationale behind this particular ordering is not stated — confirm.
soilType_mapping = {'soilType': [
    'Luvisols',
    'Cambisols',
    'Arenosols',
    'Leptosols',
    'Podsols',
    'Planosols',
    'Stagnosols',
    'Gleysols',
    'Histosols',
    'Unsurveyed/Urban']}

In [57]:
### score around 0.68 with RF
# Variant preprocessor: scaled numeric columns, one-hot encoded postcode district,
# and soilType encoded with OrdinalTransformer.
# NOTE(review): `preprocessor` and `pipeline` are rebound by the following cell, so
# this variant is not the one fitted further down when the notebook runs top to bottom.

preprocessor = ColumnTransformer([
    ('num_transformer', make_pipeline(SimpleImputer(strategy='mean'),
                                              StandardScaler()), num_features),
    ('cat_transformer', make_pipeline(SimpleImputer(strategy ='most_frequent'), 
                                      OneHotEncoder(sparse_output=False, handle_unknown='ignore')), cat_features),
    ('ord_transformer', make_pipeline(SimpleImputer(strategy ='most_frequent'), OrdinalTransformer(soilType_mapping)), ord_features)
    ])

# Preprocessing followed by a class-balanced random forest with a fixed seed.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42))
])

In [58]:
# Numeric branch: mean-impute missing values, then standardise.
numeric_steps = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

# ord_features branch: impute the most frequent value, then one-hot encode
# (categories unseen during fit become all-zero rows).
soil_steps = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', numeric_steps, num_features),
        ('ord_transformer', soil_steps, ord_features),
    ]
)

# Preprocessing followed by a class-balanced random forest with a fixed seed.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
    ]
)

In [59]:
# Fit the preprocessing + random-forest pipeline on every labelled row.
pipeline.fit(X, y)

In [60]:
# Load the unlabelled postcodes, derive the same postcode features as for the
# training frame, and predict on the columns the pipeline was fitted with.
unlabelled_csv = os.path.join(ft._data_dir, 'postcodes_unlabelled.csv')
test = add_postcode_features(pd.read_csv(unlabelled_csv))
pipeline.predict(test[columns])

array([False, False, False, ..., False, False, False])

# Optional: evaluate the pipeline on a held-out split

In [61]:
# Derive the postcode features first so X is always built from a frame that
# contains them, then separate features and target exactly once.
df = add_postcode_features(df)
X = df.drop(columns=['riskLabel', 'medianPrice', 'historicallyFlooded', 'localAuthority'])
y = df['historicallyFlooded']

# Hold out 20% of the labelled rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: this refits `pipeline` on the training split only, replacing the
# earlier fit on the full labelled data.
pipeline.fit(X_train, y_train)

In [62]:
# Predict once on the held-out rows and reuse the result for scoring.
y_pred = pipeline.predict(X_test)
# Weighted F1 averages the per-class scores by class support, so the much
# larger negative class dominates this figure.
f1_score(y_test, y_pred, average='weighted')

0.9825923219884944

In [63]:
# Rows are the true classes and columns the predicted classes (scikit-learn convention).
confusion_matrix(y_test, pipeline.predict(X_test))

array([[5790,   23],
       [  74,  113]])