In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from flood_tool.geo import get_gps_lat_long_from_easting_northing
from utils.utils import init_logging, compute_metrics, tune_decision_boundary
from flood_tool.data_processor import DataProcessor


from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import make_pipeline as make_pipeline_imblearn
from imblearn.pipeline import Pipeline as imblearnPipeline
from sklearn.ensemble import RandomForestClassifier


# Project helper; presumably sets up the log format seen in the cell outputs below — TODO confirm.
init_logging()

# Reload edited project modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

# Seed constant. NOTE(review): the visible cells below use `RDS` (imported from
# flood_tool.models.constant) or a literal 42 rather than `rds` — confirm whether this is still needed.
rds = 42

In [3]:
# Build the project data processor from the labelled postcode file and the
# shared resource directory (both paths are relative to this notebook).
dp = DataProcessor(
    postcodes_path='../flood_tool/resources/postcodes_labelled.csv',
    resource_path='../flood_tool/resources/',
)

2023-11-23 09:30:21,381 [flood_tool.data_processor] INFO: Data loaded successfully loaded.


In [4]:
# Alternative input kept for reference: the plain labelled postcode table.
# data = dp.df_postcodes  # normal labelled
# Rainfall frame; not referenced again in the visible cells.
df_rainfall = dp.get_rainfall_dataframe()
# Combined frame; this is the `data` every later cell reads from.
data = dp.get_combined_dataframe()

2023-11-23 09:30:22,912 [flood_tool.data_processor] INFO: Data successfully merged.


In [5]:
from flood_tool.tool import Tool

# Reference run: ask the packaged Tool for historic-flooding predictions on
# every postcode in `data`, using its 'rf_classifier' method.
tool = Tool()
tool.predict_historic_flooding(
    postcodes=data['postcode'].tolist(),
    method='rf_classifier',
)


2023-11-23 09:30:25,104 [flood_tool.data_processor] INFO: Data loaded successfully loaded.
2023-11-23 09:30:25,662 [flood_tool.data_processor] INFO: Data successfully merged.
2023-11-23 09:31:11,182 [flood_tool.models.predictor] INFO: Model successfully fit.
2023-11-23 09:31:11,183 [flood_tool.tool] INFO: rf_classifier: {'accuracy': 0.9818333333333333, 'precision': 0.7919463087248322, 'recall': 0.6020408163265306, 'f1': 0.6840579710144928}


postcode
OL9 7NS    0
OL9 7DX    0
OL9 7AZ    0
OL9 7LY    0
OL9 7DF    0
          ..
S96 5XX    1
CB6 1EH    0
CW9 6NF    0
CW9 6LZ    0
B62 0QH    0
Name: predictions, Length: 30000, dtype: int64

In [None]:
# Number of missing values in each column of the combined frame.
print(data.isnull().sum())

In [None]:
# List the available columns before choosing model features.
data.columns

In [None]:
# Columns of `data` used as model inputs (order is preserved in X).
feats = [
    'latitude',
    'longitude',
    'soilType',
    'elevation',
    'typical_average_rainfall_per_hour',
    'wet_average_rainfall_per_hour',
    'postcodeSector',
]

In [None]:
# Feature matrix: all rows, only the selected feature columns.
X = data.loc[:, feats]

In [None]:
# Target: the historicallyFlooded flag cast to integers.
y = data['historicallyFlooded'].astype(int)

In [None]:
from sklearn.model_selection import train_test_split
from flood_tool.models.constant import RDS

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class proportions, and seed the shuffle with RDS.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RDS,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

# Route columns by dtype: numeric ones to the numeric branch, everything else
# to the categorical branch.
num_cols = X_train.select_dtypes(include=np.number).columns
cat_cols = X_train.select_dtypes(exclude=np.number).columns

# Numeric branch: median imputation followed by standardisation.
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation, then dense one-hot encoding
# that ignores categories not seen during fit.
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

preprocessor = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
])

# Baseline classifier: preprocessing + class-weighted random forest, with no
# resampling step in the pipeline.
model_pipe = imblearnPipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'model',
            RandomForestClassifier(
                n_estimators=761,
                max_depth=33,
                min_samples_leaf=3,
                class_weight='balanced_subsample',
                random_state=RDS,
                n_jobs=-1,
            ),
        ),
    ]
)


In [None]:
# Show every parameter of the pipeline and its steps; nested names use the
# `step__param` form.
model_pipe.get_params()

In [None]:
# Fit the preprocessing steps and the random forest on the training split.
model_pipe.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out test split.
y_hat = model_pipe.predict(X_test)

In [None]:
# Classification metrics for the baseline pipeline; the trailing `;` hides the
# cell's return value.
compute_metrics(y_test, y_hat, regression=False);  # baseline: pipeline without SMOTE oversampling

In [None]:
from scipy import stats

# Search space for the randomized search. Keys target the pipeline's 'model'
# step; integer ranges are scipy `randint` distributions.
param_distribution = dict(
    model__class_weight=['balanced', 'balanced_subsample'],
    model__max_depth=stats.randint(10, 100),
    model__n_estimators=stats.randint(100, 1000),
    model__min_samples_leaf=stats.randint(1, 20),
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyper-parameter search over the baseline pipeline: 25 sampled
# candidates, each scored by average precision on 5 stratified folds.
#
# The splitter object itself is passed as `cv` (not the generator returned by
# `.split(...)`): a generator is exhausted by the first `fit`, so re-running the
# fit cell or cloning the search would see zero splits. The search calls
# `split` on the data given to `fit`, so the folds are the same as before.
rand_cv = RandomizedSearchCV(
    estimator=model_pipe,
    param_distributions=param_distribution,
    cv=StratifiedKFold(n_splits=5),
    n_iter=25,
    n_jobs=-1,
    random_state=RDS,
    scoring='average_precision',
    verbose=1,
)


In [None]:
# Run the search on the training split (n_iter=25 candidates x 5 folds).
rand_cv.fit(X_train, y_train)

In [None]:
# Best cross-validated score found (the search uses scoring='average_precision').
rand_cv.best_score_

In [None]:
# Hyper-parameter combination that achieved the best score.
rand_cv.best_params_

In [None]:
# Pipeline configured with the best hyper-parameters found by the search.
best_model = rand_cv.best_estimator_

In [None]:
# Fit the selected pipeline on the training split.
# NOTE(review): `rand_cv` is built without a `refit` argument, and scikit-learn's
# default (refit=True) already refits the best estimator on the data passed to
# `rand_cv.fit` — this call looks redundant; confirm before removing.
best_model.fit(X_train, y_train)

In [None]:
# Test-split predictions from the tuned pipeline: hard labels and class
# probabilities (column 1 is used below as the positive-class score).
y_hat = best_model.predict(X_test)
y_hat_pp = best_model.predict_proba(X_test)

In [None]:
# Metrics for the tuned pipeline on the test split; the second probability
# column is passed as `y_pp`. `compute_metrics` is already imported in the
# notebook's top import cell, so it is not re-imported here.
compute_metrics(y_test, y_hat, regression=False, y_pp=y_hat_pp[:, 1]);

In [None]:
# SMOTE variant of the pipeline: same preprocessing and forest settings as the
# baseline, with an oversampling step in between. This rebinds `model_pipe`, so
# cells below operate on this variant.
# Both seeds use the shared RDS constant (instead of a literal 42) and the
# forest uses n_jobs=-1, matching the baseline pipeline and the search.
# NOTE(review): class_weight='balanced_subsample' is kept alongside SMOTE —
# confirm that rebalancing twice is intended.
model_pipe = imblearnPipeline([
    ('preprocessor', preprocessor),
    ('oversample', SMOTE(random_state=RDS)),
    ('model', RandomForestClassifier(
        n_jobs=-1,
        random_state=RDS,
        class_weight='balanced_subsample',
        max_depth=33,
        min_samples_leaf=3,
        n_estimators=761,
    )),
])

In [None]:
# Fit the SMOTE pipeline on the training split.
model_pipe.fit(X_train, y_train)

In [None]:
# Test-split predictions from the SMOTE pipeline; these overwrite the `y_hat`
# and `y_hat_pp` produced by the tuned model above.
y_hat = model_pipe.predict(X_test)
y_hat_pp = model_pipe.predict_proba(X_test)

In [None]:
# Metrics for the SMOTE pipeline on the test split; the second probability
# column is passed as `y_pp`. `compute_metrics` is already imported in the
# notebook's top import cell, so it is not re-imported here.
compute_metrics(y_test, y_hat, regression=False, y_pp=y_hat_pp[:, 1]);

In [None]:
# Tune the decision threshold of the SMOTE pipeline for recall.
# `tune_decision_boundary` is already imported in the notebook's top import
# cell, so it is not re-imported here.
# NOTE(review): tuning uses X_train/y_train, the same split the pipeline was
# fitted on — confirm this is intended rather than a held-out split.
thold, thold_idx = tune_decision_boundary(
    model=model_pipe,
    tune_metric='recall',
    X=X_train,
    y=y_train,
    upper_qty=0.95,
    return_preds=False,
)

In [None]:
# Display the first value returned by tune_decision_boundary.
thold

In [None]:
# Metrics when the second probability column is cut at 0.2 instead of using `predict`.
# NOTE(review): 0.2 is hard-coded; `thold` from tune_decision_boundary is
# computed earlier but not used here — confirm which threshold is intended.
compute_metrics(y_test, y_hat_pp[:, 1] > 0.2, regression=False);

In [None]:
# Metrics with the second probability column cut at a hard-coded 0.2.
# NOTE(review): this cell's code is identical to the previous cell — likely a
# leftover duplicate; confirm before removing.
compute_metrics(y_test, y_hat_pp[:, 1] > 0.2, regression=False);

In [None]:
# Fresh Tool instance for the packaged-model check below. `Tool` is already
# imported by the earlier cell that first constructs it, so it is not
# re-imported here.
tool = Tool()

In [None]:
# Display the combined frame.
data

In [162]:
# Same packaged-model prediction call as near the top of the notebook.
# NOTE(review): the recorded output below carries an execution count and
# timestamp from a different run than the surrounding cells — re-run to refresh.
tool.predict_historic_flooding(postcodes=data.postcode.tolist(), method='rf_classifier')

2023-11-22 21:51:29,681 [flood_tool.models.predictor] INFO: Model successfully fit.


postcode
OL9 7NS     0
WV13 2LR    0
LS12 1LZ    0
SK15 1TS    0
TS17 9NN    0
           ..
LS16 0BP    0
SK8 4PG     0
HD7 4PA     0
NE16 5YT    0
B8 2RQ      0
Name: predictions, Length: 30000, dtype: int64

In [254]:
# Re-create the Tool; the recorded log below shows the load/merge messages twice for this call.
tool = Tool()

2023-11-23 08:05:41,423 [flood_tool.data_processor] INFO: Data loaded successfully loaded.
2023-11-23 08:05:41,625 [flood_tool.data_processor] INFO: Data loaded successfully loaded.
2023-11-23 08:05:42,174 [flood_tool.data_processor] INFO: Data successfully merged.
2023-11-23 08:05:42,446 [flood_tool.data_processor] INFO: Data successfully merged.


486