In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from flood_tool.geo import get_gps_lat_long_from_easting_northing
from sklearn.metrics import roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.utils.class_weight import compute_class_weight
from scipy.stats import randint

In [2]:
data = pd.read_csv('flood_tool/resources/postcodes_labelled.csv')

In [None]:
data.drop_duplicates(inplace=True)

coordinates_lat = get_gps_lat_long_from_easting_northing(data['easting'], data['northing'])
coordinates_df = pd.DataFrame({
    'Latitude': coordinates_lat[0],
    'Longitude': coordinates_lat[1]
})
data = pd.concat([data, coordinates_df], axis=1)
data.drop(columns=['easting', 'northing'], inplace=True)

data['log_medianPrice'] = np.log(data['medianPrice'])

# Create bins for log-transformed medianPrice based on the description of log median price
bins = [-np.inf, 11.9, 14.2, np.inf]
labels = ['Low', 'Medium', 'High']

# Apply log transformation and binning
data['price_category'] = pd.cut(data['log_medianPrice'], bins=bins, labels=labels)
# Display the first few rows to verify
data[['medianPrice', 'log_medianPrice', 'price_category']].head()
# postcodes_labelled.isnull().sum()

#data['nearestWatercourse'].nunique() # 1146 + Nan which is No nearest WaterCourse
data['nearestWatercourse'].fillna('NoStreamNear', inplace=True)

## Define bins and labels
elevation_bins = [-10, 0, 40, 80, np.inf]
elevation_labels = ['BelowSeaLevel', 'Low', 'Medium', 'High']
# Apply binning to the elevation column
data['elevation_category'] = pd.cut(data['elevation'], bins=elevation_bins, labels=elevation_labels)

# Define bins and labels
distance_bins = [0, 630, 1090, 1840, np.inf]
distance_labels = ['Very Close', 'Close', 'Moderate', 'Far']

# Apply binning to the distanceToWatercourse column
data['distanceToWatercourse_category'] = pd.cut(data['distanceToWatercourse'], bins=distance_bins, labels=distance_labels)

bins = [-np.inf, 11.9, 14.2, np.inf]
labels = ['Low', 'Medium', 'High']

# Apply log transformation and binning
data['log_medianPrice'] = np.log(data['medianPrice'])
data['price_category'] = pd.cut(data['log_medianPrice'], bins=bins, labels=labels)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['nearestWatercourse'].fillna('NoStreamNear', inplace=True)


In [None]:
def log_transform(x):
    return np.log1p(x+10)

numeric_features = ['elevation', 'distanceToWatercourse']
numeric_remained = ['Latitude', 'Longitude']
categorical_features = ['soilType']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('log_transform', FunctionTransformer(log_transform, validate=True)),
    ('scaler', RobustScaler())
])

numeric_remained_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('num_rem', numeric_remained_transformer, numeric_remained),
        ('cat', categorical_transformer, categorical_features)
    ])

standard_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

In [None]:
# X include soilType, nearestWatercourse, elevation, distanceToWatercourse, Latitude, Longitude
# y include riskLabel

X = data[['elevation', 'distanceToWatercourse', 'Latitude', 'Longitude', 'soilType']]
y = data['riskLabel']

In [6]:
# enconde y to 0, 1, 2, 3, 4, 5, 6
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

X_train_transformed = standard_pipeline.fit_transform(X_train)
X_test_transformed = standard_pipeline.transform(X_test)

# imbalanced data handling (SMOTE and ADASYN)
smote = SMOTE(sampling_strategy={5: 5000, 6: 2000}, random_state=42)
adasyn = ADASYN(sampling_strategy={5: 6000, 6: 3000}, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_transformed, y_train)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_resampled, y_train_resampled)

# data augmentation for class 6 and 7
X_train_class6 = X_train_transformed[y_train == 5]
X_train_class7 = X_train_transformed[y_train == 6]
noise6 = np.random.normal(0, 0.01, X_train_class6.shape)
noise7 = np.random.normal(0, 0.01, X_train_class7.shape)
X_augmented = np.vstack([X_train_class6 + noise6, X_train_class7 + noise7])
y_augmented = np.hstack([np.full(X_train_class6.shape[0], 5), np.full(X_train_class7.shape[0], 6)])

X_train_resampled = np.vstack([X_train_resampled, X_augmented])
y_train_resampled = np.hstack([y_train_resampled, y_augmented])

# calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_resampled), y=y_train_resampled)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

XGboost = XGBClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=12,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss'
)
XGboost.fit(X_train_resampled, y_train_resampled)

# predict and decode
y_pred_proba = XGboost.predict_proba(X_test_transformed)
y_pred_encoded = np.argmax(y_pred_proba, axis=1)

# set threshold for class 6 and 7
threshold_6 = 0.3
threshold_7 = 0.3
y_pred_encoded[(y_pred_proba[:, 5] > threshold_6)] = 5
y_pred_encoded[(y_pred_proba[:, 6] > threshold_7)] = 6

y_pred = label_encoder.inverse_transform(y_pred_encoded)
y_test_original = label_encoder.inverse_transform(y_test)

roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
print(f'Overall ROC AUC: {roc_auc:.4f}')

print(classification_report(y_test_original, y_pred))

for i, cls in enumerate(label_encoder.classes_):
    cls_roc_auc = roc_auc_score((y_test == i).astype(int), y_pred_proba[:, i])
    print(f"Class {cls} ROC AUC: {cls_roc_auc:.4f}")

Overall ROC AUC: 0.8617
              precision    recall  f1-score   support

           1       0.79      0.94      0.85     15876
           2       0.69      0.36      0.47      4165
           3       0.46      0.28      0.35      2134
           4       0.60      0.44      0.51       628
           5       0.56      0.27      0.36       730
           6       0.16      0.26      0.20       314
           7       0.14      0.22      0.17       153

    accuracy                           0.73     24000
   macro avg       0.48      0.39      0.42     24000
weighted avg       0.72      0.73      0.71     24000

Class 1 ROC AUC: 0.8325
Class 2 ROC AUC: 0.7942
Class 3 ROC AUC: 0.8757
Class 4 ROC AUC: 0.8903
Class 5 ROC AUC: 0.8859
Class 6 ROC AUC: 0.8539
Class 7 ROC AUC: 0.8994


In [7]:
# Hyperparameter tuning
param_distributions = {
    'n_estimators': [200, 300, 400, 500, 600],
    'max_depth': randint(3, 15),
    'learning_rate': [0.1, 0.3, 0.5, 0.7, 0.9]
}

# Initialize the XGBClassifier with other parameters fixed
xgb = XGBClassifier(
    random_state=42,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss'
)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=5,
    scoring='roc_auc_ovr',
    cv=5,
    random_state=42,
    verbose=1,
    n_jobs=-1
)

# Fit RandomizedSearchCV
random_search.fit(X_train_resampled, y_train_resampled)

# Get the best parameters
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")

# Retrain the model with the best parameters
XGboost = XGBClassifier(
    random_state=42,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    **best_params
)
XGboost.fit(X_train_resampled, y_train_resampled)

# Predict and decode
y_pred_proba = XGboost.predict_proba(X_test_transformed)
y_pred_encoded = np.argmax(y_pred_proba, axis=1)

# Set threshold for classes 5 and 6
threshold_5 = 0.3
threshold_6 = 0.3
y_pred_encoded[(y_pred_proba[:, 5] > threshold_5)] = 5
y_pred_encoded[(y_pred_proba[:, 6] > threshold_6)] = 6

y_pred = label_encoder.inverse_transform(y_pred_encoded)
y_test_original = label_encoder.inverse_transform(y_test)

# Evaluate the model
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
print(f'Overall ROC AUC: {roc_auc:.4f}')

print(classification_report(y_test_original, y_pred))

for i, cls in enumerate(label_encoder.classes_):
    cls_roc_auc = roc_auc_score((y_test == i).astype(int), y_pred_proba[:, i])
    print(f"Class {cls} ROC AUC: {cls_roc_auc:.4f}")


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best parameters: {'learning_rate': 0.5, 'max_depth': 9, 'n_estimators': 400}
Overall ROC AUC: 0.8403
              precision    recall  f1-score   support

           1       0.80      0.89      0.84     15876
           2       0.57      0.42      0.48      4165
           3       0.43      0.36      0.40      2134
           4       0.52      0.40      0.45       628
           5       0.46      0.28      0.35       730
           6       0.16      0.25      0.19       314
           7       0.15      0.21      0.18       153

    accuracy                           0.72     24000
   macro avg       0.44      0.40      0.41     24000
weighted avg       0.70      0.72      0.70     24000

Class 1 ROC AUC: 0.8165
Class 2 ROC AUC: 0.7730
Class 3 ROC AUC: 0.8611
Class 4 ROC AUC: 0.8635
Class 5 ROC AUC: 0.8627
Class 6 ROC AUC: 0.8252
Class 7 ROC AUC: 0.8805
