In [None]:
# This model is used to predict the local authority of a location based on classifier model (XGBoostclassifier)

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from flood_tool.geo import get_gps_lat_long_from_easting_northing
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [6]:
data = pd.read_csv('flood_tool/resources/postcodes_labelled.csv')

In [7]:
data.drop_duplicates(inplace=True)

coordinates_lat = get_gps_lat_long_from_easting_northing(data['easting'], data['northing'])
coordinates_df = pd.DataFrame({
    'Latitude': coordinates_lat[0],
    'Longitude': coordinates_lat[1]
})
data = pd.concat([data, coordinates_df], axis=1)
data.drop(columns=['easting', 'northing'], inplace=True)

data['log_medianPrice'] = np.log(data['medianPrice'])

# Create bins for log-transformed medianPrice based on the description of log median price
bins = [-np.inf, 11.9, 14.2, np.inf]
labels = ['Low', 'Medium', 'High']

# Apply log transformation and binning
data['price_category'] = pd.cut(data['log_medianPrice'], bins=bins, labels=labels)
# Display the first few rows to verify
data[['medianPrice', 'log_medianPrice', 'price_category']].head()
# postcodes_labelled.isnull().sum()

#data['nearestWatercourse'].nunique() # 1146 + Nan which is No nearest WaterCourse
data['nearestWatercourse'].fillna('NoStreamNear', inplace=True)

## Define bins and labels
elevation_bins = [-10, 0, 40, 80, np.inf]
elevation_labels = ['BelowSeaLevel', 'Low', 'Medium', 'High']
# Apply binning to the elevation column
data['elevation_category'] = pd.cut(data['elevation'], bins=elevation_bins, labels=elevation_labels)

# Define bins and labels
distance_bins = [0, 630, 1090, 1840, np.inf]
distance_labels = ['Very Close', 'Close', 'Moderate', 'Far']

# Apply binning to the distanceToWatercourse column
data['distanceToWatercourse_category'] = pd.cut(data['distanceToWatercourse'], bins=distance_bins, labels=distance_labels)

bins = [-np.inf, 11.9, 14.2, np.inf]
labels = ['Low', 'Medium', 'High']

# Apply log transformation and binning
data['log_medianPrice'] = np.log(data['medianPrice'])
data['price_category'] = pd.cut(data['log_medianPrice'], bins=bins, labels=labels)

def log_transform(x):
    return np.log1p(x+np.abs(data['elevation'].min()))

numeric_features = ['elevation', 'distanceToWatercourse', 'soilType_encoded', 'nearestWatercourse_encoded']
numeric_remained = ['Latitude', 'Longitude']
categorical_features = []

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('log_transform', FunctionTransformer(log_transform, validate=True)),
    ('scaler', RobustScaler())
])

numeric_remained_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('label', LabelEncoder())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('num_rem', numeric_remained_transformer, numeric_remained),
        ('cat', categorical_transformer, categorical_features)
    ])

standard_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['nearestWatercourse'].fillna('NoStreamNear', inplace=True)


In [8]:
# Label Encoding
label_encoder = LabelEncoder()
data['localAuthority_encoded'] = label_encoder.fit_transform(data['localAuthority'])
data['soilType_encoded'] = label_encoder.fit_transform(data['soilType'])
data['nearestWatercourse_encoded'] = label_encoder.fit_transform(data['nearestWatercourse'])

X = data[['elevation', 'soilType_encoded', 'nearestWatercourse_encoded', 'distanceToWatercourse', 'Latitude', 'Longitude']]
y = data['localAuthority_encoded']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Data Transformation
X_train_transformed = standard_pipeline.fit_transform(X_train)
X_test_transformed = standard_pipeline.transform(X_test)

# Train the Model
model = XGBClassifier(eval_metric=['mlogloss', 'merror'], random_state=42)
model.fit(X_train_transformed, y_train)

# Predict the Labels
y_pred = model.predict(X_test_transformed)

# Decode Predicted and Actual Labels to Original Names
y_pred_decoded = label_encoder.inverse_transform(y_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)

# Evaluate the Model
accuracy = accuracy_score(y_test_decoded, y_pred_decoded)
print("Pipeline Model Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test_decoded, y_pred_decoded))

Pipeline Model Accuracy: 0.9802916666666667
Classification Report:
                       precision    recall  f1-score   support

        Abberd Brook       0.98      0.97      0.98       109
         Abbey River       0.99      0.97      0.98       242
        Alder Stream       0.98      0.99      0.98       219
   Aldingbourne Rife       0.99      0.99      0.99       285
         Allen River       0.98      0.99      0.98       346
         Aller Brook       0.98      0.95      0.96       240
 Allerton Moor Rhyne       1.00      0.99      0.99       479
        Alphin Brook       0.99      0.96      0.97       217
          Alsa Brook       0.98      1.00      0.99       289
            Am Brook       0.99      0.99      0.99       557
       Andover Canal       0.94      0.98      0.96       359
       Andrew's Gill       0.98      0.99      0.98       230
       Annwood Brook       0.96      0.99      0.98       281
          Arch Brook       1.00      1.00      1.00      1142
 

In [9]:
# define the hyperparameter space
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.2),
    #'subsample': uniform(0.7, 0.3),
    #'colsample_bytree': uniform(0.7, 0.3),
    #'gamma': uniform(0, 0.2)
}

# define the model
random_search = RandomizedSearchCV(
    estimator=XGBClassifier(eval_metric='mlogloss', random_state=42),
    param_distributions=param_dist,
    n_iter=10, 
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# fit the model
random_search.fit(X_train_transformed, y_train)

print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Accuracy:", random_search.best_score_)

# evaluate the best model
best_model_random = random_search.best_estimator_

y_pred_random = best_model_random.predict(X_test_transformed)
#y_pred_random = [round(value) for value in y_pred_random]

# Decode Predicted and Actual Labels to Original Names
y_pred_decoded = label_encoder.inverse_transform(y_pred_random)
y_test_decoded = label_encoder.inverse_transform(y_test)

accuracy_random = accuracy_score(y_test_decoded, y_pred_decoded)
print("Best Model Test Accuracy:", accuracy_random)
print("Classification Report:\n", classification_report(y_test_decoded, y_pred_decoded))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'learning_rate': 0.10184977839317343, 'max_depth': 7, 'n_estimators': 149}
Best Cross-Validation Accuracy: 0.9801607142857144
Best Model Test Accuracy: 0.982375
Classification Report:
                       precision    recall  f1-score   support

        Abberd Brook       0.98      0.97      0.98       109
         Abbey River       0.99      0.99      0.99       242
        Alder Stream       0.98      0.99      0.98       219
   Aldingbourne Rife       0.99      0.99      0.99       285
         Allen River       0.97      0.99      0.98       346
         Aller Brook       0.97      0.95      0.96       240
 Allerton Moor Rhyne       1.00      0.99      0.99       479
        Alphin Brook       0.98      0.99      0.98       217
          Alsa Brook       0.99      1.00      1.00       289
            Am Brook       0.99      0.99      0.99       557
       Andover Canal       0.96      0.99      0.97  