In [None]:
import imblearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE

from sklearn import metrics, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# load the aggregated zonal-means table (one CSV, path relative to this notebook)
# and display it as the cell output
full_df = pd.read_csv(
    filepath_or_buffer="../data/zonal-means-aggregate-200910-201912.csv"
)
full_df

In [None]:
# count how many rows fall into each value of the "outbreak" target column
class_counts = full_df.loc[:, "outbreak"].value_counts()
print("Class Distribution:\n", class_counts, sep=" ")

The SMOTE algorithm for treating imbalanced datasets cannot deal with missing values (NaNs) in feature columns, so we need to handle the missing data first. Rather than imputing values, we follow the methodology used by Campbell et al. (2020) and keep only those districts and months that have data for all of the environmental parameters. 

In [None]:
# drop every row that has a missing value in any column; `full_df` is left untouched
cleaned_df = full_df.dropna(axis=0, how="any")
cleaned_df

By removing these rows, we now have fewer outbreak (and non-outbreak) months overall.

In [None]:
# recount the "outbreak" classes after the rows with missing values were removed
class_counts = cleaned_df.loc[:, "outbreak"].value_counts()
print("Class Distribution:\n", class_counts, sep=" ")

In [None]:
# "outbreak" is the variable we are trying to predict, so it becomes `y`
y = cleaned_df.loc[:, "outbreak"]

# every remaining column except the identifier/date columns is a feature
# (predicting) variable
X_cln = cleaned_df.drop(
    ["outbreak", "location_period_id", "month", "year"], axis=1
)

In [None]:
# summary statistics for every feature column (quartiles spelled out explicitly)
X_cln.describe(percentiles=[0.25, 0.5, 0.75])

Now let's explore the correlation between all of the environmental parameters we are using. By doing so, we might be able to reduce this feature space. 

In [None]:
# pairwise Spearman rank correlation between all feature columns,
# rendered as a colour-graded table
spearman = X_cln.corr("spearman")
spearman.style.background_gradient(cmap="coolwarm")

We observe correlation between precipitation and soil moisture values. This makes sense as one (precip) certainly has an impact on the other (soil moisture levels). We will want to consider this in our model development, as we can perhaps reduce the number of features considered. 

In [None]:
# flag strongly correlated feature pairs in either direction: a strong negative
# correlation (rho < -0.8) is just as redundant as a strong positive one, so the
# threshold is applied to the absolute value of the coefficient
(spearman.abs() > 0.8)

Now we'll drop those variables that have more than 0.8 correlation (i.e., among the soil moisture variables we'll keep only `sm_0`, and we'll also drop `precip_1`)

In [None]:
# reduced feature matrix: remove the columns singled out by the correlation check
X = X_cln.drop(columns=["sm_1", "sm_2", "sm_3", "precip_1"])

In [None]:
# recompute the Spearman correlation table on the reduced feature set
spearman = X.corr("spearman")
spearman.style.background_gradient(cmap="coolwarm")

In [None]:
# split the dataset into train (70%) and test (30%) splits.
# `stratify=y` keeps the outbreak / non-outbreak proportion the same in both
# splits; with a target this imbalanced, a purely random split can leave the
# test set with an unrepresentative number of outbreak months.
# NOTE: stratifying changes which rows land in the test set, so any counts
# quoted in the narrative cells must be re-checked after re-running.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

## Accounting for an imbalanced dataset

Below is a useful reference for different techniques used to address class imbalance in machine learning datasets: 
https://www.analyticsvidhya.com/blog/2020/07/10-techniques-to-deal-with-class-imbalance-in-machine-learning/

### SMOTE

In [None]:
# Oversample the minority class with SMOTE, using the training split only.
#
# Active setting: 1:10 ratio as used by Campbell et al 2020.
#   -> worked but still reflected only outbreak = 0 category
# Alternative (disabled): 1:2 ratio - accuracy is more reflective of minority
#   category, but not biologically relevant
# smote = SMOTE(sampling_strategy=0.5, random_state=42)
smote = SMOTE(random_state=42, sampling_strategy=0.1)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

Applying SMOTE
- Behaves similarly to a data transformation object in that it must be defined and configured, fit on a dataset, then applied to create a new transformed version of the dataset.
- In the code above, we define a SMOTE instance with `sampling_strategy=0.1` (and a fixed `random_state`), then fit and apply it in one step with `fit_resample` to create a transformed version of our training data. `sampling_strategy=0.1` means we oversample the minority class (outbreak=1) until it has 10 percent as many examples as the majority class (i.e., a 1:10 ratio of outbreaks to non-outbreaks).
- Once transformed, we expect the class distribution of the new dataset to be less imbalanced (reaching that 1:10 ratio rather than full balance) through the creation of many new synthetic examples in the minority (i.e., outbreak=1) class.

In [None]:
# tally the target classes of the resampled training labels to confirm
# what SMOTE produced
resampled_class_counts = pd.Series(data=y_resampled).value_counts()
print("\nClass Distribution after SMOTE:\n", resampled_class_counts, sep=" ")

TOMEK LINKS

In [None]:
# NOTE(review): this whole cell is disabled (every line is commented out).
# Although the section is titled "TOMEK LINKS", the sampler configured below is
# RandomOverSampler; TomekLinks is imported but never instantiated. Enabling the
# cell as-is would overwrite X_resampled / y_resampled produced by SMOTE.
# from collections import Counter
# from imblearn.over_sampling import RandomOverSampler
# from imblearn.under_sampling import TomekLinks

# tl = RandomOverSampler(sampling_strategy=0.2, random_state=42)

# # fit predictor and target variable
# X_resampled, y_resampled = tl.fit_resample(X_train, y_train)

# # check the new class distribution after TOMEK LINKS
# resampled_class_counts = pd.Series(y_resampled).value_counts()
# # print("\nClass Distribution after Tomek Links:\n", y_tl)
# print("Resampled dataset shape %s" % Counter(y_resampled))

## Model Exploration

### Random Forest

In [None]:
# fit a random forest (fixed seed for reproducibility) on the SMOTE-resampled
# training data; the fitted estimator is shown as the cell output
clf_cln = RandomForestClassifier(random_state=42)
clf_cln.fit(X=X_resampled, y=y_resampled)

In [None]:
# score the random forest on the held-out test split
accuracy = clf_cln.score(X=X_test, y=y_test)
print("\nModel Accuracy on Test Set:", accuracy, sep=" ")

In [None]:
# predicted class labels from the random forest for the test split
y_rf_pred = clf_cln.predict(X=X_test)

In [None]:
# confusion matrix for the random forest: rows are true classes,
# columns are predicted classes
cnf_matrix_rf = metrics.confusion_matrix(y_true=y_test, y_pred=y_rf_pred)
cnf_matrix_rf

Interpretation of the confusion matrix above: 
In the first quadrant (true negatives) we have correctly classified 12,766 of the non-outbreak months. The second and third quadrants show 140 misclassified months (127 + 13): false positives (non-outbreak months predicted as outbreaks) and false negatives (outbreak months the model missed). We have correctly classified only 1 outbreak month (fourth quadrant, true positives). Our high accuracy is due to the underlying make-up of the data and its imbalanced nature (i.e., the model classifies non-outbreak months well because they dominate the dataset). So we will want to revisit how we account for this imbalance in the data. 

In [None]:
# ROC AUC is a ranking metric, so it must be computed from a continuous score.
# Use the predicted probability of the positive class (second column of
# predict_proba, i.e. outbreak = 1) instead of the hard 0/1 labels, which would
# reduce the ROC curve to a single operating point.
y_rf_proba = clf_cln.predict_proba(X_test)[:, 1]

print("ROCAUC score:", metrics.roc_auc_score(y_test, y_rf_proba))
# accuracy and F1 are threshold metrics and correctly use the predicted labels
print("Accuracy score:", metrics.accuracy_score(y_test, y_rf_pred))
print("F1 score:", metrics.f1_score(y_test, y_rf_pred))

In [None]:
# importance scores exposed by the already-fitted random forest
feature_importances = clf_cln.feature_importances_

# pair each training column with its score and order the table so the most
# influential features come first
feature_importance_df = pd.DataFrame(
    {"Feature": X_resampled.columns, "Importance": feature_importances}
).sort_values(by="Importance", ascending=False)

# number of top-ranked features to print (adjust as needed)
top_n_features = 10
print(f"Top {top_n_features} Most Influential Features:")
print(feature_importance_df.head(top_n_features))

### Support Vector Machines

In [None]:
# Support vector machines are not scale invariant: features measured on larger
# numeric ranges dominate the kernel distance. Standardise every feature
# (zero mean, unit variance, learned from the training data only) before the
# SVC. The pipeline exposes the same fit / predict / score / decision_function
# API as the bare estimator, so downstream cells are unaffected.
svm_clean = make_pipeline(StandardScaler(), svm.SVC(random_state=42))
svm_clean.fit(X_resampled, y_resampled)

In [None]:
# score the SVM on the held-out test split
accuracy_svm = svm_clean.score(X=X_test, y=y_test)
print("\nModel Accuracy on Test Set:", accuracy_svm, sep=" ")

In [None]:
# predicted class labels from the SVM for the test split
y_svm_pred = svm_clean.predict(X=X_test)

In [None]:
# confusion matrix for the SVM: rows are true classes, columns are predicted classes
cnf_matrix_svm = metrics.confusion_matrix(y_true=y_test, y_pred=y_svm_pred)
cnf_matrix_svm

In [None]:
# ROC AUC needs a continuous ranking score, not hard 0/1 labels. The SVC was
# fitted without probability estimates, so use the signed distance to the
# decision boundary returned by decision_function.
y_svm_score = svm_clean.decision_function(X_test)

print("ROCAUC score:", metrics.roc_auc_score(y_test, y_svm_score))
# accuracy and F1 are threshold metrics and correctly use the predicted labels
print("Accuracy score:", metrics.accuracy_score(y_test, y_svm_pred))
print("F1 score:", metrics.f1_score(y_test, y_svm_pred))

Exploring the same dataset using `svm` we find similar results to our Random Forest results, except that things are worse! We will need to revisit how we handle the imbalanced nature of this dataset. 

### Logistic regression 

Here we will look at the simplest classification (logistic regression) using only the most important feature identified by the Random Forest model - to see if we can explain all outbreak months by precip in the current month alone.

In [None]:
# keep only "precip_0". Selecting the column explicitly (as a one-column
# DataFrame, which is the 2-D shape scikit-learn expects) is safer than listing
# everything to drop: a drop-list silently keeps any feature it does not name.
X_precip = X[["precip_0"]]

In [None]:
# split the precipitation-only dataset into train (70%) and test (30%) splits.
# `stratify=y` keeps the outbreak / non-outbreak proportion the same in both
# splits, which matters because outbreak months are rare.
Xp_train, Xp_test, yp_train, yp_test = train_test_split(
    X_precip, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# re-use the SMOTE sampler configured earlier in the notebook, this time on the
# precipitation-only training split
Xp_resampled, yp_resampled = smote.fit_resample(X=Xp_train, y=yp_train)

In [None]:
# LogisticRegression is imported in the notebook's first (imports) cell, so all
# dependencies are declared in one place.

# create an instance of the model
logreg = LogisticRegression(solver="lbfgs", max_iter=400)

# train the model on the SMOTE-resampled, precipitation-only training data
logreg.fit(Xp_resampled, yp_resampled)

# run prediction on the test split
y_pred = logreg.predict(Xp_test)

In [None]:
# create confusion matrix for the logistic regression predictions
# (`metrics` is already imported in the notebook's first cell, so it is not
# re-imported here)
cnf_matrix = metrics.confusion_matrix(yp_test, y_pred)
cnf_matrix

In [None]:
# Accuracy alone is misleading on a heavily imbalanced target (predicting
# "no outbreak" everywhere already scores high), so report the same ROC AUC and
# F1 metrics used for the random forest and SVM models.
print("Accuracy:", metrics.accuracy_score(yp_test, y_pred))

# ROC AUC is computed from the predicted probability of the positive class
# (second column of predict_proba, i.e. outbreak = 1)
yp_proba = logreg.predict_proba(Xp_test)[:, 1]
print("ROCAUC score:", metrics.roc_auc_score(yp_test, yp_proba))
print("F1 score:", metrics.f1_score(yp_test, y_pred))