In [1]:
import imblearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE

from sklearn import metrics, svm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
full_df = pd.read_csv("../data/zonal-means-aggregate-200910-201912.csv")
full_df

Unnamed: 0,location_period_id,year,month,lst_3,lst_2,lst_1,lst_0,precip_3,precip_2,precip_1,precip_0,sm_3,sm_2,sm_1,sm_0,outbreak
0,92,2010,1,32.357544,33.726624,34.437530,35.159393,288.192780,61.916030,2.243044,0.277970,75.846150,55.673077,25.692308,11.038462,0
1,92,2010,2,33.726624,34.437530,35.159393,37.299316,61.916030,2.243044,0.277970,0.205065,55.673077,25.692308,11.038462,5.673077,0
2,92,2010,3,34.437530,35.159393,37.299316,37.331665,2.243044,0.277970,0.205065,0.414880,25.692308,11.038462,5.673077,1.826923,0
3,92,2010,4,35.159393,37.299316,37.331665,39.188446,0.277970,0.205065,0.414880,6.336623,11.038462,5.673077,1.826923,1.634615,0
4,92,2010,5,37.299316,37.331665,39.188446,38.275238,0.205065,0.414880,6.336623,106.946400,5.673077,1.826923,1.634615,23.211538,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57634,16697,2019,8,39.427277,41.711395,29.873108,28.330505,146.727780,308.896500,312.504060,310.206630,63.965520,90.206894,89.000000,85.183910,0
57635,16697,2019,9,41.711395,29.873108,28.330505,32.087708,308.896500,312.504060,310.206630,243.551130,90.206894,89.000000,85.183910,68.770120,0
57636,16697,2019,10,29.873108,28.330505,32.087708,28.918060,312.504060,310.206630,243.551130,207.554380,89.000000,85.183910,68.770120,50.137930,0
57637,16697,2019,11,28.330505,32.087708,28.918060,29.032227,310.206630,243.551130,207.554380,92.452390,85.183910,68.770120,50.137930,44.494250,0


In [3]:
class_counts = full_df["outbreak"].value_counts()
print("Class Distribution:\n", class_counts)

Class Distribution:
 outbreak
0    56982
1      657
Name: count, dtype: int64


The SMOTE algorithm for treating imbalanced datasets cannot deal with missing values (NaNs) for Feature columns, so we need to impute the missing data. Following the methodology used by Campbell et al (2020) we will keep only those districts and months that have data for all of the environmental parameters. 

In [4]:
cleaned_df = full_df.dropna()
cleaned_df

Unnamed: 0,location_period_id,year,month,lst_3,lst_2,lst_1,lst_0,precip_3,precip_2,precip_1,precip_0,sm_3,sm_2,sm_1,sm_0,outbreak
0,92,2010,1,32.357544,33.726624,34.437530,35.159393,288.192780,61.916030,2.243044,0.277970,75.846150,55.673077,25.692308,11.038462,0
1,92,2010,2,33.726624,34.437530,35.159393,37.299316,61.916030,2.243044,0.277970,0.205065,55.673077,25.692308,11.038462,5.673077,0
2,92,2010,3,34.437530,35.159393,37.299316,37.331665,2.243044,0.277970,0.205065,0.414880,25.692308,11.038462,5.673077,1.826923,0
3,92,2010,4,35.159393,37.299316,37.331665,39.188446,0.277970,0.205065,0.414880,6.336623,11.038462,5.673077,1.826923,1.634615,0
4,92,2010,5,37.299316,37.331665,39.188446,38.275238,0.205065,0.414880,6.336623,106.946400,5.673077,1.826923,1.634615,23.211538,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57634,16697,2019,8,39.427277,41.711395,29.873108,28.330505,146.727780,308.896500,312.504060,310.206630,63.965520,90.206894,89.000000,85.183910,0
57635,16697,2019,9,41.711395,29.873108,28.330505,32.087708,308.896500,312.504060,310.206630,243.551130,90.206894,89.000000,85.183910,68.770120,0
57636,16697,2019,10,29.873108,28.330505,32.087708,28.918060,312.504060,310.206630,243.551130,207.554380,89.000000,85.183910,68.770120,50.137930,0
57637,16697,2019,11,28.330505,32.087708,28.918060,29.032227,310.206630,243.551130,207.554380,92.452390,85.183910,68.770120,50.137930,44.494250,0


By removing these rows, we now have fewer outbreak (and non outbreak months) overall.

In [5]:
class_counts = cleaned_df["outbreak"].value_counts()
print("Class Distribution:\n", class_counts)

Class Distribution:
 outbreak
0    42537
1      486
Name: count, dtype: int64


In [6]:
# dropping unecessary columns for analysis and moving "outbreak" to "y"
# variable as it is the feature we are trying to predict
X_cln = cleaned_df.drop(
    columns=["outbreak", "location_period_id", "month", "year"]
)  # all other columns are our feature (predicting) variables

y = cleaned_df["outbreak"]  # our predicted variable

In [7]:
X_cln.describe()

Unnamed: 0,lst_3,lst_2,lst_1,lst_0,precip_3,precip_2,precip_1,precip_0,sm_3,sm_2,sm_1,sm_0
count,43023.0,43023.0,43023.0,43023.0,43023.0,43023.0,43023.0,43023.0,43023.0,43023.0,43023.0,43023.0
mean,37.158442,37.160494,37.159575,37.160048,89.907062,87.845572,85.733842,85.190382,36.929386,36.318348,35.649512,35.147652
std,6.443968,6.442599,6.445109,6.44604,107.000493,106.284706,105.184523,105.293841,27.534466,27.391987,27.124455,27.022193
min,11.579987,15.410004,11.579987,11.579987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.058625,32.057876,32.058259,32.05609,4.349383,3.942172,3.644698,3.586745,11.371056,11.1,10.895644,10.594145
50%,36.64148,36.644806,36.644287,36.648743,49.72372,46.719387,43.773678,41.48891,30.333334,29.16,28.32,27.375
75%,41.786499,41.787582,41.789642,41.789642,150.554715,147.19072,143.985245,143.580735,61.771243,60.94281,59.675439,59.0
max,64.09555,64.09555,64.09555,64.09555,1467.328,1467.328,1467.328,1467.328,98.166664,100.0,100.0,100.0


Now let's see explore the correlation between all of the environmental parameters we are using. By doing so, we might be able to reduce this feature space. 

In [8]:
spearman = X_cln.corr(method="spearman")
spearman.style.background_gradient(cmap="coolwarm")

Unnamed: 0,lst_3,lst_2,lst_1,lst_0,precip_3,precip_2,precip_1,precip_0,sm_3,sm_2,sm_1,sm_0
lst_3,1.0,0.793452,0.591282,0.442114,-0.391617,-0.236491,-0.107647,-0.02724,-0.469575,-0.334129,-0.182311,-0.07058
lst_2,0.793452,1.0,0.793558,0.591121,-0.485174,-0.412426,-0.254334,-0.10848,-0.52006,-0.490782,-0.355652,-0.195888
lst_1,0.591282,0.793558,1.0,0.793705,-0.518305,-0.516883,-0.441624,-0.266048,-0.501949,-0.545863,-0.522056,-0.381048
lst_0,0.442114,0.591121,0.793705,1.0,-0.507761,-0.545869,-0.543544,-0.457527,-0.431307,-0.520275,-0.572227,-0.547365
precip_3,-0.391617,-0.485174,-0.518305,-0.507761,1.0,0.797373,0.47227,0.131877,0.800683,0.799617,0.603987,0.32648
precip_2,-0.236491,-0.412426,-0.516883,-0.545869,0.797373,1.0,0.797653,0.474518,0.542804,0.797827,0.797513,0.603209
precip_1,-0.107647,-0.254334,-0.441624,-0.543544,0.47227,0.797653,1.0,0.800829,0.206956,0.537904,0.794471,0.795186
precip_0,-0.02724,-0.10848,-0.266048,-0.457527,0.131877,0.474518,0.800829,1.0,-0.10166,0.202959,0.536329,0.792556
sm_3,-0.469575,-0.52006,-0.501949,-0.431307,0.800683,0.542804,0.206956,-0.10166,1.0,0.835907,0.522756,0.19389
sm_2,-0.334129,-0.490782,-0.545863,-0.520275,0.799617,0.797827,0.537904,0.202959,0.835907,1.0,0.834835,0.520948


We observe correlation between precipitation and soil moisture values. This makes sense as one (precip) certainly has an impact on the other (soil moisture levels). We will want to consider this in our model development, as we can perhaps reduce the number of features considered. 

In [9]:
(spearman > 0.8)

Unnamed: 0,lst_3,lst_2,lst_1,lst_0,precip_3,precip_2,precip_1,precip_0,sm_3,sm_2,sm_1,sm_0
lst_3,True,False,False,False,False,False,False,False,False,False,False,False
lst_2,False,True,False,False,False,False,False,False,False,False,False,False
lst_1,False,False,True,False,False,False,False,False,False,False,False,False
lst_0,False,False,False,True,False,False,False,False,False,False,False,False
precip_3,False,False,False,False,True,False,False,False,True,False,False,False
precip_2,False,False,False,False,False,True,False,False,False,False,False,False
precip_1,False,False,False,False,False,False,True,True,False,False,False,False
precip_0,False,False,False,False,False,False,True,True,False,False,False,False
sm_3,False,False,False,False,True,False,False,False,True,True,False,False
sm_2,False,False,False,False,False,False,False,False,True,True,True,False


Now we'll drop those variables that have more than 0.8 correlation (i.e., we'll keep only `sm_0`)

In [10]:
X = X_cln.drop(["sm_1", "sm_2", "sm_3", "precip_1"], axis=1)

In [11]:
spearman = X.corr(method="spearman")
spearman.style.background_gradient(cmap="coolwarm")

Unnamed: 0,lst_3,lst_2,lst_1,lst_0,precip_3,precip_2,precip_0,sm_0
lst_3,1.0,0.793452,0.591282,0.442114,-0.391617,-0.236491,-0.02724,-0.07058
lst_2,0.793452,1.0,0.793558,0.591121,-0.485174,-0.412426,-0.10848,-0.195888
lst_1,0.591282,0.793558,1.0,0.793705,-0.518305,-0.516883,-0.266048,-0.381048
lst_0,0.442114,0.591121,0.793705,1.0,-0.507761,-0.545869,-0.457527,-0.547365
precip_3,-0.391617,-0.485174,-0.518305,-0.507761,1.0,0.797373,0.131877,0.32648
precip_2,-0.236491,-0.412426,-0.516883,-0.545869,0.797373,1.0,0.474518,0.603209
precip_0,-0.02724,-0.10848,-0.266048,-0.457527,0.131877,0.474518,1.0,0.792556
sm_0,-0.07058,-0.195888,-0.381048,-0.547365,0.32648,0.603209,0.792556,1.0


In [12]:
# split the dataset into train and test splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Accounting for an imbalanced dataset

Below is a useful reference for different techniques use to solve the imbalance of classes in machine learning datasets: 
https://www.analyticsvidhya.com/blog/2020/07/10-techniques-to-deal-with-class-imbalance-in-machine-learning/

### SMOTE

In [13]:
# apply SMOTE to the training data with a 1:10 ratio as used by Campbell et al 2020
smote = SMOTE(
    sampling_strategy=0.1, random_state=42
)  # worked but still reflected only outbreak = 0 category

# apply SMOTE at 1:2 ratio - accuracy is more reflective of minority category, but not biologically relevant
# smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

Applying SMOTE
- Behaves similarly to a data transformation object in that it must be defined and configured, fit on a dataset, then applied to create a new transformed version of the dataset.
- In our code below, we define a SMOTE instance with default parameters that will balance the minority class and then fit and apply it in one step to create a transformed version of our dataset. The `sampling strategy=0.1` means we will `oversample` the minority class (outbreak=1) to have 10 percent number of examples of the majority class (i.e., maintain a 1:10 ratio of outbreaks to non-outbreaks).
- Once transformed, we will expect to see the class distribution of the new transformed dataset, now to be balanced (while maintaining that 1:10 ratio) through the creation of many new synthetic examples in the minority (i.e., outbreak=1) class.

In [14]:
# check the new class distribution after SMOTE
resampled_class_counts = pd.Series(y_resampled).value_counts()
print("\nClass Distribution after SMOTE:\n", resampled_class_counts)


Class Distribution after SMOTE:
 outbreak
0    29758
1     2975
Name: count, dtype: int64


TOMEK LINKS

In [17]:
# from collections import Counter
# from imblearn.over_sampling import RandomOverSampler
# from imblearn.under_sampling import TomekLinks

# tl = RandomOverSampler(sampling_strategy=0.2, random_state=42)

# # fit predictor and target variable
# X_resampled, y_resampled = tl.fit_resample(X_train, y_train)

# # check the new class distribution after TOMEK LINKS
# resampled_class_counts = pd.Series(y_resampled).value_counts()
# # print("\nClass Distribution after Tomek Links:\n", y_tl)
# print("Resampled dataset shape %s" % Counter(y_resampled))

## Model Exploration

### Random Forest

In [18]:
# train your machine learning model on the balanced dataset
clf_cln = RandomForestClassifier(random_state=42)
clf_cln.fit(X_resampled, y_resampled)

In [19]:
# evaluate your model
accuracy = clf_cln.score(X_test, y_test)
print("\nModel Accuracy on Test Set:", accuracy)


Model Accuracy on Test Set: 0.9891531726969861


In [20]:
# run prediction
y_rf_pred = clf_cln.predict(X_test)

In [21]:
# create confusion matrix
cnf_matrix_rf = metrics.confusion_matrix(y_test, y_rf_pred)
cnf_matrix_rf

array([[12766,    13],
       [  127,     1]])

Interpretation of the confusion matrix above: 
In the first quadrant we have correctly classified 12,766 of the non-outbreak occurrences. The second and third quadrants we see 140 incorrectly classified (127 + 13) events for months where there was an outbreak. And we have correctly classified only 1 outbreak month. Our high accuracy is due to the underlying make-up of the data and it's imbalanced nature (i.e., it is classifying all non-outbreak events well as that is the predominant structure of the dataset). So we will want to revisit how we account for this imbalance in the data. 

In [22]:
print("ROCAUC score:", metrics.roc_auc_score(y_test, y_rf_pred))
print("Accuracy score:", metrics.accuracy_score(y_test, y_rf_pred))
print("F1 score:", metrics.f1_score(y_test, y_rf_pred))

ROCAUC score: 0.5033976030010172
Accuracy score: 0.9891531726969861
F1 score: 0.014084507042253521


In [23]:
# train your machine learning model on the balanced dataset (already done in the previous code)

# get feature importances from the trained RandomForestClassifier
feature_importances = clf_cln.feature_importances_

# create a DataFrame to display feature names and their corresponding importances
feature_importance_df = pd.DataFrame(
    {"Feature": X_resampled.columns, "Importance": feature_importances}
)

# sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(
    by="Importance", ascending=False
)

# print the top N most influential features (adjust N as needed)
top_n_features = 10  # Change this to the number of top features you want to display
print(f"Top {top_n_features} Most Influential Features:")
print(feature_importance_df.head(top_n_features))

Top 10 Most Influential Features:
    Feature  Importance
6  precip_0    0.153032
4  precip_3    0.137397
5  precip_2    0.136416
7      sm_0    0.127414
0     lst_3    0.117283
3     lst_0    0.114006
2     lst_1    0.109799
1     lst_2    0.104652


### Support Vector Machines

In [24]:
svm_clean = svm.SVC(random_state=42)
svm_clean.fit(X_resampled, y_resampled)

In [25]:
accuracy_svm = svm_clean.score(X_test, y_test)
print("\nModel Accuracy on Test Set:", accuracy_svm)


Model Accuracy on Test Set: 0.9900829007515302


In [26]:
# run prediction
y_svm_pred = svm_clean.predict(X_test)

In [27]:
cnf_matrix_svm = metrics.confusion_matrix(y_test, y_svm_pred)
cnf_matrix_svm

array([[12779,     0],
       [  128,     0]])

In [28]:
print("ROCAUC score:", metrics.roc_auc_score(y_test, y_svm_pred))
print("Accuracy score:", metrics.accuracy_score(y_test, y_svm_pred))
print("F1 score:", metrics.f1_score(y_test, y_svm_pred))

ROCAUC score: 0.5
Accuracy score: 0.9900829007515302
F1 score: 0.0


Exploring the same dataset using `svm` we find similar results to our Random Forest results, except that things are worse! We will need to revisit how we handle the imbalanced nature of this dataset. 

### Logisitic regression 

Here we will look at the simplest classification (logistic regression) using only the most important feature identified by the Random Forest model - to see if we can explain all outbreak months by precip in the current month alone.

In [29]:
X_precip = X.drop(
    columns=["lst_3", "lst_2", "lst_1", "lst_0", "precip_3", "precip_2", "sm_0"]
)  # keep only "precip_0"

In [30]:
# split the dataset into train and test splits
Xp_train, Xp_test, yp_train, yp_test = train_test_split(
    X_precip, y, test_size=0.3, random_state=42
)

In [31]:
Xp_resampled, yp_resampled = smote.fit_resample(Xp_train, yp_train)

In [32]:
from sklearn.linear_model import LogisticRegression

# create an instance of the model
logreg = LogisticRegression(solver="lbfgs", max_iter=400)

# train the model
logreg.fit(Xp_resampled, yp_resampled)

# run prediction
y_pred = logreg.predict(Xp_test)

In [33]:
# create confusion matrix
from sklearn import metrics

cnf_matrix = metrics.confusion_matrix(yp_test, y_pred)
cnf_matrix

array([[12754,    25],
       [  127,     1]])

In [35]:
print("Accuracy:", metrics.accuracy_score(yp_test, y_pred))

Accuracy: 0.988223444642442
