In [1]:
import imblearn
import pandas as pd

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
full_df = pd.read_csv("../data/zonal-means-aggregate-200910-201912.csv")
full_df

Unnamed: 0,location_period_id,year,month,lst_3,lst_2,lst_1,lst_0,precip_3,precip_2,precip_1,precip_0,sm_3,sm_2,sm_1,sm_0,outbreak
0,92,2010,1,32.357544,33.726624,34.437530,35.159393,288.192780,61.916030,2.243044,0.277970,75.846150,55.673077,25.692308,11.038462,0
1,92,2010,2,33.726624,34.437530,35.159393,37.299316,61.916030,2.243044,0.277970,0.205065,55.673077,25.692308,11.038462,5.673077,0
2,92,2010,3,34.437530,35.159393,37.299316,37.331665,2.243044,0.277970,0.205065,0.414880,25.692308,11.038462,5.673077,1.826923,0
3,92,2010,4,35.159393,37.299316,37.331665,39.188446,0.277970,0.205065,0.414880,6.336623,11.038462,5.673077,1.826923,1.634615,0
4,92,2010,5,37.299316,37.331665,39.188446,38.275238,0.205065,0.414880,6.336623,106.946400,5.673077,1.826923,1.634615,23.211538,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57634,16697,2019,8,39.427277,41.711395,29.873108,28.330505,146.727780,308.896500,312.504060,310.206630,63.965520,90.206894,89.000000,85.183910,0
57635,16697,2019,9,41.711395,29.873108,28.330505,32.087708,308.896500,312.504060,310.206630,243.551130,90.206894,89.000000,85.183910,68.770120,0
57636,16697,2019,10,29.873108,28.330505,32.087708,28.918060,312.504060,310.206630,243.551130,207.554380,89.000000,85.183910,68.770120,50.137930,0
57637,16697,2019,11,28.330505,32.087708,28.918060,29.032227,310.206630,243.551130,207.554380,92.452390,85.183910,68.770120,50.137930,44.494250,0


In [34]:
class_counts = full_df["outbreak"].value_counts()
print("Class Distribution:\n", class_counts)

Class Distribution:
 outbreak
0    56982
1      657
Name: count, dtype: int64


In [35]:
# rename target columns to focus on oversample of minority class
X = full_df.drop(
    columns=["outbreak", "location_period_id", "year"]
)  # all other columns are our feature (predicting) variables

y = full_df["outbreak"]  # our predicted variable

In [38]:
X.describe()

Unnamed: 0,month,lst_3,lst_2,lst_1,lst_0,precip_3,precip_2,precip_1,precip_0,sm_3,sm_2,sm_1,sm_0
count,57639.0,54327.0,54327.0,54327.0,54327.0,56577.0,56577.0,56577.0,56577.0,45452.0,45452.0,45452.0,45452.0
mean,6.60983,36.376298,36.376235,36.376149,36.376076,97.146921,97.145496,97.146035,97.146328,36.419241,36.418675,36.418429,36.418582
std,3.483775,6.366808,6.366866,6.366933,6.366978,111.64296,111.641036,111.640939,111.640712,27.46128,27.460732,27.46061,27.46057
min,1.0,11.579987,11.579987,11.579987,11.579987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,31.448807,31.448487,31.44841,31.448273,6.422441,6.422441,6.422441,6.42506,11.075874,11.075874,11.075874,11.075874
50%,7.0,35.592438,35.592438,35.592438,35.592438,66.09389,66.09389,66.10337,66.10337,29.361111,29.361111,29.361111,29.37004
75%,10.0,40.831009,40.831009,40.831009,40.831009,156.02246,156.02246,156.02246,156.02246,61.0,61.0,61.0,61.0
max,12.0,64.09555,64.09555,64.09555,64.09555,1541.3778,1541.3778,1541.3778,1541.3778,100.0,100.0,100.0,100.0


The SMOTE algorithm for treating imbalanced datasets cannot deal with missing values (NaNs) for Feature columns, so we need to impute the missing data

### Option A: Retain only those districts with all environmental data present

In [44]:
cleaned_df = full_df.dropna()
cleaned_df

Unnamed: 0,location_period_id,year,month,lst_3,lst_2,lst_1,lst_0,precip_3,precip_2,precip_1,precip_0,sm_3,sm_2,sm_1,sm_0,outbreak
0,92,2010,1,32.357544,33.726624,34.437530,35.159393,288.192780,61.916030,2.243044,0.277970,75.846150,55.673077,25.692308,11.038462,0
1,92,2010,2,33.726624,34.437530,35.159393,37.299316,61.916030,2.243044,0.277970,0.205065,55.673077,25.692308,11.038462,5.673077,0
2,92,2010,3,34.437530,35.159393,37.299316,37.331665,2.243044,0.277970,0.205065,0.414880,25.692308,11.038462,5.673077,1.826923,0
3,92,2010,4,35.159393,37.299316,37.331665,39.188446,0.277970,0.205065,0.414880,6.336623,11.038462,5.673077,1.826923,1.634615,0
4,92,2010,5,37.299316,37.331665,39.188446,38.275238,0.205065,0.414880,6.336623,106.946400,5.673077,1.826923,1.634615,23.211538,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57634,16697,2019,8,39.427277,41.711395,29.873108,28.330505,146.727780,308.896500,312.504060,310.206630,63.965520,90.206894,89.000000,85.183910,0
57635,16697,2019,9,41.711395,29.873108,28.330505,32.087708,308.896500,312.504060,310.206630,243.551130,90.206894,89.000000,85.183910,68.770120,0
57636,16697,2019,10,29.873108,28.330505,32.087708,28.918060,312.504060,310.206630,243.551130,207.554380,89.000000,85.183910,68.770120,50.137930,0
57637,16697,2019,11,28.330505,32.087708,28.918060,29.032227,310.206630,243.551130,207.554380,92.452390,85.183910,68.770120,50.137930,44.494250,0


In [45]:
class_counts = cleaned_df["outbreak"].value_counts()
print("Class Distribution:\n", class_counts)

Class Distribution:
 outbreak
0    42537
1      486
Name: count, dtype: int64


In [48]:
# rename target columns to focus on oversample of minority class
X_cln = cleaned_df.drop(
    columns=["outbreak", "location_period_id", "year"]
)  # all other columns are our feature (predicting) variables

y_cln = cleaned_df["outbreak"]  # our predicted variable

In [57]:
corr = X_cln.corr()
corr.style.background_gradient(cmap="coolwarm")

Unnamed: 0,month,lst_3,lst_2,lst_1,lst_0,precip_3,precip_2,precip_1,precip_0,sm_3,sm_2,sm_1,sm_0
month,1.0,-0.1045,-0.065577,-0.035526,-0.094534,0.300526,0.284343,0.192282,0.099152,0.21638,0.294818,0.279968,0.207214
lst_3,-0.1045,1.0,0.775714,0.564566,0.411166,-0.381419,-0.233602,-0.098464,0.009362,-0.45988,-0.328608,-0.173036,-0.053644
lst_2,-0.065577,0.775714,1.0,0.775853,0.564494,-0.439053,-0.406658,-0.256047,-0.096956,-0.49556,-0.484174,-0.353832,-0.187136
lst_1,-0.035526,0.564566,0.775853,1.0,0.775937,-0.410331,-0.471069,-0.441472,-0.269062,-0.471852,-0.522875,-0.520454,-0.382963
lst_0,-0.094534,0.411166,0.564494,0.775937,1.0,-0.364558,-0.434329,-0.501801,-0.463629,-0.416874,-0.490545,-0.553351,-0.55173
precip_3,0.300526,-0.381419,-0.439053,-0.410331,-0.364558,1.0,0.725201,0.383369,0.092259,0.765529,0.736135,0.506263,0.222829
precip_2,0.284343,-0.233602,-0.406658,-0.471069,-0.434329,0.725201,1.0,0.725504,0.3887,0.528686,0.765707,0.735691,0.503123
precip_1,0.192282,-0.098464,-0.256047,-0.441472,-0.501801,0.383369,0.725504,1.0,0.730622,0.232386,0.526666,0.765881,0.734069
precip_0,0.099152,0.009362,-0.096956,-0.269062,-0.463629,0.092259,0.3887,0.730622,1.0,-0.0299,0.234628,0.533774,0.772524
sm_3,0.21638,-0.45988,-0.49556,-0.471852,-0.416874,0.765529,0.528686,0.232386,-0.0299,1.0,0.824393,0.496532,0.165287


We observe correlation between precipitation and soil moisture values. This makes sense as one (precip) certainly has an impact on the other (soil moisture levels). We will want to consider this in our model development, as we can perhaps reduce the number of features considered. 

In [49]:
# split the dataset into train and test splits
Xcln_train, Xcln_test, ycln_train, ycln_test = train_test_split(
    X_cln, y_cln, test_size=0.3, random_state=42
)

In [50]:
# apply SMOTE to the training data with a 1:10 ratio as used by Campbell et al 2020
smote = SMOTE(sampling_strategy=0.1, random_state=42)
Xcln_resampled, ycln_resampled = smote.fit_resample(Xcln_train, ycln_train)

Applying SMOTE
- Behaves similarly to a data transformation object in that it must be defined and configured, fit on a dataset, then applied to create a new transformed version of the dataset.
- In our code below, we define a SMOTE instance with default parameters that will balance the minority class and then fit and apply it in one step to create a transformed version of our dataset. The `sampling strategy=0.1` means we will `oversample` the minority class (outbreak=1) to have 10 percent number of examples of the majority class (i.e., maintain a 1:10 ratio of outbreaks to non-outbreaks).
- Once transformed, we will expect to see the class distribution of the new transformed dataset, now to be balanced (while maintaining that 1:10 ratio) through the creation of many new synthetic examples in the minority (i.e., outbreak=1) class.

In [51]:
# check the new class distribution after SMOTE
resampled_class_counts = pd.Series(ycln_resampled).value_counts()
print("\nClass Distribution after SMOTE:\n", resampled_class_counts)


Class Distribution after SMOTE:
 outbreak
0    29758
1     2975
Name: count, dtype: int64


In [52]:
# train your machine learning model on the balanced dataset
clf_cln = RandomForestClassifier(random_state=42)
clf_cln.fit(Xcln_resampled, ycln_resampled)

In [53]:
# evaluate your model
accuracy = clf_cln.score(Xcln_test, ycln_test)
print("\nModel Accuracy on Test Set:", accuracy)


Model Accuracy on Test Set: 0.9895405593863795


### Option B: Imputation Method - Forward Fill

In [32]:
# Trying simple forward fill imputation strategy first
Xf_imputed = X.ffill()

In [10]:
# convert the result back to a DataFrame
Xf_imputed = pd.DataFrame(Xf_imputed, columns=X.columns)

In [11]:
# split the dataset into train and test splits
Xf_train, Xf_test, yf_train, yf_test = train_test_split(
    Xf_imputed, y, test_size=0.3, random_state=42
)

In [36]:
# apply SMOTE to the training data with a 1:10 ratio as used by Campbell et al 2020
smote = SMOTE(sampling_strategy=0.1, random_state=42)
Xf_resampled, yf_resampled = smote.fit_resample(Xf_train, yf_train)

In [37]:
# check the new class distribution after SMOTE
resampled_class_counts = pd.Series(yf_resampled).value_counts()
print("\nClass Distribution after SMOTE:\n", resampled_class_counts)


Class Distribution after SMOTE:
 outbreak
0    39893
1     3989
Name: count, dtype: int64


In [17]:
# train your machine learning model on the balanced dataset
clf_f = RandomForestClassifier(random_state=42)
clf_f.fit(Xf_resampled, yf_resampled)

In [18]:
# evaluate your model
accuracy = clf_f.score(Xf_test, yf_test)
print("\nModel Accuracy on Test Set:", accuracy)


Model Accuracy on Test Set: 0.987971316215591


In [19]:
# train your machine learning model on the balanced dataset (already done in the previous code)

# get feature importances from the trained RandomForestClassifier
feature_importances = clf_f.feature_importances_

# create a DataFrame to display feature names and their corresponding importances
feature_importance_df = pd.DataFrame(
    {"Feature": Xf_imputed.columns, "Importance": feature_importances}
)

# sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(
    by="Importance", ascending=False
)

# print the top N most influential features (adjust N as needed)
top_n_features = 10  # Change this to the number of top features you want to display
print(f"Top {top_n_features} Most Influential Features:")
print(feature_importance_df.head(top_n_features))

Top 10 Most Influential Features:
     Feature  Importance
8   precip_0    0.095416
5   precip_3    0.093088
7   precip_1    0.092672
6   precip_2    0.086269
12      sm_0    0.080396
4      lst_0    0.076191
11      sm_1    0.073976
1      lst_3    0.073633
9       sm_3    0.072840
3      lst_1    0.070152


### Imputation Method: Backward Fill 

In [20]:
# Trying backward fill imputation strategy first
Xb_imputed = X.bfill()

In [21]:
# convert the result back to a DataFrame
Xb_imputed = pd.DataFrame(Xb_imputed, columns=X.columns)

In [22]:
# split the dataset into train and test splits
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    Xb_imputed, y, test_size=0.3, random_state=42
)

In [23]:
# apply SMOTE to the training data with a 1:10 ratio as used by Campbell et al 2020
smote = SMOTE(sampling_strategy=0.1, random_state=42)
Xb_resampled, yb_resampled = smote.fit_resample(Xb_train, yb_train)

In [24]:
# check the new class distribution after SMOTE
resampled_class_counts = pd.Series(yb_resampled).value_counts()
print("\nClass Distribution after SMOTE:\n", resampled_class_counts)


Class Distribution after SMOTE:
 outbreak
0    39893
1     3989
Name: count, dtype: int64


In [25]:
# train your machine learning model on the balanced dataset
clf_b = RandomForestClassifier(random_state=42)
clf_b.fit(Xb_resampled, yb_resampled)

In [26]:
# evaluate your model
accuracy = clf_b.score(Xb_test, yb_test)
print("\nModel Accuracy on Test Set:", accuracy)


Model Accuracy on Test Set: 0.9878556557945871


In [27]:
# train your machine learning model on the balanced dataset (already done in the previous code)

# get feature importances from the trained RandomForestClassifier
feature_importances = clf_b.feature_importances_

# create a DataFrame to display feature names and their corresponding importances
feature_importance_df = pd.DataFrame(
    {"Feature": Xf_imputed.columns, "Importance": feature_importances}
)

# sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(
    by="Importance", ascending=False
)

# print the top N most influential features (adjust N as needed)
top_n_features = 10  # Change this to the number of top features you want to display
print(f"Top {top_n_features} Most Influential Features:")
print(feature_importance_df.head(top_n_features))

Top 10 Most Influential Features:
     Feature  Importance
8   precip_0    0.095820
5   precip_3    0.093265
7   precip_1    0.092401
6   precip_2    0.085626
12      sm_0    0.079306
11      sm_1    0.074095
3      lst_1    0.073295
4      lst_0    0.073049
1      lst_3    0.072859
9       sm_3    0.072449


TO DO: 
* Move model development to another notebook
* Reduce (redundant or correlated) features through PCA or similar
* Run sensitivity analysis on imputation strategies