In [5]:
import imblearn
import pandas as pd

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [6]:
# Read the aggregated zonal-means table (path is relative to this notebook's
# directory) into a single DataFrame that every later cell works from.
full_df = pd.read_csv(
    filepath_or_buffer="../data/zonal-means-aggregate-200910-201912.csv"
)

# Bare last expression: lets the notebook render a truncated preview.
full_df

Unnamed: 0,location_period_id,year,month,lst_3,lst_2,lst_1,lst_0,precip_3,precip_2,precip_1,precip_0,sm_3,sm_2,sm_1,sm_0,outbreak
0,92,2010,1,32.357544,33.726624,34.437530,35.159393,288.192780,61.916030,2.243044,0.277970,75.846150,55.673077,25.692308,11.038462,0
1,92,2010,2,33.726624,34.437530,35.159393,37.299316,61.916030,2.243044,0.277970,0.205065,55.673077,25.692308,11.038462,5.673077,0
2,92,2010,3,34.437530,35.159393,37.299316,37.331665,2.243044,0.277970,0.205065,0.414880,25.692308,11.038462,5.673077,1.826923,0
3,92,2010,4,35.159393,37.299316,37.331665,39.188446,0.277970,0.205065,0.414880,6.336623,11.038462,5.673077,1.826923,1.634615,0
4,92,2010,5,37.299316,37.331665,39.188446,38.275238,0.205065,0.414880,6.336623,106.946400,5.673077,1.826923,1.634615,23.211538,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57634,16697,2019,8,39.427277,41.711395,29.873108,28.330505,146.727780,308.896500,312.504060,310.206630,63.965520,90.206894,89.000000,85.183910,0
57635,16697,2019,9,41.711395,29.873108,28.330505,32.087708,308.896500,312.504060,310.206630,243.551130,90.206894,89.000000,85.183910,68.770120,0
57636,16697,2019,10,29.873108,28.330505,32.087708,28.918060,312.504060,310.206630,243.551130,207.554380,89.000000,85.183910,68.770120,50.137930,0
57637,16697,2019,11,28.330505,32.087708,28.918060,29.032227,310.206630,243.551130,207.554380,92.452390,85.183910,68.770120,50.137930,44.494250,0


In [None]:
# Count the rows per value of the `outbreak` label to see how imbalanced
# the target is before any resampling.
class_counts = full_df.loc[:, "outbreak"].value_counts()
print("Class Distribution:\n", class_counts)

In [None]:
# Separate the predictors (X) from the `outbreak` label (y).
#
# Besides the label itself, `location_period_id` and `year` are left out of X:
# the ToDo at the end of this notebook flags them as columns that should not
# be used as features, and SMOTE builds synthetic rows by interpolating
# between neighbours, which would produce meaningless in-between IDs/years.
NON_FEATURE_COLUMNS = ["outbreak", "location_period_id", "year"]

X = full_df.drop(columns=NON_FEATURE_COLUMNS)
y = full_df["outbreak"]

SMOTE, the oversampling algorithm used below to treat the class imbalance, cannot handle missing values (NaNs) in the feature columns, so the missing data must be imputed first.

In [None]:
# Replace every NaN with the mean of its column so that SMOTE can be applied
# further down.
#
# NOTE: this is a very BASIC imputation approach and is not ideal; a separate
# notebook should walk through imputation of the environmental parameters.
# NOTE(review): the column means are learned from the full dataset, i.e.
# before the train/test split, so test rows contribute to the fill values.
imputer = SimpleImputer(strategy="mean")
imputer.fit(X)
X_imputed = imputer.transform(X)

In [None]:
# Wrap the imputed values back into a DataFrame, restoring both the column
# names and the row index of X. Keeping X's index (instead of letting pandas
# assign a fresh 0..n-1 one) means X_imputed and y stay aligned by label,
# not merely by position.
X_imputed = pd.DataFrame(X_imputed, index=X.index, columns=X.columns)

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on y so the
# train and test sets keep the same outbreak/non-outbreak proportions; with
# an imbalanced label a purely random split can leave either side with a
# distorted share of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Oversample the minority class with SMOTE, on the training split only (the
# test split is left untouched). sampling_strategy=0.1 targets the 1:10
# ratio used by Campbell et al 2020.
smote = SMOTE(random_state=42, sampling_strategy=0.1)
X_resampled, y_resampled = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# Re-count the labels of the oversampled training set to confirm what SMOTE
# did to the class balance.
resampled_class_counts = pd.Series(data=y_resampled).value_counts()
print("\nClass Distribution after SMOTE:\n", resampled_class_counts)

In [None]:
# Fit a random forest on the SMOTE-resampled training data; the fixed
# random_state keeps the fitted model reproducible across runs.
clf = RandomForestClassifier(random_state=42)
clf.fit(X=X_resampled, y=y_resampled)

In [None]:
# Evaluate the fitted model on the held-out test split (never oversampled).
accuracy = clf.score(X_test, y_test)
print("\nModel Accuracy on Test Set:", accuracy)

# Accuracy alone is misleading here: the target is imbalanced (hence the
# SMOTE step), so a model that rarely predicts an outbreak can still score
# highly. Report precision/recall/F1 per class to see how the minority class
# is actually handled. zero_division=0 avoids warnings if a class is never
# predicted.
y_pred = clf.predict(X_test)
print("\nPer-class metrics on Test Set:")
print(classification_report(y_test, y_pred, digits=3, zero_division=0))

In [None]:
# The classifier was already fitted in an earlier cell; this cell only
# inspects it.

# How many of the top-ranked features to show (adjust as needed).
top_n_features = 10

# Importance scores exposed by the fitted RandomForestClassifier.
feature_importances = clf.feature_importances_

# Pair each column name with its score, then rank from most to least
# important in one chained expression.
feature_importance_df = pd.DataFrame(
    {"Feature": X_imputed.columns, "Importance": feature_importances}
).sort_values(by="Importance", ascending=False)

# Report the highest-ranked features.
print(f"Top {top_n_features} Most Influential Features:")
print(feature_importance_df.head(top_n_features))

ToDo: 
* Year and locationID are still included as Features and they shouldn't be, so will want to consider dropping from dataframe
* Will need to determine more appropriate ways to impute missing data
* Re-run all of the above