In [None]:
import imblearn
import pandas as pd

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Load the aggregated zonal-means table; the path is relative to the notebook's
# working directory (file name suggests coverage from 2009-10 to 2019-12 — confirm).
full_df = pd.read_csv("../data/zonal-means-aggregate-200910-201912.csv")
# Bare last expression: the notebook renders the frame as the cell output.
full_df

In [None]:
# Count rows per value of the "outbreak" target to see how imbalanced the classes are.
class_counts = full_df["outbreak"].value_counts()
print("Class Distribution:\n", class_counts)

In [None]:
# Separate the features (X) from the target (y).
# "outbreak" is the label to predict; "location_period_id" and "year" are dropped
# from the features (presumably identifier / time keys rather than predictors — confirm).
X = full_df.drop(columns=["outbreak", "location_period_id", "year"])
y = full_df["outbreak"]

SMOTE, the oversampling algorithm used here to treat class imbalance, cannot handle missing values (NaNs) in the feature columns, so the missing data must be imputed first.

In [None]:
# Trying the simplest imputation strategy first: forward fill, then backward fill.
# ffill() alone cannot fill a NaN that has no earlier valid value in its column
# (e.g. a NaN in the first row), and SMOTE rejects any remaining NaN, so bfill()
# covers those leading gaps. Columns that are entirely NaN are still left unfilled.
# NOTE(review): both fills follow row order and ignore location_period_id, so values
# may be carried between unrelated rows — revisit in the imputation sensitivity analysis.
X_imputed = X.ffill().bfill()

In [None]:
# Rebuild a DataFrame carrying the original feature column labels.
# NOTE(review): the imputation step above already yields a DataFrame with these
# columns, so this looks like a leftover from an imputer that returned a bare
# array — confirm and drop if so.
X_imputed = pd.DataFrame(X_imputed, columns=X.columns)

In [None]:
# split the dataset into train (70%) and test (30%) splits.
# stratify=y keeps the outbreak / no-outbreak proportions the same in both splits;
# with a rare positive class, a purely random split can leave the test set with
# too few (or no) outbreak rows to evaluate on.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Apply SMOTE to the training split only, so the test split keeps its original class mix.
# sampling_strategy=0.1 targets the 1:10 ratio used by Campbell et al 2020
# (presumably minority:majority after resampling — confirm against the imblearn docs).
# random_state is fixed so the synthetic samples are reproducible.
smote = SMOTE(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# check the new class distribution of the training target after SMOTE
# (wrapping in pd.Series makes .value_counts() available whatever type y_resampled is)
resampled_class_counts = pd.Series(y_resampled).value_counts()
print("\nClass Distribution after SMOTE:\n", resampled_class_counts)

In [None]:
# Fit a random forest (default hyperparameters, fixed seed) on the SMOTE-resampled
# training data. Note the resampling above targets a 0.1 ratio, so this set is
# less imbalanced than the raw data but not fully balanced.
clf = RandomForestClassifier(random_state=42).fit(X_resampled, y_resampled)
# fit() returns the estimator itself; show it as the cell output.
clf

In [None]:
# evaluate the model on the held-out test split (which was NOT resampled by SMOTE)
y_pred = clf.predict(X_test)

# Plain accuracy is kept for continuity, but on an imbalanced test set it is
# dominated by the majority class: a model that never predicts an outbreak can
# still score very high. Read it together with the metrics below.
accuracy = clf.score(X_test, y_test)
print("\nModel Accuracy on Test Set:", accuracy)

# Balanced accuracy averages recall over the classes, so the rare class counts
# as much as the common one.
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print("Balanced Accuracy on Test Set:", balanced_accuracy)

# Per-class precision / recall / F1; zero_division=0 avoids warnings if a class
# is never predicted.
print(
    "\nClassification Report on Test Set:\n",
    classification_report(y_test, y_pred, zero_division=0),
)

In [None]:
# The classifier was already fitted in an earlier cell; this cell only inspects it.

# how many of the top-ranked features to show (adjust as needed)
top_n_features = 10

# one importance value per feature column, taken from the fitted RandomForestClassifier
feature_importances = clf.feature_importances_

# pair each feature name with its importance, ranked from most to least important
feature_importance_df = pd.DataFrame(
    {"Feature": X_imputed.columns, "Importance": feature_importances}
).sort_values(by="Importance", ascending=False)

# print the top N most influential features
print(f"Top {top_n_features} Most Influential Features:")
print(feature_importance_df.head(top_n_features))

TO DO: 
* Move model development to another notebook
* Reduce (redundant or correlated) features through PCA or similar
* Run sensitivity analysis on imputation strategies