# Random Forest Machine Learning Model 


obenob
new data new model 


## Imports 
### Importing libraries

In [46]:
import pprint
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import (
    r2_score, 
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    make_scorer,
)
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    StandardScaler, 
    OrdinalEncoder, 
    OneHotEncoder
)


### Importing dataframe

In [47]:
# Load the application dataset from its parquet file through the pyarrow engine.
df_init = pd.read_parquet(
    "../../../data/df_app.parquet",
    engine="pyarrow",
)

In [48]:
# Work on an independent (deep) copy so that df_init keeps the data as loaded.
df = df_init.copy(deep=True)

In [49]:
# Print the column dtypes, non-null counts and memory usage of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45025 entries, 0 to 67409
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   SID           45025 non-null  object        
 1   ISO_TIME      45025 non-null  datetime64[ns]
 2   SEASON        45025 non-null  object        
 3   BASIN         45025 non-null  object        
 4   NATURE        45025 non-null  object        
 5   LAT           45025 non-null  float64       
 6   LON           45025 non-null  float64       
 7   WIND          45025 non-null  float64       
 8   DIST2LAND     45025 non-null  int64         
 9   STORM_SPEED   45025 non-null  float64       
 10  STORM_DIR     45025 non-null  float64       
 11  TD9636_STAGE  45025 non-null  float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(4)
memory usage: 4.5+ MB


We need to separate the data that will be given to the machine learning model to train and test from the data that will be used in the application so that it is "new data never seen before". 

### Isolating data

We need to drop some rows so that the model is neither trained nor tested on them, and so that they are not inserted into the app's small database from the start. We are dropping:

- a random cyclone of stage 0 (so that the model never trained on any data from that cyclone)
- a random row from a random cyclone of stage 4
- a random row from a random cyclone of stage 5

In [50]:
# Seeded generator: the hold-out selection below is now identical on every
# "Restart & Run All" (the original draws were unseeded, so the held-out rows
# and every downstream result changed from one run to the next).
holdout_rng = np.random.default_rng(42)

# stage 0 -- hold out one entire cyclone
# Candidates are the cyclones having at least one row labelled stage 0.
stage_0_cyclones = df[df['TD9636_STAGE'] == 0]
random_cyclone_id_0 = holdout_rng.choice(stage_0_cyclones['SID'].unique())

# store dropped rows (every row of the drawn cyclone, whatever its stage)
dropped_df = df[df["SID"] == random_cyclone_id_0]
# immediately drop the cyclone from the dataset so that the next randoms don't come from the same cyclone
df = df[df["SID"] != random_cyclone_id_0]

# stage 4 -- hold out a single random row among the remaining stage-4 rows
stage_4_cyclones = df[df['TD9636_STAGE'] == 4]
random_row_4 = stage_4_cyclones.sample(n=1, random_state=holdout_rng).index.item()

# .loc with a one-element list returns the row as a DataFrame, ready for concat
dropped_df = pd.concat([dropped_df, df.loc[[random_row_4]]])
df = df.drop(index=random_row_4)

# stage 5 -- same procedure for stage 5
stage_5_cyclones = df[df['TD9636_STAGE'] == 5]
random_row_5 = stage_5_cyclones.sample(n=1, random_state=holdout_rng).index.item()

dropped_df = pd.concat([dropped_df, df.loc[[random_row_5]]])
df = df.drop(index=random_row_5)


In [51]:
# Count the missing values per column of the held-out rows, smallest count first.
dropped_df.isna().sum().sort_values()

SID             0
ISO_TIME        0
SEASON          0
BASIN           0
NATURE          0
LAT             0
LON             0
WIND            0
DIST2LAND       0
STORM_SPEED     0
STORM_DIR       0
TD9636_STAGE    0
dtype: int64

In [52]:
# Display the held-out rows (the whole stage-0 cyclone plus the single stage-4 / stage-5 rows).
dropped_df

Unnamed: 0,SID,ISO_TIME,SEASON,BASIN,NATURE,LAT,LON,WIND,DIST2LAND,STORM_SPEED,STORM_DIR,TD9636_STAGE
2014,1980133N07149,1980-05-11 12:00:00,Spring,WP,MX,6.9,148.5,15.0,1015,6.0,270.0,0.0
2015,1980133N07149,1980-05-11 15:00:00,Spring,WP,MX,6.9,148.2,15.0,1012,6.0,270.0,0.0
2016,1980133N07149,1980-05-11 18:00:00,Spring,WP,MX,6.9,147.9,15.0,1010,5.0,275.0,0.0
2017,1980133N07149,1980-05-11 21:00:00,Spring,WP,MX,7.0,147.7,18.0,1020,5.0,280.0,0.0
2018,1980133N07149,1980-05-12 00:00:00,Spring,WP,MX,7.0,147.4,20.0,1019,6.0,275.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2095,1980133N07149,1980-05-21 15:00:00,Spring,WP,TS,35.2,142.9,33.0,189,33.0,45.0,2.0
2096,1980133N07149,1980-05-21 18:00:00,Spring,WP,MX,36.4,144.2,30.0,292,26.0,45.0,5.0
2097,1980133N07149,1980-05-21 21:00:00,Spring,WP,MX,37.1,145.3,72.8,359,26.0,45.0,5.0
59294,1988310S08100,1988-11-13 06:00:00,Spring,SI,TS,-8.6,84.7,65.0,1687,9.0,230.0,4.0


In [53]:
# Spot-check: show the rows still present in df for cyclone 1980225N11145.
df.loc[df["SID"] == "1980225N11145"]

Unnamed: 0,SID,ISO_TIME,SEASON,BASIN,NATURE,LAT,LON,WIND,DIST2LAND,STORM_SPEED,STORM_DIR,TD9636_STAGE
3432,1980225N11145,1980-08-14 18:00:00,Summer,WP,MX,14.4,125.1,20.0,94,17.0,295.0,0.0
3433,1980225N11145,1980-08-14 21:00:00,Summer,WP,MX,14.8,124.4,20.0,90,16.0,295.0,0.0
3434,1980225N11145,1980-08-15 00:00:00,Summer,WP,MX,15.2,123.6,20.0,134,14.0,295.0,1.0
3435,1980225N11145,1980-08-15 03:00:00,Summer,WP,MX,15.4,123.0,18.0,123,11.0,295.0,1.0
3436,1980225N11145,1980-08-15 06:00:00,Summer,WP,MX,15.7,122.6,15.0,69,8.0,300.0,6.0
3437,1980225N11145,1980-08-15 09:00:00,Summer,WP,TS,15.8,122.3,28.0,39,9.0,300.0,6.0


In [54]:
# Persist the frame without the held-out rows (SID and ISO_TIME are still
# present at this point) as a parquet file written with the pyarrow engine.
df.to_parquet(
    "../data/dataframe/dataframe.parquet",
    engine="pyarrow",
)

## Model

### Encoding Functions

#### Categorical Columns

1. Seasons

SEASON is encoded using OrdinalEncoder() from Scikit-learn. OrdinalEncoder() assigns a unique integer to each category in a feature, creating an ordinal relationship between the categories, giving more weight to seasons with more frequent and stronger storms (Winter < Spring < Fall < Summer). 

2. Basin & Nature

The NATURE and BASIN columns are encoded with the OneHotEncoder() class from the scikit-learn library. It performs one-hot encoding of categorical variables: it converts categorical data into dummy (indicator) variables, creating a new column for each unique category with binary values (0 or 1) to indicate the presence or absence of that category in each row.  

3. Storm direction

STORM_DIR represents a direction expressed in degrees; hence the variable is transformed by extracting the X and Y components of the direction using NumPy's cosine and sine functions. As this preprocessing is custom, it is defined in a new class, StormDirTransformer(). 

In [55]:
class StormDirTransformer(BaseEstimator, TransformerMixin):
    """Encode a direction given in degrees as its cosine and sine.

    The angle is read from column ``col`` of the input, converted to radians,
    and replaced by two columns: its cosine and its sine.

    Parameters
    ----------
    col : str, default="STORM_DIR"
        Key used to select the direction column, via ``X[col]``.
    """

    def __init__(self, col="STORM_DIR"):
        self.col = col

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a 2-column array: cos(angle) first, sin(angle) second.

        ``X`` must support ``X[self.col]`` (e.g. a DataFrame holding that
        column); the selected values are interpreted as degrees.
        """
        # Convert to radians once and reuse it for both components.
        angle_rad = np.radians(X[self.col])
        return np.c_[np.cos(angle_rad), np.sin(angle_rad)]

    def get_feature_names_out(self, input_features=None):
        """Return the output column names: ``"x"`` (cosine) and ``"y"`` (sine).

        ``input_features`` is accepted for API compatibility and ignored.
        The names are returned as an object-dtype ndarray of strings, the
        return type scikit-learn specifies for ``get_feature_names_out``.
        """
        return np.asarray(["x", "y"], dtype=object)

### Storing weights

In [56]:
def save_model_weights(dataset_name, model, model_name, weights_dir="../data/weights"):
    """Serialize ``model`` with joblib under ``weights_dir``.

    The file is named ``{dataset_name}_model_{model_name}.pkl``.

    Parameters
    ----------
    dataset_name : str
        First part of the file name.
    model : object
        Object handed to ``joblib.dump``.
    model_name : str
        Last part of the file name.
    weights_dir : str or Path, default="../data/weights"
        Destination directory. It is created (with its parents) when missing,
        so a fresh checkout does not fail with ``FileNotFoundError``.
    """
    target_dir = Path(weights_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, target_dir / f"{dataset_name}_model_{model_name}.pkl")

### Pipeline Model

In [57]:
def model_pipeline(
    dataframe: pd.DataFrame,
    classifier,
    preprocessor,
    balance: bool = False,
    cv_folds: int = 5,
    n_repeats: int = 10,
    random_state: int = 42,
    n_jobs=None,
):
    """Cross-validate ``preprocessor`` followed by ``classifier`` on ``dataframe``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain the target column ``TD9636_STAGE``; every other column is
        used as a feature.
    classifier : estimator
        Final step of the pipeline.
    preprocessor : transformer
        First step of the pipeline.
    balance : bool, default=False
        When True, a SMOTE step is inserted between the preprocessor and the
        classifier, inside an imbalanced-learn pipeline.
    cv_folds : int, default=5
        Number of splits per repetition.
    n_repeats : int, default=10
        Number of repetitions of the stratified k-fold. The default matches
        ``RepeatedStratifiedKFold``'s own default, so existing calls still run
        ``cv_folds * 10`` fits.
    random_state : int, default=42
        Seed shared by the cross-validation splitter and SMOTE.
    n_jobs : int or None, default=None
        Forwarded to ``cross_validate``.

    Returns
    -------
    cv_results : dict
        Raw output of ``cross_validate``; the estimator fitted on each split is
        included under ``"estimator"``.
    scores : dict
        Mean test score per metric, rounded to 4 decimals.
    """
    # Define features and target
    target = dataframe["TD9636_STAGE"]
    features = dataframe.drop(columns=["TD9636_STAGE"])

    print("Running through the pipeline with cross-validation")

    # Repeated stratified k-fold: cv_folds splits, repeated n_repeats times.
    cv = RepeatedStratifiedKFold(
        n_splits=cv_folds, n_repeats=n_repeats, random_state=random_state
    )

    # Define scoring metrics
    scoring = {
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(precision_score, average="weighted"),
        "recall": make_scorer(recall_score, average="weighted"),
        "f1": make_scorer(f1_score, average="weighted"),
        # NOTE(review): r2_score is a regression metric; it is applied here to
        # the stage labels as numbers -- confirm this is intended.
        "r2_score": make_scorer(r2_score),
    }

    # Both variants share the same first and last step; only the optional
    # SMOTE step and the pipeline class differ.
    steps = [("preprocessor", preprocessor)]
    if balance:
        print("Using SMOTE for balancing")
        steps.append(("smote", SMOTE(sampling_strategy="auto", random_state=random_state)))
        pipeline_cls = ImbPipeline
    else:
        pipeline_cls = Pipeline
    steps.append(("classifier", classifier))
    pipe = pipeline_cls(steps=steps)

    # Perform cross-validation
    cv_results = cross_validate(
        pipe,
        features,
        target,
        cv=cv,
        scoring=scoring,
        return_estimator=True,
        n_jobs=n_jobs,
    )

    # Calculate mean scores
    scores = {metric: round(cv_results[f"test_{metric}"].mean(), 4) for metric in scoring}

    # Print results
    print("---- CROSS-VALIDATION RESULTS ----")
    pprint.pprint(scores)

    return cv_results, scores


In [58]:
# Column-wise preprocessing. Columns not listed in any transformer are passed
# through unchanged (remainder="passthrough").
preprocessor = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        # SEASON -> integer rank following the given order (Winter < Spring < Fall < Summer)
        (
            "ord",
            OrdinalEncoder(categories=[["Winter", "Spring", "Fall", "Summer"]]),
            ["SEASON"],
        ),
        # STORM_DIR (degrees) -> cosine / sine pair
        (
            "storm_dir_transform",
            StormDirTransformer(),
            ["STORM_DIR"],
        ),
        # Numeric features -> standardized
        (
            "num",
            StandardScaler(),
            ["WIND", "DIST2LAND", "STORM_SPEED"],
        ),
        # Nominal features -> one-hot, first category of each feature dropped
        (
            "cat",
            OneHotEncoder(drop="first", handle_unknown="ignore"),
            ["BASIN", "NATURE"],
        ),
    ],
)

In [59]:
# best model established with gridSearch for balanced (augmented) dataset 
estimator = HistGradientBoostingClassifier(l2_regularization=0.1, learning_rate=0.2,max_bins=128, max_iter=500, random_state=42)

# estimator = KNeighborsClassifier(n_neighbors=2)
# estimator = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=5)

We drop the SID and ISO_TIME columns before training the model.

In [60]:
# Remove the identifier and timestamp columns before the frame is given to the model.
# A new frame is assigned instead of mutating in place, and errors="ignore"
# keeps the cell re-runnable: the original in-place drops raised KeyError on a
# second execution because the columns were already gone.
df = df.drop(columns=["SID", "ISO_TIME"], errors="ignore")

In [61]:
# Cross-validate the chosen estimator behind the preprocessor, with SMOTE balancing enabled.
cv_results, scores = model_pipeline(
    dataframe=df,
    classifier=estimator,
    preprocessor=preprocessor,
    balance=True,
)

Running through the pipeline with cross-validation
Using SMOTE for balancing
---- CROSS-VALIDATION RESULTS ----
{'accuracy': np.float64(0.9268),
 'f1': np.float64(0.9268),
 'precision': np.float64(0.927),
 'r2_score': np.float64(0.8848),
 'recall': np.float64(0.9268)}


In [62]:
# Position of the split with the highest test accuracy (first one in case of ties).
best_index = cv_results["test_accuracy"].argmax()
# Pipeline that was fitted on that split.
best_model = cv_results["estimator"][best_index]

# NOTE(review): this estimator was fitted on a single split's training portion
# and is selected on its own test-fold score -- consider refitting the pipeline
# on the full dataset before shipping it.
save_model_weights("base_augmented", best_model, "histGradientBoost")