# First Machine Learning Model 
## Imports 
### Importing libraries

In [214]:
import pandas as pd 
import numpy as np
import joblib

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, learning_curve, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import NearestNeighbors


from sklearn.base import BaseEstimator
from sklearn.metrics import r2_score, accuracy_score, precision_score, recall_score, fbeta_score, f1_score

import plotly.express as px 
import plotly.graph_objects as go 

### Importing dataframe

In [None]:
df_init = pd.read_parquet("data/datasets/base.parquet", engine="pyarrow")

In [175]:
df = df_init.copy()

In [176]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45025 entries, 0 to 67409
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SEASON        45025 non-null  object 
 1   BASIN         45025 non-null  object 
 2   NATURE        45025 non-null  object 
 3   LAT           45025 non-null  float64
 4   LON           45025 non-null  float64
 5   WIND          45025 non-null  float64
 6   DIST2LAND     45025 non-null  int64  
 7   STORM_SPEED   45025 non-null  float64
 8   STORM_DIR     45025 non-null  float64
 9   TD9636_STAGE  45025 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.8+ MB


## Model

### Encoding

The dataframe has been cleaned and only the relevant columns remain, however we need to process the colones further. 

Categorical => OneHotEncoder or one dimension with different values (1, 2, 3, 4, etc.)
- SEASON (4 classes)
- BASIN (7 classes)
- NATURE (6 classes)

Numeric => everything between 0 and 1
- LAT
- LON
- WIND 
- DIST2LAND
- STORM_SPEED
- STORM_DIR

In this notebook we will create personalized values in the categorical columns if possible to limit the dimensions to add

#### Categorical Columns

1. Seasons

In [177]:
def transorm_seasons_1(row):
    match row["SEASON"]:
        case "Winter":
            return 1
        case "Spring":
            return 0
        case "Summer":
            return -1
        case "Fall":
            return 0

def transorm_seasons_2(row):
    match row["SEASON"]:
        case "Winter":
            return 0
        case "Spring":
            return -1
        case "Summer":
            return 0
        case "Fall":
            return 1    


In [178]:
df["SEASON_1"] = df.apply(transorm_seasons_1, axis=1)
df["SEASON_2"] = df.apply(transorm_seasons_2, axis=1)

In [179]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45025 entries, 0 to 67409
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SEASON        45025 non-null  object 
 1   BASIN         45025 non-null  object 
 2   NATURE        45025 non-null  object 
 3   LAT           45025 non-null  float64
 4   LON           45025 non-null  float64
 5   WIND          45025 non-null  float64
 6   DIST2LAND     45025 non-null  int64  
 7   STORM_SPEED   45025 non-null  float64
 8   STORM_DIR     45025 non-null  float64
 9   TD9636_STAGE  45025 non-null  float64
 10  SEASON_1      45025 non-null  int64  
 11  SEASON_2      45025 non-null  int64  
dtypes: float64(6), int64(3), object(3)
memory usage: 4.5+ MB


In [180]:
df = df.drop(columns=['SEASON'])

2. Basin

In [181]:
df.BASIN.unique()

array(['SP', 'SI', 'WP', 'EP', 'NI'], dtype=object)

The BASIN categories can not be scored wich is why we use pd.get_dummies() to process the this string data

In [182]:
df = pd.get_dummies(df, columns=["BASIN"], drop_first=True)

In [183]:
df.head(3)

Unnamed: 0,NATURE,LAT,LON,WIND,DIST2LAND,STORM_SPEED,STORM_DIR,TD9636_STAGE,SEASON_1,SEASON_2,BASIN_NI,BASIN_SI,BASIN_SP,BASIN_WP
0,TS,-12.5,172.5,25.0,647,6.0,350.0,1.0,-1,0,False,False,True,False
1,TS,-12.2,172.4,25.0,653,6.0,350.0,1.0,-1,0,False,False,True,False
2,TS,-11.9,172.4,25.0,670,5.0,360.0,1.0,-1,0,False,False,True,False


3. Nature

We are sorting the NATURE categoriesthem to reflect a progression from the least informative or severe to the most complex or severe nature of cyclones.

0- NR - Not reported: This class indicates that the nature of the cyclone is not reported, so it can be considered the least informative.

1- DS - Disturbance: This class indicates a minor disturbance, which is typically the least severe form of a cyclone.

2- SS - Subtropical: Subtropical cyclones are more organized than disturbances but less severe than tropical cyclones.

3- TS - Tropical: Tropical cyclones are fully developed and more severe than subtropical cyclones.

4- ET - Extratropical: Extratropical cyclones are typically associated with frontal systems and can be very severe.

5- MX - Mixture: This class indicates contradicting nature reports from different agencies, suggesting a complex or uncertain nature, which can be considered the most severe or complex category.

In [184]:
def transorm_nature(row):
    match row["NATURE"]:
        case "NR":
            return 0
        case "DS":
            return 1
        case "SS":
            return 2
        case "TS":
            return 3
        case "ET":
            return 4
        case "MX":
            return 5

In [185]:
df["NATURE_1"] = df.apply(transorm_nature, axis=1)

In [186]:
df = df.drop(columns=['NATURE'])
df.rename(columns={"NATURE_1": "NATURE"}, inplace=True)

In [187]:
df.head()

Unnamed: 0,LAT,LON,WIND,DIST2LAND,STORM_SPEED,STORM_DIR,TD9636_STAGE,SEASON_1,SEASON_2,BASIN_NI,BASIN_SI,BASIN_SP,BASIN_WP,NATURE
0,-12.5,172.5,25.0,647,6.0,350.0,1.0,-1,0,False,False,True,False,3
1,-12.2,172.4,25.0,653,6.0,350.0,1.0,-1,0,False,False,True,False,3
2,-11.9,172.4,25.0,670,5.0,360.0,1.0,-1,0,False,False,True,False,3
3,-11.7,172.4,25.0,682,4.0,10.0,1.0,-1,0,False,False,True,False,3
4,-11.5,172.5,25.0,703,4.0,20.0,1.0,-1,0,False,False,True,False,3


### Pipeline Model

#### Splitting dataset

First we separate the columns between features and target. 
Our target is TD9636_STAGE, the rest of the columns are features we will give to the model to train. 

In [188]:
target = df["TD9636_STAGE"]

features = df.copy()
features = features.drop(columns=['TD9636_STAGE'])

Next we split the dataset into two sub-sets to obtain a train set and a test set 

In [189]:
feat_train, feat_test, target_train, target_test = train_test_split(
    features, target,
    test_size = 0.2)


#### Standardizing values

Now we can standardize the dataset. Bringing the values on the same scale ensures that each feature contributes equally to the analysis or modeling process.

In [190]:
scaler = StandardScaler()
feat_train = scaler.fit_transform(feat_train)
feat_test = scaler.transform(feat_test)

#### Pipeline model

1. Hyperparameters 

1️⃣ RandomForestClassifier (rfr)

✅ Pertinent Hyperparameters to Tune:

- n_estimators: Number of trees in the forest.
    Range: [10, 50, 100, 200, 500]
- max_depth: Maximum depth of each tree.
    Range: [None, 5, 10, 20, 50]
- min_samples_split: Minimum samples required to split a node.
    Range: [2, 5, 10]
- min_samples_leaf: Minimum samples per leaf.
    Range: [1, 2, 4]
- max_features: Number of features considered for splitting.
    Options: ["sqrt", "log2", None]
- bootstrap: Whether to use bootstrapping.
    Options: [True, False]


🔍 Details

- n_estimators improves stability (but too high slows training).
- max_depth, min_samples_split, and min_samples_leaf control overfitting.
- max_features adjusts feature selection at each split.


2️⃣ HistGradientBoostingClassifier (hgb)

✅ Pertinent Hyperparameters to Tune:

- learning_rate: Step size in boosting iterations.
    Range: [0.01, 0.1, 0.2, 0.5]
- max_iter: Number of boosting iterations (trees).
    Range: [50, 100, 200, 500]
- max_depth: Depth of trees.
    Range: [None, 5, 10, 20]
- min_samples_leaf: Minimum samples per leaf.
    Range: [1, 5, 10]
- l2_regularization: Regularization strength.
    Range: [0, 0.01, 0.1, 1]
- max_bins: Number of bins for numerical feature discretization.
    Range: [64, 128, 256]


🔍 Details

- learning_rate balances step size per iteration.
- max_iter determines number of trees.
- max_bins affects feature discretization for efficiency.
- l2_regularization helps prevent overfitting.


3️⃣ Support Vector Classifier (svc)

✅ Pertinent Hyperparameters to Tune:

- C: Regularization strength.
    Range: [0.01, 0.1, 1, 10, 100]
- kernel: Kernel type.
    Options: ["linear", "poly", "rbf", "sigmoid"]
- gamma: Kernel coefficient for 'rbf', 'poly', and 'sigmoid'.
    Range: ["scale", "auto", 0.001, 0.01, 0.1, 1]
- degree: Degree for poly kernel (ignored otherwise).
    Range: [2, 3, 4, 5]


🔍 Details

- C controls the trade-off between misclassification and margin width.
- kernel defines decision boundary complexity.
- gamma influences how far a single training example influences others.
- degree only affects polynomial kernels.


4️⃣ NearestNeighbors (knn)

✅ Pertinent Hyperparameters to Tune:

- n_neighbors: Number of neighbors to consider.
    Range: [3, 5, 10, 20]
- weights: How neighbor distances are weighted.
    Options: ["uniform", "distance"]
- algorithm: Algorithm used for nearest neighbors search.
    Options: ["auto", "ball_tree", "kd_tree", "brute"]
- leaf_size: Leaf size for ball_tree and kd_tree algorithms.
    Range: [10, 30, 50]
- p: Power parameter for Minkowski distance metric.
    Range: [1 (Manhattan), 2 (Euclidean)]


🔍 Details

- n_neighbors controls bias-variance trade-off.
- weights affects how neighbors contribute to classification.
- algorithm impacts computation time.
- p determines the distance metric.


List Recap

|Model | Hyperparameters |
| -- | -- |
| RandomForestClassifier | n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features, bootstrap |
| HistGradientBoostingClassifier | learning_rate, max_iter, max_depth, min_samples_leaf, l2_regularization, max_bins |
| SVC | C, kernel, gamma, degree |
| NearestNeighbors | n_neighbors, weights, algorithm, leaf_size, p |

2. Pipeline

In [191]:
def big_fit(X_train, y_train, X_test, y_test):

    # storing the list of estimators we want to train
    estimators = [
        ("rfr", RandomForestClassifier(random_state=42)),
        # ("hgb", HistGradientBoostingClassifier(random_state=42)),
        # ("svc", SVC()),
        # ("knn", NearestNeighbors())
    ]
    
    # initializing dict for the models' scores
    default_scores = dict()

    # we need to loop to store the scores for each model 
    # key = model name, value = score
    for estimator in estimators:
        pipe = Pipeline(steps=[estimator])
        pipe.fit(X_train, y_train)
        
        y_pred = pipe.predict(X_test)
        score = r2_score(y_test, y_pred) * 100
        name = estimator[0]
        default_scores[name] = round(score, 2)

    param_grids = [
        {
            "rfr__n_estimators": [1,10,50,100,150,200],
            "rfr__min_samples_leaf": [1, 2, 4],
            "rfr__max_depth": [5, 10, 20, 50],
            "rfr__max_features": ["sqrt", "log2", None],
        },
        # {
        #     "hgb_learning_rate": [1.01, 0.1, 0.2, 0.5],
        #     "hgb_max_iter": [50, 100, 200, 500],
        #     "hgb_max_bins": [64, 128, 256],
        #     "hgb_l2_regularization": [0, 0.01, 0.1, 1]
        # },
        # {
        #     "svc_C": [0.01, 0.1, 1, 10, 100],
        #     "svc_kernel": ["linear", "poly", "rbf", "sigmoid"],
        #     "svc_gamma": ["scale", "auto"],
        #     "svc_degree": [2, 3, 4, 5]
        # },
        # {
        #     "knn_n_neighbors": [3, 5, 10, 20],
        #     "knn_weights": ["uniform", "distance"],
        #     "knn_algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
        #     "knn_p": [1, 2]
        # },
       ]

    # dictionaries to store the different datas
    best_estimators = dict()
    best_params = dict()
    best_scores = dict()
    predict_scores = dict()

    # error rates
    accuracy_err = dict()
    precision_err = dict()
    recall_err = dict()
    fbeta_err = dict()
    f1_err = dict()
    
    # dictionary representing the distribution of errors
    err = dict()

    # scores
    feature_names = dict()
    feature_scores = dict()

    # loop to run the gridSearch for each model of our list
    for estimator, param_grid in zip(estimators, param_grids):
        print(f"estimator: {estimator}, params: {param_grid}")
        pipe_line = Pipeline(steps=[estimator])
        grid = GridSearchCV(
            pipe_line,
            param_grid,
            cv=5, 
            return_train_score=True,
            verbose=1,
        )

        name = estimator[0]
        print(f"Algo : {name}")

        # training phase with the features and target of the "train" set
        grid.fit(X_train, y_train)

        # storing best_estimator, best_params et best_scores in the corresponding dictionaries
        best_estimators[name] = grid.best_estimator_
        best_params[name] = grid.best_params_
        best_scores[name] = round(grid.best_score_ * 100, 2)
        
        # classification (prediction) test with the features and target of the "test" set
        y_pred = grid.predict(X_test)
        score = r2_score(y_test, y_pred) * 100
        predict_scores[name] = round(score, 2)


        # storing mean, median and absolute errors in the corresponding dictionaries
        # by default average is set to binary but in our case we are multi class
        # chose average = "weighted" so that it calculates the metric for each class separately, 
        # then averages them based on class frequency but we might try "micro" or "macro" instead
        accuracy_err[name] = accuracy_score(y_test, y_pred)
        precision_err[name] = precision_score(y_test, y_pred, average="weighted")
        recall_err[name] = recall_score(y_test, y_pred, average="weighted")
        fbeta_err[name] = fbeta_score(y_test, y_pred, beta=1, average="weighted")  # need to specify beta=1
        f1_err[name] = f1_score(y_test, y_pred, average="weighted")
        
        # Store classification errors (misclassified instances)
        err[name] = (y_test != y_pred).astype(int)  # 1 for misclassified, 0 for correct

        feature_names[name] = pipe_line[:-1].get_feature_names_out()

        try:
            print("coef_.flatten")
            feature_scores[name] = pipe_line.named_steps[name].coef_.flatten()
        
        except:
            print("feature_importances_.flatten")
            feature_scores[name] = pipe_line.named_steps[name].feature_importances_.flatten()
    
    return default_scores, best_scores, predict_scores, best_params, best_estimators, accuracy_err, precision_err, recall_err, fbeta_err, f1_err, err, feature_names, feature_scores

#### Running the Pipeline

In [192]:
default_scores, best_scores, predict_scores, best_params, best_estimators, accuracy_err, precision_err, recall_err, fbeta_err, f1_err, err, feature_scores, feature_names = big_fit(feat_train, target_train, feat_test, target_test)  

estimator: ('rfr', RandomForestClassifier(random_state=42)), params: {'rfr__n_estimators': [1, 10, 50, 100, 150, 200], 'rfr__min_samples_leaf': [1, 2, 4], 'rfr__max_depth': [5, 10, 20, 50], 'rfr__max_features': ['sqrt', 'log2', None]}
Algo : rfr
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
coef_.flatten
feature_importances_.flatten


### Evaluating the model

In [193]:
results = pd.DataFrame({
    "Name": [b for b in best_estimators.keys()],
    "Default score": list(default_scores.values()),
    "Grid Cross-val score": list(best_scores.values()),
    "Grid Test score": list(predict_scores.values()),
    "Accuracy": list(accuracy_err.values()),
    "Precision": list(precision_err.values()),
    "Recall": list(recall_err.values()),
    "FBeta": list(fbeta_err.values()),
    "F1": list(f1_err.values()),
})
results.sort_values(by="Grid Test score", ascending=False) 

Unnamed: 0,Name,Default score,Grid Cross-val score,Grid Test score,Accuracy,Precision,Recall,FBeta,F1
0,rfr,88.39,91.86,88.39,0.930261,0.92978,0.930261,0.928702,0.928702


Establishing which seems to be the best model

In [194]:
best_estimator = max(predict_scores, key=predict_scores.get)
best_estimator

'rfr'

In [208]:
# getting the best model's hyperparameters
best_model = best_estimators["rfr"]  
print(best_model.get_params())

{'memory': None, 'steps': [('rfr', RandomForestClassifier(max_depth=50, random_state=42))], 'transform_input': None, 'verbose': False, 'rfr': RandomForestClassifier(max_depth=50, random_state=42), 'rfr__bootstrap': True, 'rfr__ccp_alpha': 0.0, 'rfr__class_weight': None, 'rfr__criterion': 'gini', 'rfr__max_depth': 50, 'rfr__max_features': 'sqrt', 'rfr__max_leaf_nodes': None, 'rfr__max_samples': None, 'rfr__min_impurity_decrease': 0.0, 'rfr__min_samples_leaf': 1, 'rfr__min_samples_split': 2, 'rfr__min_weight_fraction_leaf': 0.0, 'rfr__monotonic_cst': None, 'rfr__n_estimators': 100, 'rfr__n_jobs': None, 'rfr__oob_score': False, 'rfr__random_state': 42, 'rfr__verbose': 0, 'rfr__warm_start': False}


In [195]:
best_estimators[best_estimator] 

#### Learning Curve

In [244]:
def compute_mean_and_standard_deviation(best_estimator, feat_train, target_train):
    # Generate learning curve data
    sizes, training_scores, testing_scores = learning_curve(
        best_estimator, feat_train, target_train, cv=10, train_sizes=np.linspace(0.01, 1.0, 50)
    )

    # Compute mean and standard deviation
    mean_training = np.mean(training_scores, axis=1)
    std_training = np.std(training_scores, axis=1)

    mean_testing = np.mean(testing_scores, axis=1)
    std_testing = np.std(testing_scores, axis=1)

    return sizes, mean_training, std_training, mean_testing, std_testing

In [None]:
sizes, mean_training, std_training, mean_testing, std_testing = compute_mean_and_standard_deviation(
    best_estimators[best_estimator],
    feat_train,
    target_train,
)

In [243]:
def plot_learning_curve(sizes, mean_training, std_training, mean_testing, std_testing, best_estimator):
    # Create figure
    fig = go.Figure()

    # Add training score line
    fig.add_trace(go.Scatter(
        x=sizes, y=mean_training,
        mode='lines', line=dict(color='blue', dash='dash'),
        name='Training Score'
    ))

    # Add shaded region for training score
    fig.add_trace(go.Scatter(
        x=np.concatenate([sizes, sizes[::-1]]),
        y=np.concatenate([mean_training - std_training, (mean_training + std_training)[::-1]]),
        fill='toself', fillcolor='rgba(0, 0, 255, 0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        name='Training Std Dev',
        showlegend=False
    ))

    # Add cross-validation score line
    fig.add_trace(go.Scatter(
        x=sizes, y=mean_testing,
        mode='lines', line=dict(color='green'),
        name='Cross-validation Score'
    ))

    # Add shaded region for testing score
    fig.add_trace(go.Scatter(
        x=np.concatenate([sizes, sizes[::-1]]),
        y=np.concatenate([mean_testing - std_testing, (mean_testing + std_testing)[::-1]]),
        fill='toself', fillcolor='rgba(0, 255, 0, 0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        name='Validation Std Dev',
        showlegend=False
    ))

    # Update layout
    fig.update_layout(
        title=f"Learning Curve for {best_estimator} Model",
        xaxis_title="Training Set Size",
        yaxis_title="Score",
        legend=dict(x=0, y=1),
        template="plotly_white"
    )

    # Show plot
    fig.show()


In [197]:
plot_learning_curve(sizes, mean_training, std_training, mean_testing, std_testing, best_estimator)

#### Error Distribution

In [242]:
def plot_distribution(err_best_estimator, best_estimator):
    fig = go.Figure()

    fig.add_trace(go.Histogram(
        x=err_best_estimator, 
        nbinsx=2,  # Only two possible values (0 and 1)
        marker=dict(color='blue', line=dict(color='black', width=1)),
        opacity=0.75
    ))

    fig.update_layout(
        title=f"Error Distribution of {best_estimator}",
        xaxis_title="Misclassification (0 = Correct, 1 = Incorrect)",
        yaxis_title="Frequency",
        bargap=0.1,
        template="plotly_white"
    )

    fig.show()

In [205]:
plot_distribution(err[best_estimator], best_estimator)

### Storing weights

In [241]:
def save_best_model_weights(dataset_name, best_estimator, best_estimator_name):
    # Save the best model
    joblib.dump(best_estimator, f"data/weights/{dataset_name}_best_model_{best_estimator_name}.pkl")

In [None]:
save_best_model_weights("base", best_estimator=best_estimators[best_estimator],best_estimator_name=best_estimator)

['data/weights/base_best_model_rfr.pkl']

## Augmenting the dataset 

In [None]:
feat_train_copy = feat_train.copy()
target_train_copy = target_train.copy()

In [239]:
def plot_distribution_target(dataframe, target):
    fig = px.histogram(dataframe, x="TD9636_STAGE", color="TD9636_STAGE", opacity=0.75)

    fig.update_layout(
    title_text=f"Histogram of the TD9636_STAGE of the Cyclone", # title of plot
    xaxis_title_text="TD9636_STAGE", # x axis label
    yaxis_title_text="Count", # y axis label
    )

    fig.show()

In [225]:
plot_distribution_target(target_train_copy, "TD9636_STAGE")

Let's see what happens if we try to balance the dataset more by augmenting the data for the stages under-represented.

We will use the data augmentation techniques SMOTE to generate new, meaningful samples.
SMOTE (Synthetic Minority Over-sampling Technique): Generates synthetic samples by interpolating between existing samples.

In [226]:
smote = SMOTE(sampling_strategy="auto", random_state=42, )
feat_resampled, target_resampled = smote.fit_resample(feat_train_copy, target_train_copy)

In [227]:
plot_distribution_target(target_resampled, "TD9636_STAGE")

#### Runnning Pipeline

In [229]:
default_scores_2, best_scores_2, predict_scores_2, best_params_2, best_estimators_2, accuracy_err_2, precision_err_2, recall_err_2, fbeta_err_2, f1_err_2, err_2, feature_scores_2, feature_names_2 = big_fit(feat_resampled, target_resampled, feat_test, target_test)  

estimator: ('rfr', RandomForestClassifier(random_state=42)), params: {'rfr__n_estimators': [1, 10, 50, 100, 150, 200], 'rfr__min_samples_leaf': [1, 2, 4], 'rfr__max_depth': [5, 10, 20, 50], 'rfr__max_features': ['sqrt', 'log2', None]}
Algo : rfr
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
coef_.flatten
feature_importances_.flatten


#### Evaluate model

In [230]:
results = pd.DataFrame({
    "Name": [b for b in best_estimators_2.keys()],
    "Default score": list(default_scores_2.values()),
    "Grid Cross-val score": list(best_scores_2.values()),
    "Grid Test score": list(predict_scores_2.values()),
    "Accuracy": list(accuracy_err_2.values()),
    "Precision": list(precision_err_2.values()),
    "Recall": list(recall_err_2.values()),
    "FBeta": list(fbeta_err_2.values()),
    "F1": list(f1_err_2.values()),
})
results.sort_values(by="Grid Test score", ascending=False) 

Unnamed: 0,Name,Default score,Grid Cross-val score,Grid Test score,Accuracy,Precision,Recall,FBeta,F1
0,rfr,88.06,97.05,88.33,0.930927,0.931472,0.930927,0.931029,0.931029


In [231]:
best_estimator_2 = max(predict_scores, key=predict_scores.get)

In [232]:
# getting the best model's hyperparameters
best_model_2 = best_estimators_2["rfr"]  
print(best_model_2.get_params())

{'memory': None, 'steps': [('rfr', RandomForestClassifier(max_depth=50, n_estimators=200, random_state=42))], 'transform_input': None, 'verbose': False, 'rfr': RandomForestClassifier(max_depth=50, n_estimators=200, random_state=42), 'rfr__bootstrap': True, 'rfr__ccp_alpha': 0.0, 'rfr__class_weight': None, 'rfr__criterion': 'gini', 'rfr__max_depth': 50, 'rfr__max_features': 'sqrt', 'rfr__max_leaf_nodes': None, 'rfr__max_samples': None, 'rfr__min_impurity_decrease': 0.0, 'rfr__min_samples_leaf': 1, 'rfr__min_samples_split': 2, 'rfr__min_weight_fraction_leaf': 0.0, 'rfr__monotonic_cst': None, 'rfr__n_estimators': 200, 'rfr__n_jobs': None, 'rfr__oob_score': False, 'rfr__random_state': 42, 'rfr__verbose': 0, 'rfr__warm_start': False}


In [233]:
best_estimators_2[best_estimator_2] 

In [237]:
sizes_2, mean_training_2, std_training_2, mean_testing_2, std_testing_2 = compute_mean_and_standard_deviation(
    best_estimators[best_estimator],
    feat_resampled,  
    target_resampled,
)

#### Learning curve

In [245]:
plot_learning_curve(sizes_2, mean_training_2, std_training_2, mean_testing_2, std_testing_2, best_estimator_2)

#### Distribution

In [246]:
plot_distribution(err_2[best_estimator_2], best_estimator_2)

### Storing weights

In [247]:
save_best_model_weights(
    dataset_name="base_augmented", 
    best_estimator=best_estimators_2[best_estimator_2],
    best_estimator_name=best_estimator_2
)