# 1 - Setup

In [8]:
#   data science packages
from datetime import datetime
import numpy as np
import pandas as pd
import joblib

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

# 2 - Import Data

In [5]:
# random array for testing
#   drawn from a seeded generator so that a fresh kernel + "Run All"
#   rebuilds exactly the same frame every time
nrows = 100
ncols = 20

rng = np.random.default_rng(seed=3)

df = pd.DataFrame(
    # integers in [0, 100): `high` is exclusive
    data=rng.integers(low=0, high=100, size=(nrows, ncols), dtype='int64'),
    # zero-padded names: col_00 ... col_19
    columns=[f"col_{x:0>2}" for x in range(ncols)]
)

df.head()

Unnamed: 0,col_00,col_01,col_02,col_03,col_04,col_05,col_06,col_07,col_08,col_09,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19
0,48,99,38,58,14,97,43,69,63,34,59,38,60,39,8,16,2,40,83,56
1,11,29,50,46,54,76,62,79,75,83,62,95,77,23,75,67,61,4,7,15
2,22,48,2,58,15,15,85,77,38,96,23,49,31,57,3,28,72,14,9,38
3,57,65,20,49,14,49,0,99,40,76,51,63,74,9,21,18,31,54,27,51
4,55,76,90,54,18,11,39,35,53,90,16,91,77,33,26,43,2,6,18,96


# 3 - Start Model Pipeline

In [None]:
ordinate_columns = [f"col_{x:0>2}" for x in range(10)]
categorical_columns = [f"col_{x:0>2}" for x in range(10, len(df.columns))]

ordinate_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

categorical_pipeline = Pipeline([
    ('one_hot_enc', OneHotEncoder()),
])

prep_pipeline = ColumnTransformer([
    ("ordinate", ordinate_pipeline, ordinate_columns),
    ("categorical", categorical_pipeline, categorical_columns),
])

# 4 - Run Model Pipeline

In [None]:
# train/test split
#   assumes the target feature is named `class`
X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop(['class']), df['class'],
    test_size=0.20, 
    random_state=3
)

In [None]:
# execute the preprocess pipeline
df_preprocessed = prep_pipeline.fit_transform(df)

# 5 - Fit Model

In [None]:
rand_forest_model = RandomForestClassifier(
    # hyperparameters
    random_state=3,  # fixed seed so that re-running the notebook rebuilds the same forest
)

# train on the preprocessed features: predictions are made on
# `prep_pipeline.transform(X_test)`, so the forest has to learn from the same
# representation (scaled + one-hot encoded), not from the raw columns
rand_forest_model.fit(prep_pipeline.transform(X_train), Y_train)

In [12]:
# save a copy
#   the name is timestamped to the minute so successive runs do not overwrite
#   each other, and uses the `.joblib` extension that the load cell expects
now = datetime.now()
filename = f"rf_model_{now.strftime(r'%Y%m%d_%H%M')}.joblib"
model_filename = f"../data/models/{filename}"

# write to the full path inside ../data/models/ (that directory must already
# exist), not to the bare file name in the current working directory
joblib.dump(value=rand_forest_model, filename=model_filename)

# 6 - Evaluate Model

In [None]:
# restore a previously trained forest from disk; this replaces whatever
# `rand_forest_model` currently holds in the kernel
#   NOTE: joblib files are pickle-based, so only load files from a trusted source
filename = "rf_model_20230202_2252.joblib"
filepath = "../data/models/"
rand_forest_model = joblib.load(filepath + filename)

In [None]:
# run the test features through the already-fitted preprocessing pipeline
#   (transform only: nothing is re-fit on the test split)
X_test_preprocessed = prep_pipeline.transform(X=X_test)

# predict a class for every row of the test split
Y_predicted = rand_forest_model.predict(X=X_test_preprocessed)