In [None]:
import pickle
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

from sklearn.model_selection import train_test_split

# Classification models
from sklearn.linear_model import LogisticRegression as c_lm
from sklearn.neighbors import KNeighborsClassifier as c_knn
from sklearn.tree import DecisionTreeClassifier as c_dt
from sklearn.ensemble import RandomForestClassifier as c_rf, GradientBoostingClassifier as c_gbf
from sklearn.svm import SVC as c_svm
from sklearn.neural_network import MLPClassifier as c_mlp

# Hyperparam tuning
from sklearn.model_selection import GridSearchCV

# Model evaluation
from ml_pipeline.model_evaluation import evaluate_classification

In [None]:
# Reload the raw data and redo the SAME train/test split used during data prep
target = 'received_on_time'
path = "./data/supply_chain.csv"

# Cleaning steps, in this exact order:
#   1. drop rows with no target value (nothing to learn from them)
#   2. drop exact duplicate rows
data = (
    pd.read_csv(path)
    .dropna(subset=[target])
    .drop_duplicates()
)

# Every column except the target is an independent variable
cols = [name for name in data.columns if name != target]

# Dependent (y) and independent (X) variables
y = data[target]
X = data[cols]

# Split right away, before any transformation touches the data
# ! THE RANDOM SEED MUST MATCH WHAT YOU DID DURING DATA PREP!!!
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Unpickle the data pipeline object; the file is only needed for the load itself
# NOTE: pickle.load can execute arbitrary code - only load pickle files you trust
with open("./pickles/data_pipeline.pickle", 'rb') as pipeline_file:
    pipe = pickle.load(pipeline_file)

# Apply the pipeline to both splits.
# Heads-up: x_tr / x_te are overwritten, so re-running this cell would transform
# already-transformed data - re-run the split cell first.
x_tr = pipe.transform(x_tr)
x_te = pipe.transform(x_te)

# Preview the transformed training features
x_tr.head()

### Modelling (Usually, you'd build several models, do some hyperparameter tuning, etc.)

In [None]:
# Instantiate the gradient boosting classifier
#   Hyperparameters:
#       n_estimators: integer (number of boosting stages)
#       max_depth:    integer (maximum depth of each individual tree)
#       random_state: integer seed; fixed so repeated runs of this notebook
#                     produce the same fitted model (the estimator's tree
#                     building is randomized when no seed is given)
gbf_model = c_gbf(n_estimators = 50, max_depth = 2, random_state = 42)

# Train the model on the transformed training split
gbf_model.fit(x_tr, y_tr)

# Generate predictions for both splits so train and test performance can be compared
gbf_tr = gbf_model.predict(x_tr)
gbf_te = gbf_model.predict(x_te)

### Evaluate

In [None]:
# Score the train and test predictions with the project's evaluation helper
evaluation = evaluate_classification(y_tr, gbf_tr, y_te, gbf_te)

# The helper returns four values; only metrics, traincm and testcm are displayed below
metrics, blindcm, traincm, testcm = evaluation

# Show the metrics as the cell output
metrics

In [None]:
# Training set confusion matrix
# (`traincm` is the third value returned by evaluate_classification above)
traincm.confusion_matrix

In [None]:
# Training set confusion matrix, `_rel` variant
# NOTE(review): presumably the relative/normalized version - confirm in ml_pipeline.model_evaluation
traincm.confusion_matrix_rel

In [None]:
# Test set confusion matrix, `_rel` variant
# NOTE(review): presumably the relative/normalized version - confirm in ml_pipeline.model_evaluation
testcm.confusion_matrix_rel

In [None]:
# Special ability if using random forest or gradient boosted forest:
# the fitted model exposes one importance score per input feature.

# NOTE(review): kept local so this cell runs on its own; consider hoisting
# these two imports into the import cell at the top of the notebook.
import matplotlib.pyplot as plt
import seaborn as sns

# Pair each feature name with its importance, most important first.
# feature_names_in_ is only available when the model was fitted on a
# DataFrame with string column names.
impdf = (
    pd.DataFrame({
        'variable': list(gbf_model.feature_names_in_),
        'importance': list(gbf_model.feature_importances_),
    })
    .sort_values('importance', ascending=False)
)

# Explicit figure/axes so the plot can be labelled and does not depend on
# pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(12, 12))
sns.barplot(data=impdf, x='importance', y='variable', ax=ax)
ax.set(
    title='Gradient boosting feature importances',
    xlabel='importance',
    ylabel='variable',
)

# Render the figure without leaving an Axes repr as the cell output
plt.show()

### Finalize model (train on ENTIRE data set, save the model as a pickle)

In [None]:
# Concatenate full data set (we do this to train the ML model on ALL our data so it has more data to learn from)
# Both frames are already transformed; ignore_index gives X and y the same fresh 0..n-1 index.
X = pd.concat([x_tr, x_te], axis = 0, ignore_index = True)
y = pd.concat([y_tr, y_te], axis = 0, ignore_index = True)

# We use the same exact hyperparameters from tuning.
# They are read back from the evaluated model instead of being typed out a
# second time, so the final model cannot drift from the one that was scored
# above. get_params() returns constructor arguments only, so this builds a
# fresh, unfitted estimator.
gbf_model = c_gbf(**gbf_model.get_params())

# Train the model on the full data set
gbf_model.fit(X, y)

In [None]:
# Serialize gbf_model to disk so it can be loaded without retraining
model_path = "./pickles/supply_chain_classifier.pickle"
with open(model_path, 'wb') as model_file:
    pickle.dump(gbf_model, model_file)