In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
# Render inline figures in the 'retina' format ('svg' is the alternative the author notes).
%config InlineBackend.figure_formats = ['retina']  # or svg
# Show matplotlib figures inline in the notebook output.
%matplotlib inline

# Default size for every figure: 9 x 6 (matplotlib interprets figsize in inches).
plt.rcParams['figure.figsize'] = (9, 6)
# Seaborn theme: 'notebook' context, 'whitegrid' style, fonts scaled by 1.2.
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [2]:
from sklearn import model_selection
from sklearn import linear_model, svm, naive_bayes, neighbors, ensemble
import pickle

# Preparation

This notebook demonstrates a few useful practices for building model pipelines, and for Python in general:

- Saving Models
- Pickling
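- Referencing Python variables by name using `eval()` (reasonable here only because the names are hard-coded strings we wrote ourselves; never call `eval()` on untrusted input)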

We'll also use the models saved here in the next notebook.

In [3]:
# Load the (arbitrary) training dataset from disk.
df = pd.read_csv('data/dataframe.csv')

# Features are every column except 'label'; 'label' itself is the target.
# random_state pins the shuffle so the same train/test partition comes back on every run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df.drop(columns='label'),
    df['label'],
    random_state=123,
)

In [4]:
# Define models.
# Seed for every estimator that draws random numbers while fitting, so that a
# Restart & Run All produces the same fitted (and pickled) models each time.
# 123 mirrors the random_state used for the train/test split.
SEED = 123

lr_model = linear_model.LogisticRegression()
nb_model = naive_bayes.GaussianNB()
knn_model = neighbors.KNeighborsClassifier()
# With probability=True, SVC shuffles the data for its probability estimates,
# so it needs a seed to be reproducible.
svc_model = svm.SVC(probability=True, gamma="scale", random_state=SEED)
# The tree ensembles and AdaBoost are randomized learners; seed them as well.
rf_model = ensemble.RandomForestClassifier(n_estimators=100, random_state=SEED)
et_model = ensemble.ExtraTreesClassifier(n_estimators=100, random_state=SEED)
ada_model = ensemble.AdaBoostClassifier(random_state=SEED)

# Each string below is exactly the name of one of the model variables defined above,
# so a name can be mapped back to its estimator (and reused as a file name) later on.
models = ["lr_model", "nb_model", "knn_model", "svc_model", "rf_model", "et_model", "ada_model"]

In [5]:
import os

# Make sure the output directory for the pickled models exists.
# exist_ok=True turns this into a single idempotent call: there is no separate
# "does it exist?" check that could race with another process, and re-running
# the cell is a no-op. If 'models' exists but is not a directory, this raises
# FileExistsError here instead of failing later when a pickle file is opened.
os.makedirs('models', exist_ok=True)

In [6]:
# Fit every estimator named in `models` and save each one to models/<name>.pickle.
for model_name in models:
    # eval() turns the name string back into the variable it names.
    # NOTE(review): acceptable only because `models` holds hard-coded names;
    # never eval() text that comes from outside the notebook.
    curr_model = eval(model_name)
    curr_model.fit(X_train, y_train)

    # The f-string builds the file path from the model name; the `with` block
    # closes the file as soon as the dump completes. "wb" = write, binary mode.
    pickle_path = f"models/{model_name}.pickle"
    with open(pickle_path, "wb") as pfile:
        pickle.dump(curr_model, pfile)