# Module 03

## Session 14 Machine Learning Implementation

# Library

In [1]:
# Basic Operations
import pandas as pd
import numpy as np

# ML Models
from sklearn.linear_model import LogisticRegression

# Feature Engineering
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer

# Evaluation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Model
import pickle
import joblib

# Data

In [6]:
wine_label = pd.read_csv("datasets/wine_train.csv")
wine_label.head()

Unnamed: 0,alcohol,density,fixed acidity level,chlorides level,label
0,8.8,1.001,medium,low,0
1,9.5,0.994,low,low,0
2,10.1,0.9951,high,low,0
3,9.9,0.9956,medium,medium,0
4,9.9,0.9956,medium,medium,0


# Preprocessing

In [8]:
# Preprocessing
poly = PolynomialFeatures(degree=3)
one_hot = OneHotEncoder(drop='first')

transformer = ColumnTransformer(
    [
        ('poly', poly, ['alcohol', 'density']),
        ('one_hot', one_hot, ['fixed acidity level', 'chlorides level'])
    ]
)

# data
X = wine_label[['alcohol', 'density', 'fixed acidity level', 'chlorides level']]
y = wine_label['label']

# model selection / optimization
model = LogisticRegression()
estimator = Pipeline(
    [
        ('preprocess',transformer),
        ('clf',model),
    ]
)

skfold = StratifiedKFold(n_splits=5)

param_grid = {
    'clf__C':[100,10,1,0.1,0.01,0.001],
    'clf__solver':['liblinier', 'newton-cg']
}

grid_search = GridSearchCV(
    estimator,
    param_grid=param_grid,
    cv=skfold,
    scoring='f1',
    n_jobs=-1
)

grid_search.fit(X,y)

Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/boost/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/boost/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/boost/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/boost/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 434, in _check_solver
    raise ValueError("Logistic Regression supports only solvers in %s, got"
ValueError: Logistic Regression supports only solvers in ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'], got liblinier.

Traceback (most recent 

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(transformers=[('poly',
                                                                         PolynomialFeatures(degree=3),
                                                                         ['alcohol',
                                                                          'density']),
                                                                        ('one_hot',
                                                                         OneHotEncoder(drop='first'),
                                                                         ['fixed '
                                                                          'acidity '
                                                                          'level',
                                                                          

# PICKLE

In [10]:
# save model
filename = 'Model.sav'
pickle.dump(grid_search.best_estimator_, open(filename,'wb'))

In [11]:
# load model
filename = 'Model.sav'
loaded_model = pickle.load(open(filename,'rb'))

In [15]:
loaded_model

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('poly',
                                                  PolynomialFeatures(degree=3),
                                                  ['alcohol', 'density']),
                                                 ('one_hot',
                                                  OneHotEncoder(drop='first'),
                                                  ['fixed acidity level',
                                                   'chlorides level'])])),
                ('clf', LogisticRegression(C=10, solver='newton-cg'))])

In [13]:
df_predict = pd.DataFrame(
    {
        'alcohol':[10],
        'density':[0.999],
        'fixed acidity level':['high'],
        'chlorides level':['low']
    }
)

In [14]:
loaded_model.predict(df_predict)

array([0])

In [16]:
loaded_model.predict_proba(df_predict)

array([[0.67577088, 0.32422912]])

# JOBLIB

In [17]:
joblib.dump(grid_search.best_estimator_, "model joblib")

['model joblib']

In [18]:
loaded_model_joblib = joblib.load("model joblib")

In [19]:
loaded_model_joblib

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('poly',
                                                  PolynomialFeatures(degree=3),
                                                  ['alcohol', 'density']),
                                                 ('one_hot',
                                                  OneHotEncoder(drop='first'),
                                                  ['fixed acidity level',
                                                   'chlorides level'])])),
                ('clf', LogisticRegression(C=10, solver='newton-cg'))])

In [20]:
loaded_model_joblib.predict(df_predict)

array([0])

In [21]:
loaded_model_joblib.predict_proba(df_predict)

array([[0.67577088, 0.32422912]])