In [2]:
#import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from joblib import dump
from rich.console import Console
from sklearn.pipeline import make_pipeline, make_union


In [4]:
# Console instance used for the progress log messages in the cells below.
console = Console()

In [5]:
# Load the glass dataset: nine measurement columns (RI, Na, Mg, Al, Si, K, Ca,
# Ba, Fe) followed by the class label 'Type', then preview the first rows.
df = pd.read_csv('glass.csv')
df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [6]:
# Progress message confirming the CSV was read.
console.log("Training data loaded.")

In [7]:
# Check the pairwise correlation between all columns, including the target
# 'Type', to see which features move together and which relate to the label.

df.corr()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
RI,1.0,-0.191885,-0.122274,-0.407326,-0.542052,-0.289833,0.810403,-0.000386,0.14301,-0.164237
Na,-0.191885,1.0,-0.273732,0.156794,-0.069809,-0.266087,-0.275442,0.326603,-0.241346,0.502898
Mg,-0.122274,-0.273732,1.0,-0.481799,-0.165927,0.005396,-0.44375,-0.492262,0.08306,-0.744993
Al,-0.407326,0.156794,-0.481799,1.0,-0.005524,0.325958,-0.259592,0.479404,-0.074402,0.598829
Si,-0.542052,-0.069809,-0.165927,-0.005524,1.0,-0.193331,-0.208732,-0.102151,-0.094201,0.151565
K,-0.289833,-0.266087,0.005396,0.325958,-0.193331,1.0,-0.317836,-0.042618,-0.007719,-0.010054
Ca,0.810403,-0.275442,-0.44375,-0.259592,-0.208732,-0.317836,1.0,-0.112841,0.124968,0.000952
Ba,-0.000386,0.326603,-0.492262,0.479404,-0.102151,-0.042618,-0.112841,1.0,-0.058692,0.575161
Fe,0.14301,-0.241346,0.08306,-0.074402,-0.094201,-0.007719,0.124968,-0.058692,1.0,-0.188278
Type,-0.164237,0.502898,-0.744993,0.598829,0.151565,-0.010054,0.000952,0.575161,-0.188278,1.0


In [8]:
# Separate the features from the dependent variable ('Type', the last column).
X = df.drop('Type', axis=1)
y = df['Type']
# Preview last: a notebook cell only displays its final expression, so a bare
# X.head() placed before the `y = ...` assignment would show nothing.
X.head()

In [9]:
# Scale the features for KNN, as its predictions are based on a distance metric.
scaler=StandardScaler()

In [10]:
# Fit the scaler on the independent variables (every column except 'Type').
# NOTE(review): the fit uses all rows, including the ones that train_test_split
# later holds out as the test set, so test-set statistics influence the
# scaling. Fitting on the training rows only would avoid this leakage.
scaler.fit(df.drop('Type',axis=1))

In [11]:
# NOTE(review): this builds a second StandardScaler that is never assigned, so
# it has no effect on `scaler` or on any later cell. It is presumably a pasted
# copy of the previous cell's output and can be deleted; confirm before removing.
StandardScaler(copy=True, with_mean=True, with_std=True)

In [12]:
# Transform the feature columns with the fitted scaler. The result is a plain
# NumPy array, so the column names are not carried over.
scaled_features = scaler.transform(df.drop('Type',axis=1))
scaled_features

array([[ 0.87286765,  0.28495326,  1.25463857, ..., -0.14576634,
        -0.35287683, -0.5864509 ],
       [-0.24933347,  0.59181718,  0.63616803, ..., -0.79373376,
        -0.35287683, -0.5864509 ],
       [-0.72131806,  0.14993314,  0.60142249, ..., -0.82894938,
        -0.35287683, -0.5864509 ],
       ...,
       [ 0.75404635,  1.16872135, -1.86551055, ..., -0.36410319,
         2.95320036, -0.5864509 ],
       [-0.61239854,  1.19327046, -1.86551055, ..., -0.33593069,
         2.81208731, -0.5864509 ],
       [-0.41436305,  1.00915211, -1.86551055, ..., -0.23732695,
         3.01367739, -0.5864509 ]])

In [13]:
# Wrap the scaled array in a DataFrame again, restoring the feature names
# (all columns of df except the last one, 'Type').
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])
df_feat.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,0.872868,0.284953,1.254639,-0.692442,-1.127082,-0.671705,-0.145766,-0.352877,-0.586451
1,-0.249333,0.591817,0.636168,-0.17046,0.102319,-0.026213,-0.793734,-0.352877,-0.586451
2,-0.721318,0.149933,0.601422,0.190912,0.438787,-0.164533,-0.828949,-0.352877,-0.586451
3,-0.232831,-0.242853,0.69871,-0.310994,-0.052974,0.112107,-0.519052,-0.352877,-0.586451
4,-0.312045,-0.169205,0.650066,-0.411375,0.555256,0.081369,-0.624699,-0.352877,-0.586451


In [14]:
# Drop 'Ca' and 'K' based on the correlation table above: both show almost no
# linear correlation with 'Type' (0.001 and -0.010), and 'Ca' is strongly
# correlated with 'RI' (0.81).
# NOTE(review): presumably these are the reasons meant by "because of
# correlation" - confirm with the author.
dff = df_feat.drop(['Ca','K'],axis=1)

In [15]:
# Train/test split: hold out 30% of the rows for evaluation. The fixed
# random_state keeps the split the same from run to run.
X_train,X_test,y_train,y_test  = train_test_split(dff,df['Type'],test_size=0.3,random_state=45)


In [16]:
# Configure the model: 4 nearest neighbours compared with Manhattan distance.
# Arguments equal to the constructor defaults are left implicit. That includes
# `p`, the power parameter of the Minkowski metric, which does not apply when
# metric='manhattan' is given explicitly.
knn = KNeighborsClassifier(n_neighbors=4, metric='manhattan')

In [17]:
# Fit the classifier on the training split.
knn.fit(X_train,y_train)

In [18]:
# Progress message. Note that the fitted object here is the bare KNN
# classifier, not a scikit-learn Pipeline.
console.log("ML Pipeline fitted.")

In [19]:
# Predict the glass type for the held-out test rows.
y_pred = knn.predict(X_test)

In [20]:
# Share of test rows whose predicted type equals the true type.
accuracy_score(y_test,y_pred)

0.7384615384615385

In [21]:
# Save the pickled object to disk.
# NOTE(review): only the KNN classifier is written. The fitted `scaler` and the
# Ca/K column drop are not part of the file, so whoever loads it must repeat
# that preprocessing before calling predict. The text-classification cell
# further down writes to the same file name and will overwrite this model.
dump(knn, 'pipe.joblib')
console.log("Joblib pickle saved.")

**However, be cautious when using this technique: loading a pickle can execute arbitrary code, so make sure the file has not been tampered with before you load it.**

Check the file's hash, which acts as a unique fingerprint of its contents.

In [22]:
import hashlib

def calc_checksum(path, algorithm="md5", chunk_size=65536):
    """Print and return the hex digest of the file at ``path``.

    Args:
        path: Path of the file to hash.
        algorithm: Name of a ``hashlib`` algorithm. Defaults to "md5" so the
            digests match earlier runs. MD5 is only suitable for spotting
            accidental changes; pass "sha256" when guarding against
            deliberate tampering.
        chunk_size: Number of bytes read per iteration, so large files are
            hashed without being loaded into memory in one piece.

    Returns:
        The hexadecimal digest string. It is also printed, as before, so
        existing calls keep showing the value.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If ``algorithm`` is not known to ``hashlib``.
    """
    file_hash = hashlib.new(algorithm)

    with open(path, "rb") as f:
        # iter() with a sentinel keeps reading until f.read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            file_hash.update(chunk)

    digest = file_hash.hexdigest()
    print(digest)
    # Returning the digest lets callers compare it in code instead of by eye.
    return digest

# Show the checksum of the saved model file. The digest changes whenever the
# file's bytes change, so compare it against a value recorded for this exact
# file. An earlier noted value was 04a415025a812c2a69cb3552d83ee275, which does
# not match the digest displayed for this run.
calc_checksum("pipe.joblib")

3fa8712fa087de61760d68e99c539f91


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_wine

# Load the wine dataset as a feature matrix and a label vector.
# NOTE: this rebinds X and y, which held the glass features/labels until now.
X, y = load_wine(return_X_y=True)

# max_iter is raised to 10,000, presumably so the solver converges on these
# unscaled features - TODO confirm.
clf = LogisticRegression(max_iter=10_000)
clf.fit(X, y)

In [24]:
import h5py

def save_coefficients(classifier, filename):
    """Write a fitted linear model's learned arrays to an HDF5 file.

    The classifier's ``coef_``, ``intercept_`` and ``classes_`` attributes are
    stored as the datasets "coef", "intercept" and "classes" in ``filename``,
    which is opened in 'w' mode.
    """
    with h5py.File(filename, 'w') as h5_file:
        # Each dataset is named after the attribute without its trailing "_".
        for dataset_name in ("coef", "intercept", "classes"):
            values = getattr(classifier, dataset_name + "_")
            h5_file.create_dataset(dataset_name, data=values)

def load_coefficients(classifier, filename):
    """Restore a linear model's learned arrays from an HDF5 file.

    Reads the datasets "coef", "intercept" and "classes" from ``filename`` and
    assigns them to ``classifier.coef_``, ``classifier.intercept_`` and
    ``classifier.classes_``. The classifier is modified in place; nothing is
    returned.
    """
    with h5py.File(filename, 'r') as h5_file:
        # [:] copies each dataset into memory so it stays usable once the
        # file is closed.
        restored = {
            dataset_name: h5_file[dataset_name][:]
            for dataset_name in ("coef", "intercept", "classes")
        }
    for dataset_name, values in restored.items():
        setattr(classifier, dataset_name + "_", values)

Save these coefficients

In [25]:
# Write the fitted wine classifier's arrays to clf.h5.
save_coefficients(clf, "clf.h5")

Load them into a new classifier

In [26]:
# Create an untrained LogisticRegression and attach the saved arrays to it
# instead of fitting it.
lr = LogisticRegression()
load_coefficients(lr, "clf.h5")

In [27]:
# Predict on the wine features with the restored (never fitted) classifier.
lr.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

# ONNX Model and Benchmarking

Create a text classification model with sklearn. Train it and save it.

In [45]:
import pandas as pd
from joblib import dump
from rich.console import Console

from sklearn.pipeline import make_pipeline, make_union
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

console = Console()

# Load the training data: keep only the rows whose 'split' column is 'train'.
# NOTE: df, X and y are rebound here and no longer refer to the earlier datasets.
df = pd.read_csv("clinc_oos-plus.csv").loc[lambda d: d['split'] == 'train']
console.log("Training data loaded.")

# The 'text' column is the model input; the 'label' column is the target.
X = df['text'].to_list()
y = df['label']

# Make a very basic machine learning pipeline:
# token counts from CountVectorizer fed into a logistic regression.
pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression()
)

pipe.fit(X, y)
console.log("ML Pipeline fitted.")

# Save the pickled object to disk.
# NOTE: this writes to 'pipe.joblib' again, replacing the KNN model file that
# was saved (and checksummed) earlier in the notebook.
dump(pipe, 'pipe.joblib')
console.log("Joblib pickle saved.")

Build an ONNX model from the sklearn pipeline

In [49]:
!pip install skl2onnx onnxruntime



In [46]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType


# We will input an array with one column that is of type "string".
# 'text_input' is the name given to the model input; the inference cell reads
# the name back from the session. The None in the shape presumably leaves the
# number of rows unspecified - TODO confirm.
initial_type = [('text_input', StringTensorType([None, 1]))]
onx = convert_sklearn(pipe, initial_types=initial_type)

# This line will save the model to disk.
with open("clinc-logreg.onnx", "wb") as f:
    f.write(onx.SerializeToString())

Run the model and predict the probabilities

In [51]:
import numpy as np
import onnxruntime as rt

# First we must start a session.
sess = rt.InferenceSession("clinc-logreg.onnx")
# The name of the input is saved as part of the .onnx file.
input_name = sess.get_inputs()[0].name

# This code will run the model on our behalf.
# The query is wrapped as [[query]] so it is a 2-D array with one row and one
# column, matching the [None, 1] input shape declared at export time.
query = "Here is the example"
# Passing None as the first argument returns every model output. Two come
# back: the first is discarded, the second holds the probabilities.
_, probas = sess.run(None, {input_name: np.array([[query]])})
# Entry 0 belongs to the single query row: a dict of class index -> probability.
probas[0]

{0: 0.013282986357808113,
 1: 0.010225054807960987,
 2: 0.002407013438642025,
 3: 0.000783559400588274,
 4: 0.01327445637434721,
 5: 0.02809993363916874,
 6: 0.0005540177808143198,
 7: 0.0033848627936095,
 8: 0.0023623525630682707,
 9: 0.0038437580224126577,
 10: 0.08886969089508057,
 11: 0.0003229937865398824,
 12: 0.001362392446026206,
 13: 0.0018954119877889752,
 14: 0.004472618456929922,
 15: 0.00039712944999337196,
 16: 0.0020857974886894226,
 17: 0.0018459229031577706,
 18: 0.0017430955776944757,
 19: 0.007889136672019958,
 20: 0.0009079008596017957,
 21: 0.006852895021438599,
 22: 0.02675601653754711,
 23: 0.0004919724888168275,
 24: 0.01937558688223362,
 25: 0.009277510456740856,
 26: 0.0021638916805386543,
 27: 0.003317032242193818,
 28: 0.0006325587746687233,
 29: 0.002631316427141428,
 30: 0.034313250333070755,
 31: 0.004470761865377426,
 32: 0.002567386021837592,
 33: 0.0020687133073806763,
 34: 0.0024450612254440784,
 35: 0.0040611508302390575,
 36: 0.0027568272780627012,


**Compare the two models by the time they take to predict**

In [48]:
import time
from joblib import load

import onnxruntime as rt
import numpy as np

# Open the exported ONNX model and look up its input/output names.
sess = rt.InferenceSession("clinc-logreg.onnx")
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name

# Reload the scikit-learn pipeline from disk so both models start from a file.
# NOTE: joblib files are pickles; only load files you created yourself.
pipe = load('pipe.joblib')

text = "this is an example sentence"
n = 1000

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() can jump if the system clock is adjusted.
# A different suffix is appended on every iteration so each call sees new text.
t0 = time.perf_counter()
for i in range(n):
    pipe.predict_proba([text + str(i)])
t1 = time.perf_counter()
for i in range(n):
    _, probas = sess.run(None, {input_name: np.array([[text + str(i)]])})
t2 = time.perf_counter()

print(f"SKLEARN: {round(t1 - t0, 3)} s")
print(f"   ONNX: {round(t2 - t1, 3)} s")

SKLEARN: 1.769 s
   ONNX: 0.315 s


The ONNX model is faster than the scikit-learn pipeline: about 0.32 s versus 1.77 s for 1,000 single-sentence predictions in this run.