## Import dependencies

In [1]:
from pathlib import Path
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

data_folder = Path(globals()['_dh'][0]) / ".." / "data"
output_folder = Path(globals()['_dh'][0]) / "output"
output_folder.mkdir(exist_ok=True)

## Initial look at the data

It is a labeled list of short text fragments in 17 different languages. These are some examples of the data:

In [2]:
data = pd.read_csv(data_folder / "LanguageDetection.csv", sep=",")
data.sample(5)

Unnamed: 0,Text,Language
5250,"no te olvides de decir, ¿quién es ese tipo afa...",Spanish
9271,افعل ذلك شاكرا لكم مقدما.,Arabic
1618,എന്റെ പ്ലേറ്റിൽ വളരെയധികം കാര്യങ്ങൾ ഞാൻ ജോലിയി...,Malayalam
3657,"Ainsi, en juin 2009, le philosophe français Be...",French
2098,"உதாரணமாக, ஆங்கிலப் பதிப்பில், பதிவுசெய்த பயனர்...",Tamil


The distribution is of languages is relatively equal:

In [3]:
samples = data.groupby(by=["Language"]).count()["Text"].sort_values()

print("Number of samples by language:")
pd.DataFrame({"count": samples, "percent": (samples/sum(samples)).round(3)*100})

Number of samples by language:


Unnamed: 0_level_0,count,percent
Language,Unnamed: 1_level_1,Unnamed: 2_level_1
Hindi,63,0.6
Greek,365,3.5
Kannada,369,3.6
Danish,428,4.1
Tamil,469,4.5
German,470,4.5
Turkish,474,4.6
Arabic,536,5.2
Dutch,546,5.3
Malayalam,594,5.7


## Train/test split

We start our machine learning task by splitting the data into a train and a test set, so that we can later measure the performance:

In [4]:
x = data["Text"].values
y = data["Language"].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## ML pipeline

The next step is to create a scikit-learn pipeline for the preprocessing and training. The two steps in the pipeline are the Tf-idf vectorizer and a Naive Bayes model.

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


clf_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(token_pattern=r"[\w_]{2,}", lowercase=False)),
    ("naive_bayes", MultinomialNB(alpha=.01)),
])
#clf_pipeline = Pipeline([
#    ("wordcount", CountVectorizer()),
#    ("naive_bayes", MultinomialNB(alpha=.01)),
#])
clf_pipeline.fit(x_train, y_train)


Pipeline(steps=[('tfidf',
                 TfidfVectorizer(lowercase=False, token_pattern='[\\w_]{2,}')),
                ('naive_bayes', MultinomialNB(alpha=0.01))])

## Test prediction and sanity check

In [6]:
pred = clf_pipeline.predict(x_test)

print(metrics.classification_report(y_test, pred))

              precision    recall  f1-score   support

      Arabic       1.00      0.98      0.99       106
      Danish       0.99      0.96      0.97        73
       Dutch       0.99      0.98      0.99       111
     English       0.92      1.00      0.96       291
      French       1.00      0.99      0.99       219
      German       1.00      0.98      0.99        93
       Greek       1.00      1.00      1.00        68
       Hindi       1.00      1.00      1.00        10
     Italian       1.00      0.99      0.99       145
     Kannada       1.00      1.00      1.00        66
   Malayalam       1.00      0.98      0.99       121
  Portugeese       0.99      0.97      0.98       144
     Russian       1.00      0.99      0.99       136
     Spanish       0.99      0.97      0.98       160
    Sweedish       0.99      0.98      0.99       133
       Tamil       1.00      0.99      0.99        87
     Turkish       1.00      0.98      0.99       105

    accuracy              

In [7]:
for sample in ["Ein kleiner deutscher Text", "A small text without meaning", "C'è un pò d'italiano", "Une petite histoire de Paris", "Генсек ООН призвал к соблюдению перемирия во время Олимпиады"]:
    print(sample, "=>", clf_pipeline.predict(np.array([sample]))[0], np.max(clf_pipeline.predict_proba(np.array([sample]))[0]))
    clf_pipeline.predict_proba(np.array([sample]))[0]

Ein kleiner deutscher Text => German 0.768441150069108
A small text without meaning => English 0.9986415618842468
C'è un pò d'italiano => Italian 0.7141641875031932
Une petite histoire de Paris => French 0.9968009803867346
Генсек ООН призвал к соблюдению перемирия во время Олимпиады => Russian 0.9912020536752277


## Serialize model

### 1. Pickle the sklearn model

In [8]:
with (output_folder / "classifier.pickle").open("wb") as f:
    pickle.dump(clf_pipeline, f)

### 2. Convert to and store Onnx model

In [9]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType

tfidf_settings = {
    TfidfVectorizer: {
        "tokenexp": r"[\pL\pN_]{2,}"
    }
}
initial_type = [('string_input', StringTensorType([None, 1]))]
onx = convert_sklearn(clf_pipeline, initial_types=initial_type, options=tfidf_settings)
  
with (output_folder / "classifier.onnx").open("wb") as f:
    f.write(onx.SerializeToString())




### Comparing file sizes

In [12]:
!du -ha ./output/*

3.2M	./output/classifier.onnx
11M	./output/classifier.pickle


## Differences in tokenization and normalization:

In [13]:
vectorizer = TfidfVectorizer(token_pattern=r"[İ\w_]{2,}", lowercase=False)
vectorizer.fit(x_train)

TfidfVectorizer(lowercase=False, token_pattern='[İ\\w_]{2,}')

In [14]:
tfidf_settings = {
    TfidfVectorizer: {
        "tokenexp": r"[\pL\pN_]{2,}"
    }
}
initial_type = [('string_input', StringTensorType([None, 1]))]
onx = convert_sklearn(vectorizer, initial_types=initial_type, options=tfidf_settings)

In [15]:
import onnxruntime

with open("data/vectorizer.onnx", "wb") as f:
    f.write(onx.SerializeToString())
session = onnxruntime.InferenceSession("data/vectorizer.onnx")
inputs = {'string_input': x_test[:1]}
pred_onx = session.run(None, {"string_input": np.array(["И с этими словами она села в его карету, и, даже не"]).reshape(1, 1)})

FileNotFoundError: [Errno 2] No such file or directory: 'data/vectorizer.onnx'

In [None]:
for t in x_train:
    pred_sklearn = np.sum(vectorizer.transform(np.array([t])))
    pred_onx = np.sum(session.run(None, {"string_input": np.array([t]).reshape(1, 1)}))
    if abs(pred_onx - pred_sklearn) > 0.01:
        print(t, pred_sklearn, pred_onx)
        print(sorted(list(vectorizer.inverse_transform(vectorizer.transform(np.array([t])))[0])))
        print(sorted(list(vectorizer.inverse_transform(session.run(None, {"string_input": np.array([t]).reshape(1, 1)})[0])[0])))

## Test prediction with ONNX model

In [None]:
import onnxruntime

session = onnxruntime.InferenceSession(str(output_folder / "classifier.onnx"))
pred_onx = session.run(None, {"string_input": np.array(["И с этими словами она села в его карету, и, даже не"]).reshape(1, 1)})
print("predict", pred_onx[0])
print("predict_proba", pred_onx[1])

predict ['Russian']
predict_proba [{'Arabic': 7.943455784698017e-06, 'Danish': 1.0405490684206598e-05, 'Dutch': 8.303594768221956e-06, 'English': 2.679041699593654e-06, 'French': 3.670612613859703e-06, 'German': 1.1265211469435599e-05, 'Greek': 1.0832965926965699e-05, 'Hindi': 1.1893760529346764e-05, 'Italian': 5.484678695211187e-06, 'Kannada': 1.8420292690279894e-05, 'Malayalam': 1.082171183952596e-05, 'Portugeese': 5.10070185555378e-06, 'Russian': 0.9998569488525391, 'Spanish': 5.064754532213556e-06, 'Sweedish': 6.081358151277527e-06, 'Tamil': 1.3298573321662843e-05, 'Turkish': 1.1620059012784623e-05}]


In [None]:
for sample in ["Ein kleiner deutscher Text", "A small text without meaning", "C'è un pò d'italiano", "Une petite histoire de Paris", "Генсек ООН призвал к соблюдению перемирия во время Олимпиады"]:
    pred_onx = session.run(None, {"string_input": np.array([sample]).reshape(1, 1)})
    print(sample, "=>", pred_onx[0], pred_onx[1][0][pred_onx[0][0]])
    #print("predict_proba", pred_onx[1])

Ein kleiner deutscher Text => ['German'] 0.7684409618377686
A small text without meaning => ['English'] 0.9986410140991211
C'è un pò d'italiano => ['Italian'] 0.7141642570495605
Une petite histoire de Paris => ['French'] 0.9968007802963257
Генсек ООН призвал к соблюдению перемирия во время Олимпиады => ['Russian'] 0.991202175617218


In [None]:
pred_onnx = session.run(None, {"string_input": np.array([s.encode("utf-8") for s in x_test]).reshape(len(x_test), 1)})
print(metrics.classification_report(y_test, pred_onnx[0]))


              precision    recall  f1-score   support

      Arabic       1.00      0.98      0.99       106
      Danish       0.99      0.96      0.97        73
       Dutch       0.99      0.98      0.99       111
     English       0.92      1.00      0.96       291
      French       1.00      0.99      0.99       219
      German       1.00      0.98      0.99        93
       Greek       1.00      1.00      1.00        68
       Hindi       1.00      1.00      1.00        10
     Italian       1.00      0.99      0.99       145
     Kannada       1.00      1.00      1.00        66
   Malayalam       1.00      0.98      0.99       121
  Portugeese       0.99      0.97      0.98       144
     Russian       1.00      0.99      0.99       136
     Spanish       0.99      0.97      0.98       160
    Sweedish       0.99      0.98      0.99       133
       Tamil       1.00      0.99      0.99        87
     Turkish       1.00      0.98      0.99       105

    accuracy              

## Benchmarking

In [None]:
def sample():
    sample_size = 1
    for i in range(0, len(x_test)-sample_size, sample_size):
        yield x_test[i:i+sample_size]

def benchmark_onnx():
    for t in sample():
        pred_onx = session.run(None, {"string_input": np.array([t]).reshape(len(t), 1)})

def benchmark_sklearn():
    for t in sample():
        pred = clf_pipeline.predict(np.array(t))

In [None]:
%timeit benchmark_sklearn()

9.04 s ± 920 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%timeit benchmark_onnx()

2.84 s ± 99.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
