# 1. Loading

In [10]:
from ast import literal_eval
from typing import Dict

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

STOP_WORDS = "stop_words.txt"
TRAIN_DATA = "reuters_train.csv"
TEST_DATA = "reuters_test.csv"

In [2]:
def calculate_metrics(test_labels: np.ndarray, 
                      pred_labels: np.ndarray, 
                      average='samples') -> None:
    print(
        f"Precision: {precision_score(test_labels, pred_labels, average=average)}, \n", \
        f"Recall: {recall_score(test_labels, pred_labels, average=average)}, \n", \
        f"F1 Measure: {f1_score(test_labels, pred_labels, average=average)}"
    )


In [3]:
with open(STOP_WORDS) as file:
    stop_words = [line.rstrip() for line in file]

In [4]:
df_train = pd.read_csv(TRAIN_DATA)
df_train.labels = df_train.labels.apply(literal_eval)

In [5]:
df_test = pd.read_csv(TEST_DATA)
df_test.labels = df_test.labels.apply(literal_eval)

In [6]:
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(df_train.labels)
test_labels = mlb.transform(df_test.labels)

# 2. Preprocessing

In [17]:
# import re

def preprocess(text: str) -> str:
    text = text.replace("\n", " ")
    # text = re.sub('(?<=\d),(?=\d)', '', text)  # removing comma symbol between numbers
    # text = re.sub(r"[^\w\d,\s]+",'',text)  # cleaning punktuation
    tokens = text.split()  # split string into tokens by WhiteSpace
    tokens = [t for t in tokens if t.isalpha()]  # removing numbers and symbols from string
    tokens = [t for t in tokens if t not in stop_words]  # remove stopwords
    text = " ".join(tokens).lower()
    return text

In [18]:
df_train["content_prep"] = df_train["content"].apply(preprocess)

In [19]:
df_test["content_prep"] = df_test["content"].apply(preprocess)

In [7]:
def get_vectors(column_name: str = "content",
                train_data: pd.DataFrame = df_train, 
                test_data: pd.DataFrame = df_test) -> Dict[str, csr_matrix]:
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    
    vectorised_train = vectorizer.fit_transform(train_data[column_name])
    vectorised_test = vectorizer.transform(test_data[column_name])
    return {"train": vectorised_train, "test": vectorised_test}

In [8]:
vectors = get_vectors()

In [9]:
vectors_prep = get_vectors(column_name="content_prep")

KeyError: 'content_prep'

# 3.Models

## 3.1 LinearSVC

In [39]:
def train_model(model, 
                train_vectors, 
                test_vectors, 
                train_labels=train_labels,
                test_labels=test_labels,
                *args, **kwargs):
    classifier = OneVsRestClassifier(model(*args, **kwargs))
    classifier.fit(train_vectors, train_labels)

    predictions = classifier.predict(test_vectors)
    calculate_metrics(test_labels, pred_labels=predictions)
    return classifier

In [40]:
svc = train_model(LinearSVC, 
                train_vectors=vectors["train"], 
                test_vectors=vectors["test"], 
                random_state=42)

Precision: 0.8799287055000867, 
 Recall: 0.8616793496886243, 
 F1 Measure: 0.862993617291398


  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
svc_prep = train_model(LinearSVC, 
                train_vectors=vectors_prep["train"], 
                test_vectors=vectors_prep["test"], 
                random_state=42)

Precision: 0.876592690736447, 
 Recall: 0.8583610677350325, 
 F1 Measure: 0.8595997990085437


  _warn_prf(average, modifier, msg_start, len(result))


## 3.2 KNeighborsClassifier

In [42]:
knn = train_model(KNeighborsClassifier, 
                train_vectors=vectors["train"], 
                test_vectors=vectors["test"])

Precision: 0.8068455338412278, 
 Recall: 0.7979560785539587, 
 F1 Measure: 0.7935620137839415


  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
knn_prep = train_model(KNeighborsClassifier, 
                train_vectors=vectors_prep["train"], 
                test_vectors=vectors_prep["test"])

Precision: 0.3890250634868058, 
 Recall: 0.38808656287954063, 
 F1 Measure: 0.38729186054459325


  _warn_prf(average, modifier, msg_start, len(result))


Metrics using preprocessing look worse than basic data, will not use preprocessing further

______

## 3.3 Catboost

In [15]:
train_pool = Pool(vectors["train"], train_labels)
test_pool = Pool(vectors["test"], test_labels)

In [12]:
clf = CatBoostClassifier(
    loss_function='MultiLogloss',
    eval_metric='HammingLoss',
    iterations=500,
)
clf.fit(train_pool, eval_set=test_pool)

Learning rate set to 0.071085
0:	learn: 0.0136697	test: 0.0137684	best: 0.0137684 (0)	total: 8.98s	remaining: 1h 14m 39s
1:	learn: 0.0136783	test: 0.0137573	best: 0.0137573 (1)	total: 17.9s	remaining: 1h 14m 9s
2:	learn: 0.0115516	test: 0.0106253	best: 0.0106253 (2)	total: 26.7s	remaining: 1h 13m 47s
3:	learn: 0.0116989	test: 0.0107357	best: 0.0106253 (2)	total: 35.6s	remaining: 1h 13m 31s
4:	learn: 0.0114658	test: 0.0107099	best: 0.0106253 (2)	total: 44.4s	remaining: 1h 13m 19s
5:	learn: 0.0115044	test: 0.0105443	best: 0.0105443 (5)	total: 53.3s	remaining: 1h 13m 10s
6:	learn: 0.0114329	test: 0.0105149	best: 0.0105149 (6)	total: 1m 2s	remaining: 1h 12m 56s
7:	learn: 0.0109095	test: 0.0103493	best: 0.0103493 (7)	total: 1m 10s	remaining: 1h 12m 40s
8:	learn: 0.0107521	test: 0.0103125	best: 0.0103125 (8)	total: 1m 19s	remaining: 1h 12m 27s
9:	learn: 0.0107049	test: 0.0102646	best: 0.0102646 (9)	total: 1m 28s	remaining: 1h 12m 15s
10:	learn: 0.0107679	test: 0.0102757	best: 0.0102646 (9)	t

<catboost.core.CatBoostClassifier at 0x7facf04113d0>

In [None]:
predictions = clf.predict(test_pool)

In [17]:
calculate_metrics(test_labels, pred_labels=predictions)

Recall: 0.846732729748629
Precision: 0.8325783266823347
F1 Measure: 0.8327294875820876


  _warn_prf(average, modifier, msg_start, len(result))


## 3.4 XGBoost

In [10]:
xgb_estimator = xgb.XGBClassifier(objective='binary:logistic')
multilabel_model = MultiOutputClassifier(xgb_estimator)
multilabel_model.fit(vectors["train"], train_labels)
predictions = multilabel_model.predict(vectors["test"])
calculate_metrics(test_labels, pred_labels=predictions)



Precision: 0.881411378728371, 
 Recall: 0.8769964945436955, 
 F1 Measure: 0.8710263168712598


  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
vectors["train"]

<7769x26147 sparse matrix of type '<class 'numpy.float64'>'
	with 459175 stored elements in Compressed Sparse Row format>

In [15]:
vectors["train"]

<7769x26147 sparse matrix of type '<class 'numpy.float64'>'
	with 459175 stored elements in Compressed Sparse Row format>

In [11]:
xgb_estimator

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

# 4. Model saving

## 4.1 Using ONNX

ONNX is a good option since the ONNX model checkpoint can be used with different programming languages

In [18]:
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes  # noqa
from skl2onnx import to_onnx
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost  # noqa
import onnxruntime as rt

In [19]:
update_registered_converter(
    xgb.XGBClassifier, 'XGBoostXGBClassifier',
    calculate_linear_classifier_output_shapes, convert_xgboost,
    options={'nocl': [True, False], 'zipmap': [True, False, 'columns']})

### First method of saving

In [30]:
onx5 = to_onnx(multilabel_model, vectors["train"].todense().astype(np.float32), target_opset=12)

In [31]:
a = vectors["test"].todense()

sess5 = rt.InferenceSession(onx5.SerializeToString())
res5 = sess5.run(None, {'X': a.astype(np.float32)})

In [34]:
calculate_metrics(test_labels, pred_labels=res5[0])

Precision: 0.0635719649960213, 
 Recall: 0.6201468979292761, 
 F1 Measure: 0.11263173402239948


### Second method of saving

In [63]:
model_onnx = convert_sklearn(
    multilabel_model, 'pipeline_xgboost',
    [('input', FloatTensorType([None, 26147]))],
)

with open("pipeline_xgboost.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())
    

In [21]:
a = vectors["test"].todense()

In [64]:
sess = rt.InferenceSession("pipeline_xgboost.onnx")
pred_onx = sess.run(None, {"input": a.astype(np.float32)})
print("predict", pred_onx[0])
print("predict_proba", pred_onx[1][:1])


predict [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]
predict_proba [array([[0.9886203 , 0.01137972],
       [0.915495  , 0.08450502],
       [0.9667655 , 0.03323448],
       ...,
       [0.98589504, 0.01410496],
       [0.9899721 , 0.01002789],
       [0.93870187, 0.06129813]], dtype=float32)]


In [65]:
calculate_metrics(test_labels, pred_labels=pred_onx[0])

Precision: 0.0635719649960213, 
 Recall: 0.6201468979292761, 
 F1 Measure: 0.11263173402239948


I see that the output ONNX checkpoint for our model doesn't work as expected. Will use the classic method of saving via joblib

In [70]:
joblib.dump(multilabel_model, "xgboost")

['xgboost']

Testing it

In [11]:
classifier_test = joblib.load('xgboost')

In [12]:
p = classifier_test.predict(vectors["test"])

In [13]:
calculate_metrics(test_labels, p)

Precision: 0.881411378728371, 
 Recall: 0.8769964945436955, 
 F1 Measure: 0.8710263168712598


  _warn_prf(average, modifier, msg_start, len(result))
