In [38]:
import pandas as pd
from catboost import CatBoostClassifier
import sklearn
import joblib
import os
import sklearn.preprocessing as preprocessing
import sklearn.model_selection as model_selection
import sklearn.ensemble as ensemble
import sklearn.metrics as metrics
import sklearn.pipeline as pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [19]:
df = pd.read_csv("dataset2.zip")
df.head()

Unnamed: 0,Product ID,Product Title,Merchant ID,Cluster ID,Cluster Label,Category ID,Category Label
0,1,apple iphone 8 plus 64gb silver,1,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
1,2,apple iphone 8 plus 64 gb spacegrau,2,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
2,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
3,4,apple iphone 8 plus 64gb space grey,4,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
4,5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,5,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones


In [11]:
df.describe()

Unnamed: 0,Product ID,Merchant ID,Cluster ID,Category ID
count,35311.0,35311.0,35311.0,35311.0
mean,26150.800176,120.501883,30110.687633,2618.14293
std,13498.19122,117.045557,18410.265642,3.600708
min,1.0,1.0,1.0,2612.0
25%,14958.5,17.0,6123.0,2615.0
50%,27614.0,75.0,40630.0,2619.0
75%,37508.5,253.0,44059.0,2622.0
max,47358.0,371.0,47525.0,2623.0


In [20]:
df[[" Category ID", " Category Label"]].drop_duplicates().apply(
    lambda row: print(f'({row[" Category ID"]}, "{row[" Category Label"]}"),'),
    axis=1
)

(2612, "Mobile Phones"),
(2614, "TVs"),
(2615, "CPUs"),
(2617, "Digital Cameras"),
(2618, "Microwaves"),
(2619, "Dishwashers"),
(2620, "Washing Machines"),
(2621, "Freezers"),
(2622, "Fridge Freezers"),
(2623, "Fridges"),


0        None
4081     None
7645     None
11507    None
14204    None
16546    None
19970    None
24014    None
26226    None
31727    None
dtype: object

In [22]:
X = df[" Cluster Label"]
y = df[" Category ID"]

X_train, X_test, y_train, y_test = model_selection.train_test_split(
X, y, test_size=0.2, random_state=42
)

pipe = pipeline.Pipeline(
[
(
"encoder",
preprocessing.OneHotEncoder(handle_unknown="ignore"),
),
("model", ensemble.GradientBoostingClassifier()),
]
)

In [24]:
pipe.fit(pd.DataFrame(X_train), y_train)

pipe.score(pd.DataFrame(X_test), y_test)

0.3972816083817075

In [28]:
joblib.dump(pipe,"base.joblib")

['base.joblib']

In [31]:
X = df[" Cluster Label"]
y = df[" Category ID"]

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [36]:
pipe = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

pipe.fit(X_train, y_train)
preditions = pipe.predict(X_test)

print(classification_report(y_test, preditions))

              precision    recall  f1-score   support

        2612       1.00      0.98      0.99       818
        2614       0.96      0.99      0.97       723
        2615       1.00      1.00      1.00       771
        2617       1.00      0.99      1.00       542
        2618       0.91      0.73      0.81       469
        2619       0.87      0.72      0.79       662
        2620       0.99      0.90      0.94       796
        2621       0.99      0.55      0.70       439
        2622       0.64      0.97      0.77      1115
        2623       0.92      0.79      0.85       728

    accuracy                           0.89      7063
   macro avg       0.93      0.86      0.88      7063
weighted avg       0.91      0.89      0.89      7063



In [37]:
joblib.dump(pipe, "tfidf.joblib")

['tfidf.joblib']

In [42]:
X = pd.DataFrame(df[" Cluster Label"])
y = df[" Category ID"]

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)

cat = CatBoostClassifier(
    learning_rate=0.01,
    random_seed=42,
    task_type="GPU",
    text_features=[" Cluster Label"],
    logging_level="Silent"
)

cat.fit(X_train, y_train)
pred = cat.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

        2612       1.00      0.97      0.98       818
        2614       0.90      1.00      0.95       723
        2615       1.00      1.00      1.00       771
        2617       1.00      0.99      0.99       542
        2618       0.87      0.74      0.80       469
        2619       0.57      0.75      0.65       662
        2620       0.94      0.94      0.94       796
        2621       0.71      0.61      0.66       439
        2622       0.74      0.70      0.72      1115
        2623       0.72      0.66      0.69       728

    accuracy                           0.84      7063
   macro avg       0.84      0.84      0.84      7063
weighted avg       0.85      0.84      0.84      7063



In [43]:
cat.save_model("catboost.model")

In [56]:
def catboost_model_predict(data: pd.DataFrame):
    return catboost_model.predict(data)

catboost_model = catboost.CatBoostClassifier().load_model("catboost.model")
catboost_model_predict(["123"])

array([2614], dtype=int64)

In [59]:
dataframe = pd.DataFrame(["apple"])

In [63]:
print(catboost_model_predict(["apple"]))

[2614]
