In [1]:
import pathlib
import random
import pandas as pd
import numpy as np
import sys

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict

from sklearn.metrics import (
    f1_score, 
    accuracy_score,
    classification_report, 
)

ROOT_DIR = pathlib.Path().absolute()
DATA_DIR = ROOT_DIR / "data"
RANDOM_SEED = 42

## Загрузка и обзор данных

In [2]:
df_trends = pd.read_csv(DATA_DIR / "trends_description.csv")
df = pd.read_csv(DATA_DIR / "train.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1428,6.0,"{PRICE,PROMOTIONS}",Можно немного скинуть минимальную сумму заказа...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Обучение моделей

### Предобработка данных

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1428,6.0,"{PRICE,PROMOTIONS}",Можно немного скинуть минимальную сумму заказа...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
X, y = df[["text"]], df[[f"trend_id_res{i}" for i in range(50)]]
X = X.astype("str").copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = RANDOM_SEED)
print(f"X_train.shape is {X_train.shape}")
print(f"y_train.shape is {y_train.shape}")
print(f"X_test.shape is {X_test.shape}")
print(f"y_test.shape is {y_test.shape}")

X_train.shape is (3698, 1)
y_train.shape is (3698, 50)
X_test.shape is (925, 1)
y_test.shape is (925, 50)


###  Проверка качества на тречнировчном датасете

In [6]:
preprocessor = ColumnTransformer(
    [
        ("vetorizer", TfidfVectorizer(analyzer="char_wb", ngram_range = (1,3)), "text")
    ],                         
    remainder = "passthrough"
)

pipeline_multiout = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("clf", MultiOutputClassifier(LogisticRegression(max_iter = 10_000))),
    ]
)
display(pipeline_multiout)

In [7]:
cross_valid = cross_validate(pipeline_multiout, 
                             X_train, y_train, 
                             cv = 5, scoring = ["accuracy"], n_jobs = -1)
print("test_accuracy:", cross_valid["test_accuracy"].mean())

test_accuracy: 0.23417913177047142


In [8]:
y_pred = cross_val_predict(pipeline_multiout, X_train, y_train, cv = 2)

In [9]:
# Посмотрим на различные метрики
print(classification_report(y_train, y_pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.87      0.39      0.54       661
           1       0.89      0.09      0.16       270
           2       0.80      0.37      0.50       486
           3       0.93      0.21      0.35       289
           4       0.00      0.00      0.00       108
           5       0.00      0.00      0.00        44
           6       0.00      0.00      0.00        16
           7       0.00      0.00      0.00        27
           8       1.00      0.01      0.02       109
           9       0.00      0.00      0.00         9
          10       0.00      0.00      0.00        76
          11       0.00      0.00      0.00        87
          12       0.97      0.41      0.58       491
          13       0.00      0.00      0.00        29
          14       0.00      0.00      0.00        62
          15       0.00      0.00      0.00        66
          16       0.00      0.00      0.00       166
          17       0.00    

In [10]:
# Посмотрим на целевую метрику
accuracy_score(y_train, y_pred)

0.19307733910221742

###  Тренировка окончательной модели

In [11]:
pipeline_multiout.fit(X_train, y_train)

##  Предсказание и загрузка решения

In [12]:
test =  pd.read_csv(DATA_DIR / "test.csv")

In [13]:
pred_test = pipeline_multiout.predict(test[["text"]].astype("str"))

In [14]:
res = pd.DataFrame(np.hstack([test["index"].values.reshape(test.shape[0], 1), pred_test]),
                  columns = ["index"]+[f"trend_id_res{i}" for i in range(50)])

In [15]:
res.head()

Unnamed: 0,index,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,trend_id_res5,trend_id_res6,trend_id_res7,trend_id_res8,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,3135,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4655,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22118,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,23511,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,45,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
res.iloc[:, 1:].sum()

trend_id_res0     740
trend_id_res1     174
trend_id_res2     706
trend_id_res3     222
trend_id_res4       0
trend_id_res5       0
trend_id_res6       0
trend_id_res7       0
trend_id_res8      24
trend_id_res9       0
trend_id_res10      0
trend_id_res11     29
trend_id_res12    742
trend_id_res13      0
trend_id_res14      0
trend_id_res15      5
trend_id_res16     18
trend_id_res17      0
trend_id_res18    136
trend_id_res19    295
trend_id_res20     69
trend_id_res21      7
trend_id_res22      0
trend_id_res23      3
trend_id_res24      0
trend_id_res25      0
trend_id_res26      0
trend_id_res27    405
trend_id_res28    499
trend_id_res29      0
trend_id_res30    208
trend_id_res31      0
trend_id_res32      0
trend_id_res33      0
trend_id_res34      0
trend_id_res35      3
trend_id_res36      5
trend_id_res37      0
trend_id_res38      0
trend_id_res39      0
trend_id_res40      0
trend_id_res41      0
trend_id_res42      0
trend_id_res43      0
trend_id_res44      0
trend_id_r

In [17]:
res["trend_id_res0"].value_counts()

trend_id_res0
0    8275
1     740
Name: count, dtype: int64

In [18]:
res[["index"]+[f"trend_id_res{i}" for i in range(50)]].to_csv(DATA_DIR / "submission.csv", index=False)