In [3]:
import pathlib
import random
import pandas as pd
import numpy as np
import sys

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict

from sklearn.metrics import (
    f1_score, 
    accuracy_score,
    classification_report, 
)

ROOT_DIR = pathlib.Path().absolute().parent
DATA_DIR = ROOT_DIR / "data" / "cleared_df_final"
RANDOM_SEED = 42

## Загрузка и обзор данных

In [5]:
df_trends = pd.read_csv("trends_description.csv")
df = pd.read_csv("train.csv")

In [6]:
df.head()

Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,trend_id_res5,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,15237,5.0,"{ASSORTMENT,PRODUCTS_QUALITY,PROMOTIONS,CATALO...",Доставка просто 👍,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Обучение моделей

### Предобработка данных

In [7]:
df.head()

Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,trend_id_res5,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,15237,5.0,"{ASSORTMENT,PRODUCTS_QUALITY,PROMOTIONS,CATALO...",Доставка просто 👍,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
X, y = df[["text"]], df[[f"trend_id_res{i}" for i in range(50)]]
X = X.astype("str").copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = RANDOM_SEED)
print(f"X_train.shape is {X_train.shape}")
print(f"y_train.shape is {y_train.shape}")
print(f"X_test.shape is {X_test.shape}")
print(f"y_test.shape is {y_test.shape}")

X_train.shape is (6966, 1)
y_train.shape is (6966, 50)
X_test.shape is (1742, 1)
y_test.shape is (1742, 50)


###  Проверка качества на тречнировчном датасете

In [9]:
preprocessor = ColumnTransformer(
    [
        ("vetorizer", TfidfVectorizer(analyzer="char_wb", ngram_range = (1,3)), "text")
    ],                         
    remainder = "passthrough"
)

pipeline_multiout = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("clf", MultiOutputClassifier(LogisticRegression(max_iter = 10_000))),
    ]
)
display(pipeline_multiout)

In [23]:
X_transformed = preprocessor.fit_transform(X_train)
print(f"Размер данных после преобразования: {X_transformed.shape}")

Размер данных после преобразования: (6966, 11118)


In [27]:
# Получаем имена признаков из TfidfVectorizer
feature_names = preprocessor.named_transformers_["vetorizer"].get_feature_names_out()

# Преобразуем разреженную матрицу в DataFrame с именами столбцов
X_transformed_df = pd.DataFrame(X_transformed.toarray(), columns=feature_names)

# Просмотр первых двух строк
X_transformed_df.head(2)

Unnamed: 0,Unnamed: 1,!,!.1,!!,!(,!),"!,",!?,!з,!к,...,🫰🏻,🫶,🫶🏻,🫶🏻.1,🫶🏼,🫶🏼.1,🫶🏽,🫶🏽.1,🫶🔥,🫶🔥💙
0,0.437215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.457883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
cross_valid = cross_validate(pipeline_multiout, 
                             X_train, y_train, 
                             cv = 5, scoring = ["accuracy"], n_jobs = -1)
print("test_accuracy:", cross_valid["test_accuracy"].mean())

test_accuracy: 0.5045935766143692


In [11]:
y_pred = cross_val_predict(pipeline_multiout, X_train, y_train, cv = 2)

In [12]:
# Посмотрим на различные метрики
print(classification_report(y_train, y_pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.78      0.28      0.41       662
           1       0.65      0.05      0.09       278
           2       0.66      0.21      0.31       473
           3       0.72      0.13      0.23       268
           4       0.00      0.00      0.00        98
           5       0.00      0.00      0.00        37
           6       0.00      0.00      0.00        18
           7       0.00      0.00      0.00        26
           8       0.00      0.00      0.00       115
           9       0.00      0.00      0.00         8
          10       0.00      0.00      0.00        84
          11       0.00      0.00      0.00        89
          12       0.61      0.17      0.27       492
          13       0.00      0.00      0.00        28
          14       0.00      0.00      0.00        57
          15       0.00      0.00      0.00        62
          16       0.00      0.00      0.00       160
          17       0.00    

In [13]:
# Посмотрим на целевую метрику
accuracy_score(y_train, y_pred)

0.5005742176284812

###  Тренировка окончательной модели

In [14]:
pipeline_multiout.fit(X_train, y_train)

##  Предсказание и загрузка решения

In [15]:
test =  pd.read_csv("test.csv")

In [16]:
pred_test = pipeline_multiout.predict(test[["text"]].astype("str"))

In [17]:
res = pd.DataFrame(np.hstack([test["index"].values.reshape(test.shape[0], 1), pred_test]),
                  columns = ["index"]+[f"trend_id_res{i}" for i in range(50)])

In [18]:
res.head()

Unnamed: 0,index,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,trend_id_res5,trend_id_res6,trend_id_res7,trend_id_res8,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,5905,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3135,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9285,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4655,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,16778,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
res.iloc[:, 1:].sum()

trend_id_res0     527
trend_id_res1     146
trend_id_res2     507
trend_id_res3     168
trend_id_res4       0
trend_id_res5       0
trend_id_res6       0
trend_id_res7       0
trend_id_res8      18
trend_id_res9       0
trend_id_res10      1
trend_id_res11     17
trend_id_res12    532
trend_id_res13      0
trend_id_res14      0
trend_id_res15      5
trend_id_res16      7
trend_id_res17      0
trend_id_res18      0
trend_id_res19    202
trend_id_res20     79
trend_id_res21      0
trend_id_res22      0
trend_id_res23      0
trend_id_res24      0
trend_id_res25      0
trend_id_res26      0
trend_id_res27    217
trend_id_res28    220
trend_id_res29      0
trend_id_res30    157
trend_id_res31      0
trend_id_res32      0
trend_id_res33      0
trend_id_res34      0
trend_id_res35      2
trend_id_res36      0
trend_id_res37      0
trend_id_res38      0
trend_id_res39      0
trend_id_res40      0
trend_id_res41      0
trend_id_res42      0
trend_id_res43      0
trend_id_res44      0
trend_id_r

In [20]:
res["trend_id_res0"].value_counts()

trend_id_res0
0    16472
1      527
Name: count, dtype: int64

In [21]:
res[["index"]+[f"trend_id_res{i}" for i in range(50)]].to_csv("submission.csv", index=False)