In [295]:
import pathlib
import random
import pandas as pd
import numpy as np
import sys

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict

from sklearn.metrics import (
    f1_score, 
    accuracy_score,
    classification_report, 
)

ROOT_DIR = pathlib.Path().absolute()
DATA_DIR = ROOT_DIR / "data"
SUBMISSION_DIR = ROOT_DIR / "submissions"
RANDOM_SEED = 42

## Загрузка и обзор данных

In [3]:
df_trends = pd.read_csv(DATA_DIR / "trends_description.csv")
df = pd.read_csv(DATA_DIR / "train.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1428,6.0,"{PRICE,PROMOTIONS}",Можно немного скинуть минимальную сумму заказа...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Обучение моделей

### Предобработка данных

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1428,6.0,"{PRICE,PROMOTIONS}",Можно немного скинуть минимальную сумму заказа...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
X, y = df[["text"]], df[[f"trend_id_res{i}" for i in range(50)]]
X = X.astype("str").copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = RANDOM_SEED)
print(f"X_train.shape is {X_train.shape}")
print(f"y_train.shape is {y_train.shape}")
print(f"X_test.shape is {X_test.shape}")
print(f"y_test.shape is {y_test.shape}")

X_train.shape is (3698, 1)
y_train.shape is (3698, 50)
X_test.shape is (925, 1)
y_test.shape is (925, 50)


In [8]:
X_train

Unnamed: 0,text
1538,"Ну, за [NUM]ч. и [NUM] мин. мне ещё никогда не..."
2991,Доставка всегда осуществляется значительно дол...
2812,Задержка доставки
4515,"Отличный сервис, только бы ассортимент расшири..."
4531,"Поддержка говно, курьеры опаздывают минут на [..."
...,...
4426,"+ быстро. - иногда сумма заказа очень велика, ..."
466,🦉
3092,До самоката я тратил меньше денег в день
3772,О вас редко думаю. Напрягает СТМ. Когда непоня...


###  Проверка качества на тречнировчном датасете

In [236]:
preprocessor = ColumnTransformer(
    [
        ("vetorizer", TfidfVectorizer(analyzer="char_wb", ngram_range = (1,3)), "text")
    ],                         
    remainder = "passthrough"
)

pipeline_multiout = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("clf", MultiOutputClassifier(LogisticRegression(max_iter = 10_000))),
    ]
)
display(pipeline_multiout)

In [20]:
cross_valid = cross_validate(pipeline_multiout, 
                            #  X_train, y_train, 
                             X, y,
                             cv = 5, scoring = ["accuracy"], n_jobs = -1)
print("test_accuracy:", cross_valid["test_accuracy"].mean())

test_accuracy: 0.26648859248859247


In [24]:
model = pipeline_multiout.fit(X_train, y_train)

In [33]:
y_pred = cross_val_predict(pipeline_multiout, X_train, y_train, cv = 2)

In [36]:
# Посмотрим на различные метрики
print(classification_report(y_train, y_pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.87      0.39      0.54       661
           1       0.89      0.09      0.16       270
           2       0.80      0.37      0.50       486
           3       0.93      0.21      0.35       289
           4       0.00      0.00      0.00       108
           5       0.00      0.00      0.00        44
           6       0.00      0.00      0.00        16
           7       0.00      0.00      0.00        27
           8       1.00      0.01      0.02       109
           9       0.00      0.00      0.00         9
          10       0.00      0.00      0.00        76
          11       0.00      0.00      0.00        87
          12       0.97      0.41      0.57       491
          13       0.00      0.00      0.00        29
          14       0.00      0.00      0.00        62
          15       0.00      0.00      0.00        66
          16       0.00      0.00      0.00       166
          17       0.00    

In [37]:
# Посмотрим на целевую метрику
accuracy_score(y_train, y_pred)

0.19334775554353706

###  Тренировка окончательной модели

In [40]:
pipeline_multiout.fit(X, y)

##  Предсказание и загрузка решения

In [41]:
test =  pd.read_csv(DATA_DIR / "test.csv")

In [42]:
pred_test = pipeline_multiout.predict(test[["text"]].astype("str"))

In [229]:
answer = []
np.apply_along_axis(
    lambda x: answer.append(' '.join(np.where(x == 1)[0].astype(str))),
    axis=1,
    arr=pred_test
)

array([None, None, None, ..., None, None, None], dtype=object)

In [230]:
res = test[["index"]]
res['target'] = answer

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res['target'] = answer


In [232]:
res.to_csv(DATA_DIR / "my_baseline_submission.csv", index=False)

In [233]:
df = pd.read_csv(DATA_DIR / "train_df_clean.csv")
X, y = df[["text"]], df[[f"trend_id_res{i}" for i in range(50)]]
X = X.astype("str").copy()

In [237]:
preprocessor = ColumnTransformer(
    [
        ("vetorizer", TfidfVectorizer(analyzer="char_wb", ngram_range = (1,3)), "text")
    ],                         
    remainder = "passthrough"
)

pipeline_multiout = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("clf", MultiOutputClassifier(LogisticRegression(max_iter = 10_000))),
    ]
)
display(pipeline_multiout)

In [238]:
pipeline_multiout.fit(X, y)

In [239]:
pred_test = pipeline_multiout.predict(test[["text"]].astype("str"))

In [296]:
def write_submission(pred_test, test_df, name):
    answer = []
    np.apply_along_axis(
        lambda x: answer.append(' '.join(np.where(x == 1)[0].astype(str))),
        axis=1,
        arr=pred_test
    )
    test_df[["index"]].assign(target = answer).to_csv(SUBMISSION_DIR / name, index=False)

In [297]:
df = pd.read_csv(DATA_DIR / "train_df_demojitized.csv")
X, y = df[["text"]], df[[f"trend_id_res{i}" for i in range(50)]]
X = X.astype("str").copy()

preprocessor = ColumnTransformer(
    [
        ("vetorizer", TfidfVectorizer(analyzer="char_wb", ngram_range = (1,3)), "text")
    ],                         
    remainder = "passthrough"
)

pipeline_multiout = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("clf", MultiOutputClassifier(LogisticRegression(max_iter = 10_000))),
    ]
)
pipeline_multiout.fit(X, y)
write_submission(
    pipeline_multiout.predict(test[["text"]].astype("str")),
    test,
    'bl_submission_cleaned_and_demojitized_2.csv'
)

In [298]:
cross_valid = cross_validate(
    pipeline_multiout, 
    X, y,
    cv = 5,
    scoring = ["accuracy"],
    n_jobs = -1
)
print("test_accuracy:", cross_valid["test_accuracy"].mean())

test_accuracy: 0.2636949803088674


In [310]:
df = pd.read_csv(DATA_DIR / "train_augmented.csv")
X, y = df[["text"]], df[[f"trend_id_res{i}" for i in range(50)]]
X = X.astype("str").copy()

preprocessor = ColumnTransformer(
    [
        ("vetorizer", TfidfVectorizer(analyzer="char_wb", ngram_range = (1,3)), "text")
    ],                         
    remainder = "passthrough"
)

pipeline_multiout = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("clf", MultiOutputClassifier(LogisticRegression(max_iter = 10_000))),
    ]
)
pipeline_multiout.fit(X, y)
write_submission(
    pipeline_multiout.predict(test[["text"]].astype("str")),
    test,
    'bl_submission_augmented_2.csv'
)

In [309]:
cross_valid = cross_validate(
    pipeline_multiout, 
    X, y,
    cv = 5,
    scoring = ["accuracy"],
    n_jobs = -1
)
print("test_accuracy:", cross_valid["test_accuracy"].mean())

Python(61961) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61962) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61963) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61964) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61965) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61966) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61967) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61968) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61969) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61970) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61971) Malloc

test_accuracy: 0.12276582759818604


In [306]:
df['text'].shape

(7705,)

In [304]:
len(df['text'].unique())

7034