In [1]:
# !sudo apt install sysstat -y

In [2]:
# !pip install catboost
# !pip install pandas
# !pip install scikit-learn

In [3]:
# Record the GPU model, driver/CUDA versions and current GPU load for this run.
!nvidia-smi

Fri Nov 24 17:11:50 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.41.03              Driver Version: 530.41.03    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-SXM2-16GB            On | 00000000:AF:00.0 Off |                    0 |
| N/A   42C    P0               43W / 300W|   1118MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
# Snapshot of host CPU utilisation (mpstat is provided by the sysstat package).
!mpstat

Linux 5.4.0-150-generic (5fcd9b9e9a86) 	11/24/23 	_x86_64_	(32 CPU)

17:11:51     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
17:11:51     all   16.88    0.00    0.39    0.01    0.00    0.05    0.00    0.00    0.00   82.68


In [5]:
import pickle

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

In [6]:
# Load the labelled training file and derive the label vocabularies from it
# before any rows are held out for validation.
train = pd.read_csv("prime99_train.csv")
themes = train["Тема"].unique()
theme_groups = train["Группа тем"].unique()
# Map every theme group to its position in sorted order.
theme_groups_dict = dict(
    (group_name, group_idx)
    for group_idx, group_name in enumerate(sorted(theme_groups))
)

# Hold out 10% of the training rows as a validation split (fixed seed).
train, val = train_test_split(train, test_size=0.1, random_state=42)
test = pd.read_csv("prime99_test.csv")

In [7]:
# Attach a `labels` column to every split: one three-element list per row
# holding the values of the three target columns, in this order.
label_columns = ["Исполнитель", "Группа тем", "Тема"]
for dataset in (train, val, test):
    # Vectorised equivalent of walking the frame with `iterrows()` and
    # collecting [row[col] for col in label_columns] one row at a time.
    labels = dataset[label_columns].to_numpy().tolist()
    # NOTE(review): `train` and `val` come from train_test_split; if pandas
    # emits SettingWithCopyWarning here, take `.copy()` of them after the split.
    dataset["labels"] = labels

In [8]:
# Inspect the combined labels of the test split. `test` is the frame the
# label-building loop finished on, so it is named explicitly here instead of
# relying on the leaked loop variable `dataset`.
test["labels"]

0          [ИГЖН ПК, ЖКХ, Жалобы на управляющие компании]
1       [Министерство здравоохранения, Здравоохранение...
2       [Лысьвенский городской округ, Безопасность, Бе...
3       [Министерство здравоохранения, Коронавирус, Се...
4       [Министерство социального развития ПК, Социаль...
                              ...                        
2308    [АО ПРО ТКО, Мусор/Свалки/ТКО, ★ Уборка/Вывоз ...
2309    [Город Пермь, Благоустройство, Уборка территорий]
2310    [АО ПРО ТКО, Мусор/Свалки/ТКО, Плата за вывоз ...
2311    [Министерство социального развития ПК, Социаль...
2312    [Город Пермь, Дороги, ★ Нарушение правил очист...
Name: labels, Length: 2313, dtype: object

In [20]:
# Keyword arguments used to construct every CatBoostClassifier in this notebook.
catboost_params = dict(
    iterations=100000,
    learning_rate=0.01,
    loss_function="MultiClass",
    # Print a progress line every 100 iterations.
    verbose=100,
    # Train on GPU device 0; switch to task_type="CPU" to run without a GPU.
    task_type="GPU",
    devices="0",
    # NOTE: train_catboost_classifier also passes early_stopping_rounds=10 to fit().
    early_stopping_rounds=20,
    random_seed=42,
    depth=8,
    # Disabled alternative: explicit tokenisation dictionaries.
    #   dictionaries=[
    #       'Word:token_level_type=Word,min_token_occurrence=5',
    #       'BiGram:gram_order=2,min_token_occurrence=4',
    #   ],
    text_processing=[
        "NaiveBayes+Word,BiGram|BoW+Word,BiGram",
        "NaiveBayes+Word|BoW:top_tokens_count=1000+Word,BiGram|BM25+Word",
    ],
)

def predict(df, classifier, to_zip=True):
    """Predict a label and its top class probability for every row of ``df``.

    Args:
        df: DataFrame with a "Текст инцидента" column; only that column is
            passed to the model, as a text feature.
        classifier: fitted model exposing ``predict`` and ``predict_proba``.
        to_zip: when true (default) pair each label with its probability.

    Returns:
        ``[(label, probability), ...]`` if ``to_zip`` is true, otherwise the
        two-element list ``[labels, probabilities]`` where ``labels`` is a
        list and ``probabilities`` is the row-wise maximum of
        ``predict_proba``.
    """
    text_column = "Текст инцидента"
    features = Pool(
        df[[text_column]],
        text_features=[text_column],
    )

    # Each prediction row is indexed at [0] to obtain the label itself
    # (presumably predict() returns one-element rows for MultiClass - confirm).
    labels = [row[0] for row in classifier.predict(features)]
    confidence = np.max(classifier.predict_proba(features), axis=1)

    if not to_zip:
        return [labels, confidence]
    return list(zip(labels, confidence))


def train_catboost_classifier(X_train, y_train, X_val, y_val,
                              params=None, early_stopping_rounds=10):
    """Fit a CatBoost text classifier and predict on the validation split.

    Args:
        X_train: training features; must contain the "Текст инцидента"
            column, which is declared as a text feature.
        y_train: training target.
        X_val: validation features with the same columns as ``X_train``.
        y_val: validation target; the validation pool is used as ``eval_set``.
        params: keyword arguments for ``CatBoostClassifier``. Defaults to the
            module-level ``catboost_params``.
        early_stopping_rounds: value forwarded to ``fit()``. The default of
            10 is the value this function has always passed.
            NOTE(review): ``catboost_params`` separately sets
            ``early_stopping_rounds`` to 20; the fit-time argument appears to
            take precedence - confirm against the CatBoost documentation and
            keep a single source of truth.

    Returns:
        ``(classifier, preds)``: the fitted classifier and its ``predict``
        output for the validation pool.
    """
    if params is None:
        params = catboost_params

    text_features = ["Текст инцидента"]
    train_pool = Pool(X_train, y_train, text_features=text_features)
    val_pool = Pool(X_val, y_val, text_features=text_features)

    classifier = CatBoostClassifier(**params)
    classifier.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=early_stopping_rounds,
    )

    preds = classifier.predict(val_pool)
    return classifier, preds

In [21]:
# Train one independent text classifier per target column, score it on the
# validation and test splits (weighted F1), pickle the model and append the
# scores to a tab-separated log file.
train_columns = ["Исполнитель",
                 "Группа тем",
                 "Тема"
                ]
# Fitted model for each target column, keyed by column name.
classifiers = dict()

for c_i, column in enumerate(train_columns):
    print(column)
    model_cols = ["Текст инцидента"]
    # Disabled: feeding the earlier targets' predictions in as extra features
    # leads to a train leak.
    # model_cols += [c + "_pred" for c in train_columns[:c_i]]
    classifier, preds = train_catboost_classifier(
        train[model_cols], train[column],
        val[model_cols], val[column]
        )
    # Keep the fitted model reachable after the loop; `classifier` itself is
    # overwritten on the next iteration.
    classifiers[column] = classifier
    val_score = f1_score(val[column], preds, average="weighted")
    print("val", val_score)

    preds = predict(test, classifier, to_zip=False)[0]
    test_score = f1_score(test[column], preds, average="weighted")
    print(test_score)
    with open(f"catboost_{column}2.pcl", "wb") as f:
        pickle.dump(classifier, f)
    # The file is opened in append mode, so every run of this cell adds rows.
    # The column names are Cyrillic: pin the encoding so the write does not
    # depend on the platform's default text encoding.
    with open("catboost_preds", "a", encoding="utf-8") as f:
        f.write(f"2{column}\t{val_score}\t{test_score}\n")

Исполнитель
0:	learn: 2.2631963	test: 2.2600072	best: 2.2600072 (0)	total: 26.6ms	remaining: 44m 23s
100:	learn: 1.2848304	test: 1.2067782	best: 1.2067782 (100)	total: 2.92s	remaining: 48m 3s
200:	learn: 1.1162596	test: 1.0329027	best: 1.0329027 (200)	total: 6.07s	remaining: 50m 14s
300:	learn: 1.0563340	test: 0.9751599	best: 0.9751599 (300)	total: 8.86s	remaining: 48m 55s
400:	learn: 1.0267038	test: 0.9498636	best: 0.9498636 (400)	total: 11.6s	remaining: 48m 6s
500:	learn: 1.0067088	test: 0.9362462	best: 0.9362462 (500)	total: 14.3s	remaining: 47m 15s
600:	learn: 0.9916882	test: 0.9272124	best: 0.9272124 (600)	total: 16.9s	remaining: 46m 33s
700:	learn: 0.9792459	test: 0.9206617	best: 0.9206617 (700)	total: 19.4s	remaining: 45m 50s
800:	learn: 0.9677171	test: 0.9148259	best: 0.9148259 (800)	total: 22s	remaining: 45m 18s
900:	learn: 0.9575060	test: 0.9102910	best: 0.9102910 (900)	total: 24.5s	remaining: 44m 50s
1000:	learn: 0.9481836	test: 0.9068040	best: 0.9068040 (1000)	total: 27s	re