In [1]:
!pip install natasha

Collecting natasha
  Downloading natasha-1.4.0-py3-none-any.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting razdel>=0.5.0
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting slovnet>=0.3.0
  Downloading slovnet-0.5.0-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting navec>=0.9.0
  Downloading navec-0.10.0-py3-none-any.whl (23 kB)
Collecting ipymarkup>=0.8.0
  Downloading ipymarkup-0.9.0-py3-none-any.whl (14 kB)
Collecting yargy>=0.14.0
  Downloading yargy-0.15.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [3]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostClassifier, Pool

import nltk
from nltk.corpus import stopwords
import re 

In [4]:
# clear html tags
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)

def stripdates(data):
    p = re.compile(r'[0-9]{2}[\/,:][0-9]{2}[\/,:][0-9]{2,4}')
    return p.sub('', data)

def stripdigits(data):
    p = re.compile(r'\d+')
    return p.sub('', data)

In [5]:
from natasha import ( # for lemmatization and stop words
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)

segmenter = Segmenter()
morph_vocab = MorphVocab()

emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

names_extractor = NamesExtractor(morph_vocab)


nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
train = pd.read_csv('../input/classification-of-citizens-appeals/train_dataset_train.csv', index_col=0)
test = pd.read_csv('../input/classification-of-citizens-appeals/test_dataset_test.csv', index_col=0)
sample_subm = pd.read_csv('../input/classification-of-citizens-appeals/Kursk/sample_solution.csv')

In [None]:
train.head()

In [None]:
test.head()

In [None]:
len(train), len(test)

In [None]:
train['Категория'].value_counts()

In [None]:
train['Тематика'].value_counts().head(20)

In [None]:
test['Тематика'].value_counts().head(20)

In [None]:
train_c = train.copy()
train_c['Тематика'] = train_c['Тематика'].astype('category').cat.codes
train_c['Ответственное лицо'] = train_c['Ответственное лицо'].astype('category').cat.codes
train_c.corr()

Drop

In [7]:
test = test.drop(columns=['Тематика', 'Ответственное лицо'])
train = train.drop(columns=['Тематика', 'Ответственное лицо'])

# train['Категория'].value_counts()
train = train.drop(train[train['Категория'] == 12].index) 

Clean

In [8]:
train['Текст Сообщения'] = train['Текст Сообщения'].apply(striphtml)
test['Текст Сообщения'] = test['Текст Сообщения'].apply(striphtml)

train['Текст Сообщения'] = train['Текст Сообщения'].str.replace('&nbsp;', ' ')
test['Текст Сообщения'] = test['Текст Сообщения'].str.replace('&nbsp;', ' ')

train['Текст Сообщения'] = train['Текст Сообщения'].apply(stripdates)
test['Текст Сообщения'] = test['Текст Сообщения'].apply(stripdates)

train['Текст Сообщения'] = train['Текст Сообщения'].apply(stripdigits)
test['Текст Сообщения'] = test['Текст Сообщения'].apply(stripdigits)

Лематизация

https://github.com/natasha/natasha

In [9]:
def preprocess_text(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return ' '.join([token.lemma for token in doc.tokens if token.pos != 'PUNCT' and token.text not in russian_stopwords])

In [10]:
train['Текст Сообщения'] = train['Текст Сообщения'].apply(preprocess_text)
test['Текст Сообщения'] = test['Текст Сообщения'].apply(preprocess_text)

In [13]:
train['Текст Сообщения']

id
2246    помочь начальник льговский рэс реагировать жал...
380     по фасад дом адрес ул урицкий проходить труба ...
2240    агресивный собака на радуга стая подрасти щено...
596     на пересечение улица сосновский береговой зава...
1797    здравствовать рядом дом а улица светлый вне пр...
                              ...                        
1356         вечерний время появиться вонь грибной радуга
243     добрый день хотеть сообщить проблема возле наш...
2350    состоять засохнуть дерево больший береза один ...
1937    пожалуйста роман владимирович скорый время дал...
1185    сдать пцр-тест г результат тест портал госуслу...
Name: Текст Сообщения, Length: 1999, dtype: object

Split

In [139]:
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(train.drop(columns=['Категория']), train['Категория'], test_size=test_size, stratify=train['Категория'], random_state=42)

In [None]:
y_test.value_counts()

In [None]:
len(y_test.value_counts())

In [140]:
train_pool = Pool(X_train, y_train, text_features=['Текст Сообщения'])
train_pool_full  = Pool(train.drop(columns=['Категория']), train['Категория'], text_features=['Текст Сообщения']) # full test set
val_pool = Pool(X_test, y_test, text_features=['Текст Сообщения'])
test_pool = Pool(test, text_features=['Текст Сообщения'])

https://github.com/catboost/tutorials/blob/master/text_features/text_features_in_catboost.ipynb

https://github.com/catboost/catboost/blob/master/catboost/tutorials/events/2020_11_18_catboost_tutorial/text_embedding_features.ipynb

In [229]:
catboost_params = {
#     'dictionaries': [
#         'Word:token_level_type=Word,min_token_occurrence=5', 
#         'BiGram:gram_order=2,min_token_occurrence=4',
#     ],
    'text_processing':
    {'dictionaries': [{'start_token_id': '0',
    'occurrence_lower_bound': '3',
    'skip_step': '0',
    'end_of_word_token_policy': 'Insert',
    'token_level_type': 'Word',
    'end_of_sentence_token_policy': 'Skip',
    'gram_order': '2',
    'max_dictionary_size': '100000',
    'dictionary_id': 'BiGram'},
    {'start_token_id': '0',
    'occurrence_lower_bound': '5',
    'skip_step': '0',
    'end_of_word_token_policy': 'Insert',
    'token_level_type': 'Word',
    'end_of_sentence_token_policy': 'Skip',
    'gram_order': '1',
    'max_dictionary_size': '50000',
    'dictionary_id': 'Word'}]},
#     {
#     'feature_processing': {'default': [{'dictionaries_names': ['BiGram', 'Word'],
#      'feature_calcers': ['BoW'],
#      'tokenizers_names': ['Space']},
#     {'dictionaries_names': ['Word'],
#      'feature_calcers': ['BoW'],
#      'tokenizers_names': ['Space']}]},
#                         }
}

In [235]:
model = CatBoostClassifier(task_type="GPU",
                           devices='0:1',
                           verbose=100,
                           eval_metric='AUC', # TotalF1 F1 AUC Accuracy
                           iterations=1300, # 2000
                           random_seed=42,
                           auto_class_weights='Balanced',
#                            **catboost_params,
                          )

In [236]:
model.fit(train_pool, ) # use_best_model=True, eval_set=val_pool
# model.fit(train_pool_full)

Learning rate set to 0.055365


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	total: 21.9ms	remaining: 28.4s
100:	total: 2.01s	remaining: 23.8s
200:	total: 3.96s	remaining: 21.6s
300:	total: 5.89s	remaining: 19.6s
400:	total: 8.18s	remaining: 18.3s
500:	total: 10.1s	remaining: 16.1s
600:	total: 12s	remaining: 13.9s
700:	total: 13.9s	remaining: 11.9s
800:	total: 15.8s	remaining: 9.83s
900:	total: 18.1s	remaining: 8.02s
1000:	total: 20s	remaining: 5.97s
1100:	total: 21.9s	remaining: 3.96s
1200:	total: 23.8s	remaining: 1.96s
1299:	total: 25.7s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f0535f76910>

In [172]:
model.get_all_params()

{'nan_mode': 'Min',
 'gpu_ram_part': 0.95,
 'eval_metric': 'AUC',
 'iterations': 1300,
 'leaf_estimation_method': 'Newton',
 'observations_to_bootstrap': 'TestOnly',
 'grow_policy': 'SymmetricTree',
 'boosting_type': 'Plain',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'devices': '0:1',
 'pinned_memory_bytes': '104857600',
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': False,
 'gpu_cat_features_storage': 'GpuRam',
 'fold_size_loss_normalization': False,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'use_best_model': False,
 'meta_l2_frequency': 0,
 'class_names': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16],
 'random_seed': 42,
 'depth': 6,
 'border_count': 128,
 'min_fold_size': 100,
 'class_weights': [1.996875047683716,
  37.588233947753906,
  319.5,
  1,
  8.875,
  79.875,
  91.28571319580078,
  35.5,
  6.870967864990234,
  213,
  19.96875,
  49

In [237]:
len(np.unique(val_preds))

11

Решено валидироваться по паблик лидерборду, т.к. не хватает примеров на все классы

In [238]:
val_preds = model.predict(val_pool)
val_preds_probas = model.predict_proba(val_pool)
try:
    roc_auc = roc_auc_score(label_binarize(y_test, classes=[0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16]), label_binarize(val_preds, classes=[0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16]), multi_class='ovo')
    print(roc_auc)
except:
    print('cant be calculate roc_auc')

f1_score_ = f1_score(y_test, val_preds, average='macro')
print(f'f1: {f1_score_}')

0.711149093187903
f1: 0.4440335174038945


f1: 0.4618430328371731

tests

In [None]:
val_preds_1 = model.predict(val_pool)
val_preds_probas_1 = model.predict_proba(val_pool)

In [None]:
np.std(np.stack([val_preds_probas, val_preds_probas_1]), axis=0).mean(axis=0).mean()

In [None]:
from scipy.stats import ttest_rel
for i in range(0, 15):
    print(ttest_rel(val_preds_probas[:, i], val_preds_probas_1[:, i]) )

Blending different seeds

In [None]:
all_predicts = []
all_predict_probas = []

for i in range(0, 15):
    print(f'seed: {i}')
    model = CatBoostClassifier(task_type="GPU",
                           devices='0:1',
                           verbose=500,
                           eval_metric='AUC',
                           iterations=1300, # 2000
                           random_seed=i,
                           auto_class_weights='Balanced'
                          )
    model.fit(train_pool_full)
    all_predicts.append(model.predict(test_pool))
    all_predict_probas.append(model.predict_proba(test_pool))

sample_subm['Категория'] = np.median(all_predicts, axis=0).astype(int)
sample_subm.to_csv('base_cat_boost_lemmatize_15_random_seeds.csv', index=False)

KFolds

In [None]:
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits)

def get_train_data():
    return train.drop(columns=['Категория']), train['Категория']

def get_model():
    return CatBoostClassifier(task_type="GPU",
                           devices='0:1',
                           verbose=100,
                           eval_metric='AUC', # TotalF1 F1
                           iterations=1300, # 2000
                           random_seed=42,
                           auto_class_weights='Balanced'
                          )

def calc_roc_auc(y_test, val_preds):
    try:
        roc_auc = roc_auc_score(label_binarize(y_test, classes=[0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16]), label_binarize(val_preds, classes=[0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16]), multi_class='ovo')
        print(f'roc_auc: {roc_auc}')
    except:
        print('multi roc_auc cannot be calculated')

X, y = get_train_data()

all_predicts, all_predict_probas = [], []

for train_index, test_index in skf.split(X, y):
#     print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    train_pool = Pool(X_train, y_train, text_features=['Текст Сообщения'])
    val_pool = Pool(X_test, y_test, text_features=['Текст Сообщения'])
    
    model = get_model()
    model.fit(train_pool, use_best_model=True, eval_set=val_pool)
    val_preds = model.predict(val_pool)

    calc_roc_auc(y_test, val_preds)
    f1_score_ = f1_score(y_test, val_preds, average='macro')
    print(f'f1: {f1_score_}')
        
    all_predicts.append(model.predict(test_pool))
    all_predict_probas.append(model.predict_proba(test_pool))
           
sample_subm['Категория'] = np.median(all_predicts, axis=0).astype(int)
sample_subm.to_csv('base_cb_lemmat_15_folds.csv', index=False)

Metrics

In [None]:
print(classification_report(y_test, val_preds))

matrix = confusion_matrix(y_test, val_preds)
matrix.diagonal()/matrix.sum(axis=0)

plt.figure(figsize=(8, 6), dpi=80)
sns.heatmap(matrix, annot=True, fmt='d')

In [None]:
test_preds = model.predict(test_pool)
test_preds_probas = model.predict_proba(test_pool)
sample_subm['Категория'] = test_preds
sample_subm.to_csv('base_cat_boost_lemmatize_0_3.csv', index=False)

In [None]:
sample_subm['Категория'].value_counts()

In [None]:
# class MetricsCheckerCallback:
#     def after_iteration(self, info):
#         for dataset_name in ['learn', 'validation_0', 'validation_1']:
#             assert dataset_name in info.metrics
#             for metric_name in metric_names:
#                 assert metric_name in info.metrics[dataset_name]
#                 assert len(info.metrics[dataset_name][metric_name]) == info.iteration
#         return True

# model.fit(train_data, train_labels,
#           callbacks=[MetricsCheckerCallback()],
#           eval_set=[validation_0, validation_1])

# https://wandb.ai/gusdim/catboost/overview?workspace=user-dimka11