In [1]:
!pip install -U sentence-transformers
!pip install -U natasha

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m295.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=c5a3365b3e5edc1126b23bc9966427a09d7890f8c195cdc6aec8d37f554d4b59
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
[0mCollecting natasha
  Downloading natasha-1.4.0-py3-none-any.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/3

In [77]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import RidgeClassifier, LogisticRegression

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

from sklearn.neighbors import KNeighborsClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, KMeansSMOTE, SVMSMOTE

import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostClassifier, Pool

import nltk
from nltk.corpus import stopwords
import re 

from sentence_transformers import SentenceTransformer

In [3]:
# clear html tags
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)

def stripdates(data):
    p = re.compile(r'[0-9]{2}[\/,:][0-9]{2}[\/,:][0-9]{2,4}')
    return p.sub('', data)

def stripdigits(data):
    p = re.compile(r'\d+')
    return p.sub('', data)

In [180]:
def rnd_perm_text_by_parts(train_example, n_splits=5):
    if len(train_example) < n_splits:
        n_splits = len(train_example)
    text_parts = np.array_split(train_example.split(' '), n_splits)
    rnd_idxs = np.random.choice(len(text_parts), size=len(text_parts), replace=False)
    return ' '.join([' '.join(text_parts[idx]) for idx in rnd_idxs])

rnd_perm_text_by_parts(train.iloc[4][0])

'ребенок также автомобиль другой имущество просить провести спил дерево санитарный обрезка остаться весь улица кроме возникать угроза жизнь здоровье проживать рядом человек число большой вероятность привести обрыв находиться ряд электрический провод случай повреждение который свет довольно старый хрупкий дерево в случай непогода возможный обламывание ветвь неоднократно случаться здравствовать рядом дом а улица светлый вне придомовый территория расти клен это'

In [4]:
from natasha import ( # for lemmatization and stop words
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)

segmenter = Segmenter()
morph_vocab = MorphVocab()

emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

names_extractor = NamesExtractor(morph_vocab)


nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [181]:
train = pd.read_csv('../input/classification-of-citizens-appeals/train_dataset_train.csv', index_col=0)
test = pd.read_csv('../input/classification-of-citizens-appeals/test_dataset_test.csv', index_col=0)
sample_subm = pd.read_csv('../input/classification-of-citizens-appeals/Kursk/sample_solution.csv')

Drop

In [6]:
test = test.drop(columns=['Тематика', 'Ответственное лицо'])
train = train.drop(columns=['Тематика', 'Ответственное лицо'])

# train['Категория'].value_counts()
train = train.drop(train[train['Категория'] == 12].index) 

Clean

In [7]:
train['Текст Сообщения'] = train['Текст Сообщения'].apply(striphtml)
test['Текст Сообщения'] = test['Текст Сообщения'].apply(striphtml)

train['Текст Сообщения'] = train['Текст Сообщения'].str.replace('&nbsp;', ' ')
test['Текст Сообщения'] = test['Текст Сообщения'].str.replace('&nbsp;', ' ')

train['Текст Сообщения'] = train['Текст Сообщения'].apply(stripdates)
test['Текст Сообщения'] = test['Текст Сообщения'].apply(stripdates)

train['Текст Сообщения'] = train['Текст Сообщения'].apply(stripdigits)
test['Текст Сообщения'] = test['Текст Сообщения'].apply(stripdigits)

Лематизация

https://github.com/natasha/natasha

In [8]:
def preprocess_text(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return ' '.join([token.lemma for token in doc.tokens if token.pos != 'PUNCT' and token.text not in russian_stopwords])

In [9]:
lemmatize = True

if lemmatize:
    train['Текст Сообщения'] = train['Текст Сообщения'].apply(preprocess_text)
    test['Текст Сообщения'] = test['Текст Сообщения'].apply(preprocess_text)

In [56]:
SentenceTransformer_ = False

if SentenceTransformer_:
    st_model = SentenceTransformer('DeepPavlov/rubert-base-cased-sentence')

    train_text = st_model.encode(train['Текст Сообщения'].values, batch_size=256)
    np.save('./train_text.npy', train_text)

    test_text = st_model.encode(test['Текст Сообщения'].values, batch_size=256)
    np.save('./test_text.npy', test_text)

Split

In [66]:
test_size = 0.33

test_val_split = False

if test_val_split:
    if SentenceTransformer_:
        X_train, X_test, y_train, y_test = train_test_split(train_text, train['Категория'], test_size=test_size, stratify=train['Категория'], random_state=42)
    else:
        X_train, X_test, y_train, y_test = train_test_split(train[['Текст Сообщения']], train['Категория'], test_size=test_size, stratify=train['Категория'], random_state=42)
else:
    X_train, X_test, y_train, y_test = train[['Текст Сообщения']], train[['Текст Сообщения']], train['Категория'], train['Категория'] # no split
    print('test_val_split is off')

test_val_split is off


PCA

In [None]:
def pca_transform(X_train, X_test, test_text, n_comps=128):
    pca = PCA(n_components=n_comps)
    
    if test_text.shape[1] != n_comps:
        test_text = pca.transform(test_text)
    
    return pca.fit_transform(X_train), pca.transform(X_test), test_text

# X_train, X_test, test_text = pca_transform(X_train, X_test, test_text, n_comps=128)

In [None]:
CatBoost_model = True

if CatBoost_model:
    train_pool = Pool(X_train, y_train)
    train_pool_full  = Pool(train_text, train['Категория']) # full test set
    val_pool = Pool(X_test, y_test)
    test_pool = Pool(test_text)

In [None]:
if CatBoost_model:
    model = CatBoostClassifier(task_type="GPU",
                               devices='0:1',
                               verbose=100,
                               eval_metric='AUC', # TotalF1
                               iterations=2000, # 2000
                               random_seed=42,
                               auto_class_weights='Balanced'
                              )

In [None]:
if CatBoost_model:
    model.fit(train_pool, use_best_model=True, eval_set=val_pool)

In [None]:
if CatBoost_model:
    model.get_all_params()

In [None]:
if CatBoost_model:
    val_preds = model.predict(val_pool)
    roc_auc = roc_auc_score(label_binarize(y_test, classes=[0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16]), label_binarize(val_preds, classes=[0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16]), multi_class='ovo')
    print(f'roc_auc: {roc_auc}')
    f1_score_ = f1_score(y_test, val_preds, average='macro')
    print(f'f1: {f1_score_}')

Predict test

In [None]:
if CatBoost_model:
    test_preds = model.predict(test_pool)
    test_preds_probas = model.predict_proba(test_pool)
    sample_subm['Категория'] = test_preds
    sample_subm.to_csv('sent_transf_cb_lemmat.csv', index=False)

KFolds

In [27]:
def get_train_data():
    return train_text, train['Категория']

def get_model():
    return CatBoostClassifier(task_type="GPU",
                           devices='0:1',
                           verbose=100,
                           eval_metric='AUC', # TotalF1 F1
                           iterations=1300, # 2000
                           random_seed=42,
                           auto_class_weights='Balanced'
                          )

def calc_roc_auc(y_test, val_preds):
    try:
        roc_auc = roc_auc_score(label_binarize(y_test, classes=[0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16]), label_binarize(val_preds, classes=[0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16]), multi_class='ovo')
        print(f'roc_auc: {roc_auc}')
    except:
        print('multi roc_auc cannot be calculated')

In [None]:
if CatBoost_model:
    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=False, random_state=42)

    X, y = get_train_data()

    all_predicts, all_predict_probas = [], []

    for train_index, test_index in skf.split(X, y):
    #     print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        train_pool = Pool(X_train, y_train)
        val_pool = Pool(X_test, y_test)

        model = get_model()
        model.fit(train_pool, use_best_model=True, eval_set=val_pool)
        val_preds = model.predict(val_pool)

        calc_roc_auc(y_test, val_preds)
        f1_score_ = f1_score(y_test, val_preds, average='macro')
        print(f'f1: {f1_score_}')

        all_predicts.append(model.predict(test_pool))
        all_predict_probas.append(model.predict_proba(test_pool))

    sample_subm['Категория'] = np.median(all_predicts, axis=0).astype(int)
    sample_subm.to_csv('st_cb_5_folds.csv', index=False)

tfidf + trunc_svd+smote_linear

In [67]:
tfidf_ = True

if tfidf_:
    vectorizer = TfidfVectorizer()
    train_enc = vectorizer.fit_transform(X_train['Текст Сообщения'])
    svd = TruncatedSVD(n_components=512, n_iter=7, random_state=42)
    X_train = svd.fit_transform(csr_matrix(train_enc))
    test_enc = vectorizer.transform(X_test['Текст Сообщения'])
    X_test = svd.transform(csr_matrix(test_enc))
    
    test_enc = vectorizer.transform(test['Текст Сообщения'])
    test_enc = svd.transform(csr_matrix(test_enc))

In [72]:
if tfidf_:
    sm = SMOTE(random_state=42, k_neighbors=2)
#     ros = RandomOverSampler(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)
#     X_train, y_train = ros.fit_resample(X_train, y_train)

Ridge

In [None]:
sc = StandardScaler()

In [73]:
cls_model = RidgeClassifier(alpha=1.0)
cls_model.fit(X_train, y_train)
cls_preds = cls_model.predict(X_test)
d = cls_model.decision_function(X_test)[:]
cls_preds_probas = np.exp(d) / np.sum(np.exp(d))

calc_roc_auc(y_test, cls_preds)
f1_score_ = f1_score(y_test, cls_preds, average='macro')
print(f'f1: {f1_score_}')

roc_auc: 0.9824572127594335
f1: 0.8274807937779632


LogisticRegression

In [93]:
cls_model = LogisticRegression(penalty = 'elasticnet', solver = 'saga', l1_ratio = 0.5, max_iter=1000)
# cls_model = LogisticRegression(max_iter=1000)
cls_model.fit(X_train, y_train)
cls_preds = cls_model.predict(X_test)

d = cls_model.decision_function(X_test)[:]
cls_preds_probas = np.exp(d) / np.sum(np.exp(d))

calc_roc_auc(y_test, cls_preds)
f1_score_ = f1_score(y_test, cls_preds, average='macro')
print(f'f1: {f1_score_}')

roc_auc: 0.9814244728902348
f1: 0.8957264709210198


In [95]:
if tfidf_:
    sample_subm['Категория'] = cls_model.predict(test_enc)
    sample_subm.to_csv('tfidf_smote_linear_full.csv', index=False)

In [None]:
sample_subm['Категория'] = cls_model.predict(test_text)
sample_subm.to_csv('st_linear_pca128.csv', index=False)

In [None]:
# import numpy as np, scipy.stats as st
# st.t.interval(0.95, len(cls_preds_probas[:, 0])-1, loc=np.mean(cls_preds_probas[:, 0]), scale=st.sem(cls_preds_probas[:, 0]))

KNN

In [None]:
for i in range(1, 20):
    print(f'{i} neighbors')
    knn = KNeighborsClassifier(n_neighbors=i, metric='cosine') #  
    knn.fit(X_train, y_train)
    cls_preds = knn.predict(X_test)
    calc_roc_auc(y_test, cls_preds)
    f1_score_ = f1_score(y_test, cls_preds, average='macro')
    print(f'f1: {f1_score_}')

LDA

In [None]:
lda = LinearDiscriminantAnalysis(solver='svd')
lda.fit(X_train, y_train)
cls_preds = lda.predict(X_test)

calc_roc_auc(y_test, cls_preds)
f1_score_ = f1_score(y_test, cls_preds, average='macro')
print(f'f1: {f1_score_}')

Blending

In [None]:
all_predicts = []

for i in range(5):
    test_size = 0.2
    X_train, X_test, y_train, y_test = train_test_split(train_text, train['Категория'], test_size=test_size, stratify=train['Категория'], random_state=i, shuffle=True)
#     X_train, X_test, test_text = pca_transform(X_train, X_test, test_text, n_comps=128)
#     cls_model = LogisticRegression(penalty = 'elasticnet', solver = 'saga', l1_ratio = 0.5, max_iter=1000)
    cls_model = LogisticRegression(max_iter=1000)
    cls_model.fit(X_train, y_train)
    cls_preds = cls_model.predict(X_test)

    d = cls_model.decision_function(X_test)
    cls_preds_probas = np.exp(d) / np.sum(np.exp(d))

    calc_roc_auc(y_test, cls_preds)
    f1_score_ = f1_score(y_test, cls_preds, average='macro')
    print(f'f1: {f1_score_}')
    
    all_predicts.append(cls_model.predict(test_text))
    
sample_subm['Категория'] = np.median(all_predicts, axis=0).astype(int)
sample_subm.to_csv('st_linear_5_splits_0_2.csv', index=False)

roc_auc: 0.6815923522699869
f1: 0.40310293246307083

roc_auc: 0.665101712469827
f1: 0.372676847396016

roc_auc: 0.6764834180379572
f1: 0.39966850504069673

roc_auc: 0.6823410879331754
f1: 0.42793993835065847

roc_auc: 0.6787377585204736
f1: 0.40135458180620626