In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier

from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from datasets import Dataset
from random import shuffle
from transformers import AutoTokenizer
import evaluate

from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize, download
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from joblib import dump, load

from tqdm.auto import tqdm

from natasha import (
    Segmenter,
    NewsNERTagger,
    ORG,
    Doc,
    NewsEmbedding,
)

import pandas as pd
import numpy as np

import catboost
import optuna

import pickle
import string
import time
import re

In [14]:
emb = NewsEmbedding()
segmenter = Segmenter()
ner_tagger = NewsNERTagger(emb)

In [15]:
stemmer = SnowballStemmer("russian")
russian_stopwords = stopwords.words("russian")
russian_stopwords.extend(
    ['это', 'как', 'так', 'и', 'в', 'над', 'к', 'до', 'не', 'на', 'но', 'за', 'то', 'с', 'ли', 'а', 'во', 'от', 'со',
     'для', 'о', 'же', 'ну', 'вы', 'бы', 'что', 'кто', 'он', 'она', 'оно', 'из-за', 'также'])

# Чтение данных

In [16]:
mentions = pd.read_csv('mentions.csv', index_col=0)
sentiment = pd.read_csv('sentiment.csv', index_col=0)

issuers = pd.read_excel('issuers.xlsx', index_col=0)
additional_ner_data = pd.read_excel('names and synonyms.xlsx', index_col=0)

# pickle

with open('sentiment_texts.pickle', 'rb') as f:
    sentiment_texts = pickle.loads(f.read())
    
with open('mentions texts.pickle', 'rb') as f:
    mentions_texts = pickle.loads(f.read())

Краткое описание дата сета:

1)      Корректное нахождение компаний. Релевантные таблицы:
a.       **mentions.csv**  - содержит id канала, id сообщения и id упоминаемой компании  
b.       **mentions_texts.pickle** – содержит id канала, id сообщения и текст этого сообщения

2)      Корректное распознавание сентимента. Релевантные таблицы:
a.       **sentiment.csv** – содержит id канала, id сообщения, id компании и score сентимента  
b.       **sentiment_texts.pickle** - содержит id канала, id сообщения и текст этого сообщения


# Нахождение упоминаний
Для начала поработаем с доп. данными по всем возможным вариациям названий компаний

In [17]:
additional_ner_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 255 entries, 1 to 274
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   EMITENT_FULL_NAME  255 non-null    object 
 1   VeryOddCompany     0 non-null      float64
 2   BGTicker           103 non-null    object 
 3   BGTicker.1         166 non-null    object 
 4   Unnamed: 5         207 non-null    object 
 5   Unnamed: 6         165 non-null    object 
 6   Unnamed: 7         103 non-null    object 
 7   Unnamed: 8         53 non-null     object 
 8   Unnamed: 9         16 non-null     object 
 9   Unnamed: 10        5 non-null      object 
 10  Unnamed: 11        2 non-null      object 
 11  Unnamed: 12        1 non-null      object 
 12  Unnamed: 13        1 non-null      object 
 13  Unnamed: 14        1 non-null      object 
dtypes: float64(1), object(13)
memory usage: 29.9+ KB


In [18]:
def combine_values(row):
    return [value for value in row if not pd.isna(value)]

In [19]:
def delete_substrings(text:str, substring:str):
    if isinstance(text, str):
        if substring in text:
            text = text.replace(substring, '')
        
    return text

In [20]:
additional_ner_data.BGTicker = additional_ner_data.BGTicker.apply(lambda x: delete_substrings(x, 'RX')) #Избавляемся от приставки RX

In [24]:
additional_ner_data['all_companies_names_mentions'] = additional_ner_data.apply(combine_values, axis=1)

In [25]:
all_names_dict = additional_ner_data['all_companies_names_mentions'].to_dict() #Создадим словарь, где id компании сопоставим список из ее встречаемых названий

In [26]:
all_names_dict = {company_id: list(map(str.lower, company_names)) for company_id, company_names in all_names_dict.items()}

Теперь у нас есть словарь, и по ключу (issuerid) мы получим все вариации упоминания определенной компании  
Вернемся к датасетам **mentions** и **mentions_texts** и займемся их предобработкой

In [27]:
mentions.describe() #Наблюдаем выброс в виде issuerid == -3

Unnamed: 0,ChannelID,messageid,issuerid
count,21915.0,21915.0,21915.0
mean,1201366000.0,48830.28971,125.302441
std,107964400.0,77917.270287,78.205528
min,0.0,0.0,-3.0
25%,1138795000.0,4837.0,53.0
50%,1203561000.0,12390.0,115.0
75%,1219485000.0,48683.5,197.0
max,1868097000.0,278484.0,274.0


In [28]:
mentions_texts.describe() #Наблюдаем выброс в виде issuerid == -2

Unnamed: 0,ChannelID,messageid,issuerid,MessageID
count,19355.0,19355.0,19355.0,19355.0
mean,1200943000.0,52060.347507,124.081839,52060.347507
std,97814320.0,81240.604989,78.133251,81240.604989
min,1001030000.0,5.0,-2.0,5.0
25%,1168461000.0,4943.0,53.0,4943.0
50%,1203561000.0,13114.0,115.0,13114.0
75%,1203561000.0,52838.0,190.0,52838.0
max,1565800000.0,278484.0,274.0,278484.0


In [29]:
mentions_texts[mentions_texts.issuerid <= 0]

Unnamed: 0,ChannelID,messageid,issuerid,MessageID,DateAdded,DatePosted,MessageText,IsForward
15771,1203560567,2275,-2,2275,2021-02-06 01:47:00,2017-12-20 07:20:09,PSBR S&P Global Ratings оценило в 93 млрд руб....,False


In [30]:
mentions[mentions.issuerid <= 0]

Unnamed: 0,ChannelID,messageid,issuerid
11217,1280537383,18186,-3
16881,1203560567,2275,-2


In [31]:
mentions_texts = mentions_texts[mentions_texts.issuerid > 0] #Удаление записей с выбросами
mentions = mentions[mentions.issuerid > 0]

Количество записей в датасетах не совпадает, значит, для некоторых **messageid** имеем несколько **issuerid**, т.е. упоминание  
нескольких компаний в одном сообщении

In [32]:
mentions_texts = mentions_texts[['messageid', 'issuerid', 'MessageText']]

In [33]:
mentions = mentions[['messageid', 'issuerid']].groupby('messageid', as_index=False).agg({'issuerid': list})

In [34]:
mentions_texts = mentions_texts.drop_duplicates(subset=['MessageText'])

In [35]:
mentions_full_data = pd.merge(mentions, mentions_texts[['messageid', 'MessageText']],
                              on='messageid',
                              how='right')

In [36]:
dict_eng_words = [] #Сделаем словарь, в котором будут лежать названия компаний на латинице и которые надо оставить при форматировании текстов
for values in all_names_dict.values():
    for item in values:
        english_words = re.findall(r'\b[a-zA-Z]{5,}\b', item)
        dict_eng_words.extend(english_words)

In [37]:
latin_comps_mentions = [] #Начнем удалять из корпуса текстов MessageText названия компаний на английском языке, которые точно не являются искомыми

for text in mentions_full_data.MessageText:
    latin_words = re.findall(r'\b[a-zA-Z]{5,}\b', text) #находим все компании на английском/транслите, длиной > 4 символов
    
    latin_comps_mentions.extend(latin_words)
    

In [38]:
for latin_word in latin_comps_mentions:
    if latin_word.lower() in dict_eng_words:
        latin_comps_mentions.remove(latin_word)

In [39]:
latin_comps_mentions = list(set(latin_comps_mentions))

In [41]:
most_common_tokens = [
    'млрд', 
    'млн', 
    'трлн',
    'руб',
    'года',
    'компании', 
    'компания', 
    'компаний', 
    'акции', 
    'акций',
    'рублей',
    'год',
    'на',
    'году',
    'по',
    'сша',
    'результаты',
    'мсфо',
    'нефть', 
    'нефти',
    'ru',
    'директоров', 
    'Директоров'
    'при',
    'система',
    '30мск', 
    '00мск',
    'совет', 
    'акционеров', ''
    'ak47pfl',
    'если',
    'вопроc',
    'новости',
    'подробнее',
    'российских',
    'рамках',
    'ipo',
    'взгляд',
    'рф',
    'россии', 
    'россия', 
    'россие',
    'не',
    'дня',
    'дивиденды',
    'отчетность', 
    'отчётность',
    'биржа',
    'бирже',
    'биржи',
    'кв',
    'мск',
    'гг',
    'фн',
    'upd'    
]

In [42]:
not_company_name_natasha = [
    'лонг', 
    'мира', 
    'горячую', 
    'казначейство', 
    'распадской', 
    'детского', 
    'ведомости', 
    'акционеры', 
    'шорт', 
    'десятку', 
    'акция'
    'цб',
    'мосбиржи', 
    'мосбиржа',
    'банка',
    'сд',
    'компания',
    'совет',
    'система',
    'прайм',
    'ао',
    'рао',
    'московской',
    'тасс',
    'фрс',
    'минфин',
    'группы',
    'биржа',
    'бирже',
    'биржи',
    'en',
    'фас',
    'интерфакс',
    'за',
    'выручка',
    'не',
    'менеджмент',
    'инвестиции', 
    'инвестиций',
    'сми',
    'флэт',
    'фонд',
    'топ',
    'новости',
    'боковик',
    'минфина',
    'системы',
    'центр',
    'фн',
    'сегодня',
    'рекомендовал',
    'прогноз',
    'сигналов',
    'от',
    'размере',
    'риа',
    'объем',
    'аутсайдеры',
    'отчет',
    'правления',
    'московский',
    'служба',
    'технический',
    'приказ',
    'интерфакса',
    'распадской',
    'арбитражный',
    'до'
]

In [46]:
def message_text_preprocess(text): #Функция для предобработки текста
    remove_punctuation = '!"#$%&\'*+,./:;<=>?@[\\]^_`{|}~``🇷🇺'')('
    text = re.sub(r'\s+', ' ', text) #Заменяем последовательности пробелов и других пробельных символов на один пробел
    text = re.sub(r'(?<=[^\w\d])-|-(?=[^\w\d])|[^\w\d\s-]', '', text) #Удаляем все символы, кроме букв, цифр, пробелов и дефисов
    text = re.sub(r'\+\d{1,2}\s\(\d{3}\)\s\d{3}-\d{2}-\d{2}', '', text) #удаляем телефонные номера
    text = re.sub(r'https?://\S+', '', text) #удаляем ссылки
    text = re.sub(r'\s+', ' ', text) #еще раз удаляем пробелы, если таковые остались
    text = re.sub('•', '', text)
    text = re.sub("''", '', text)
    text = re.sub(r'[«»]', '', text)
    text = re.sub(r'\d+', '', text) #удаляем все цифры
    text = re.sub(r'\b\w\b', '', text) #удаляем одиночные символы

    
    tokens = word_tokenize(text) #Токенизируем текст
    filtered_tokens = [token for token in tokens if token not in remove_punctuation and token not in russian_stopwords and token.lower() not in latin_comps_mentions and token.lower() not in most_common_tokens]
    
    return " ".join(filtered_tokens)

In [47]:
%%time
mentions_full_data['MessageText'] = mentions_full_data['MessageText'].apply(message_text_preprocess)

CPU times: total: 1min 49s
Wall time: 1min 50s


Блок для выявления самых часто встречаемых слов, от которых можно избавиться. Добавляются в список most_common_tokens

In [48]:
mes_text_corpus = []
for value_list in mentions_full_data.MessageText:
    tokens = word_tokenize(value_list)
    mes_text_corpus.extend(tokens)

In [49]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(mes_text_corpus)
vocabulary = vectorizer.get_feature_names_out()

In [50]:
token_counts = X.sum(axis=0)

In [51]:
token_freq = {token: count for token, count in zip(vocabulary, token_counts.tolist()[0])}

In [52]:
sorted_tokens = sorted(token_freq.items(), key=lambda x: x[1], reverse=True)
top_tokens = sorted_tokens[:200]  # Первые 10 самых частых токенов

# Идея:  
• с помощью библиотеки natasha извлекаем названия компаний из корпуса текстов (ограничения для ускорения алгоритма: извлекаем максимум 3 именованные сущности, от каждой оставляем первые 10 симоволов  
• на словаре из всех названий компаний и общего корпуса, извлеченного natasha'ей обучаем tf-idf  
• после этого для предсказания id упоминаемоей компании берем то, что извлекла natasha, векторизуем и сравниваем с векторами из словаря методом косинусного сходства. id компании с топ-1 скором извлекаем сразу, а если разница между топ-1 и топ-2 не более 0.05, то извлекаем id обеих компаний

In [53]:
def fill_natasha_mentions(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger) 
    orgs = []
    for item in doc.spans:
        if len(orgs) >= 3: #Если уже найдено 3 организации, прекращаем извлечение
            break
        if item.type == 'ORG' and item.text.lower() not in not_company_name_natasha:
            orgs.append(item.text[:10])
    return orgs


In [54]:
%%time
mentions_full_data['natasha_mentions'] = mentions_full_data['MessageText'].apply(fill_natasha_mentions)

CPU times: total: 5min 2s
Wall time: 1min 16s


In [55]:
mes_natasha_corpus = [] #ищем топ самых встречаемых слов по результатам работы natasha, которые точно не являются компанией, кладем в not_company_name_natasha
for value_list in mentions_full_data.natasha_mentions:
    text = ' '.join(value_list)  #Объединяем значения в одну строку
    tokens = word_tokenize(text)  #Токенизируем строку
    mes_natasha_corpus.extend(tokens) 

In [56]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(mes_natasha_corpus)
vocabulary = vectorizer.get_feature_names_out()

token_counts = X.sum(axis=0)
token_freq = {token: count for token, count in zip(vocabulary, token_counts.tolist()[0])}

In [57]:
sorted_tokens = sorted(token_freq.items(), key=lambda x: x[1], reverse=True)
top_tokens = sorted_tokens[:400]  # Первые топ __ самых частых токенов

# -------------------------------------

In [58]:
corpus_tokens = []
for value_list in all_names_dict.values(): #Проходим по всем значениям в словаре all_names_dict
    text = ' '.join(value_list) #Объединяем значения в одну строку
    tokens = word_tokenize(text) #Токенизируем строку
    corpus_tokens.extend(tokens) #Добавляем токены в список корпуса

In [59]:
remove_punctuation = '!"#$%&\'*+,./:;<=>?@[\\]''^_`{|}~``🇷🇺'')('

In [60]:
corpus_tokens = [word for word in corpus_tokens if word.lower() not in remove_punctuation]

In [61]:
corpus_tokens = [token.replace("'", "").replace('"', '') for token in corpus_tokens]

In [62]:
corpus_tokens_natasha = []
for mention_list in mentions_full_data['natasha_mentions']:
    text = ' '.join(mention_list) #Объединяем значения в одну строку
    tokens = word_tokenize(text) #Токенизируем строку
    corpus_tokens_natasha.extend(tokens) #Добавляем токены в список корпуса

In [63]:
#corpus_tokens_natasha = [word for word in corpus_tokens_natasha if word.lower() not in remove_punctuation]
corpus_tokens_natasha = [token.replace("'", "").replace('"', '') for token in corpus_tokens_natasha]

In [64]:
model_tfidf = TfidfVectorizer(max_features=1087) 
model_tfidf.fit_transform(corpus_tokens)
model_tfidf.fit(corpus_tokens_natasha)


In [99]:
def find_best_companies(model_tfidf, mentions, all_names_dict): #функция для алгоритмического предсказания упоминания компании в тексте
    if len(mentions) == 0: #для случая отсутствия выявленных сущностей в тексте
        return 0
    natasha_vector = model_tfidf.transform([' '.join(mentions)]) #Преобразуем упоминания Natasha и компаний в вектора TF-IDF
    company_vectors = model_tfidf.transform([' '.join(company_names) for company_names in all_names_dict.values()])
    
    similarities = cosine_similarity(natasha_vector, company_vectors) #Находим косинусное расстояние между упоминаниями и компаниями

    best_company_id = similarities.argsort(axis=1)[:, -1] + 1  #компания с наивысшим скором
    second_best_company_id = similarities.argsort(axis=1)[:, -2] + 1  #компания со вторым по величине скором

    if similarities.max(axis=1)[0] - np.partition(similarities, -2, axis=1)[:, -2][0] > 0.05: #условие разницы топ-1 и топ-2 скора
        return best_company_id
    else:
        return [best_company_id[0], second_best_company_id[0]]


In [66]:
%%time
mentions_full_data['predict'] = mentions_full_data['natasha_mentions'].apply(lambda x: find_best_companies(model_tfidf, x, all_names_dict))

CPU times: total: 1min 11s
Wall time: 1min 11s


In [67]:
find_best_companies(model_tfidf, mentions_full_data.natasha_mentions[0], all_names_dict)

90

# Определение тональности

In [83]:
sentiment_texts = sentiment_texts[['issuerid', 'SentimentScore', 'MessageText']]

In [84]:
sentiment_texts.SentimentScore.unique() #удалим наблюдения, где сентимент==0

array([2, 4, 5, 3, 0, 1], dtype=int64)

In [85]:
sentiment_texts = sentiment_texts[sentiment_texts.SentimentScore != 0]

In [86]:
sentiment_texts = sentiment_texts.groupby('MessageText', as_index=False).agg({'issuerid': list, 'SentimentScore': list}) #Группируем по одинаковым текстам

In [87]:
%%time
sentiment_texts.MessageText = sentiment_texts.MessageText.apply(message_text_preprocess) #Предобрабатываем текст

CPU times: total: 45.5 s
Wall time: 45.5 s


In [88]:
%%time
sentiment_texts['natasha_mentions'] = sentiment_texts['MessageText'].apply(fill_natasha_mentions) #Выявляем именованные сущности

CPU times: total: 1min 47s
Wall time: 27 s


In [89]:
sentiment_texts['predict_id'] = sentiment_texts['natasha_mentions'].apply(lambda x: find_best_companies(model_tfidf, x, all_names_dict)) #Предсказываем id

In [108]:
sentiment_texts_

Unnamed: 0,MessageText,issuerid,SentimentScore,natasha_mentions,predict_id
0,,235,4,[],0
1,вводят санкции против банков Московский кредит...,100,2,"[Московский, ВПК, ТАСС TCSG ]",207
1,вводят санкции против банков Московский кредит...,100,2,"[Московский, ВПК, ТАСС TCSG ]",137
2,АКРОН БУДЕТ ОСПАРИВАТЬ РЕШЕНИЕ ПОЛЬШИ АКЦИЯМ A...,24,2,"[АКРОН БУДЕ, МЕЖДУНАРОД, НЕЗАКОННЫМ]",24
3,АЛРОСА выплатить итогам первого полугодия ситу...,4,4,[АЛРОСА],27
...,...,...,...,...,...
7033,компанию Мечел эффект отмены экспортных пошлин...,99,4,"[Мечел, Мечела, Мечел]",99
7034,компанию Мечел - долгожданный разворот Дмитрий...,99,5,"[Мечел, Мечел, Мечел]",99
7035,Мечел потенциалом роста свыше конца итогам дек...,99,4,"[Мечел, Мечел, Мечела]",99
7036,Мечел анализ ключевых БКС Мы по-прежнему счита...,99,4,"[Мечел, БКС, Мечел]",99


In [104]:
sentiment_texts_ = sentiment_texts.explode('issuerid')

In [105]:
sentiment_texts_ = sentiment_texts_.explode('predict_id')

In [106]:
sentiment_texts_ = sentiment_texts_.explode('SentimentScore')

# Бейзлайн - tf-idf + Random Forest

In [109]:
train, test = train_test_split(sentiment_texts_, test_size=0.2, random_state=42)
test, valid = train_test_split(test, test_size=0.5, random_state=42)

In [110]:
train_tfidf = model_tfidf.fit_transform(train['MessageText'].values)
test_tfidf = model_tfidf.transform(test['MessageText'].values)
valid_tfidf = model_tfidf.transform(valid['MessageText'].values)

In [111]:
%%time
clf = RandomForestClassifier(random_state=42)
clf.fit(train_tfidf, train['SentimentScore'].astype(int).values)

CPU times: total: 2min 15s
Wall time: 2min 15s


In [112]:
%%time
predictions = clf.predict(test_tfidf)
accuracy_score(predictions, test['SentimentScore'].astype(int).values)

CPU times: total: 125 ms
Wall time: 118 ms


0.7994438651372958

In [113]:
dump(clf, 'RF_clf.joblib') 

['RF_clf.joblib']

# Catboost (градиентный бустинг)

In [216]:
train_pool = catboost.Pool(train_tfidf, label=train['SentimentScore'].values)
test_pool = catboost.Pool(test_tfidf, label=test['SentimentScore'].values)
valid_pool = catboost.Pool(valid_tfidf, label=valid['SentimentScore'].values)


In [217]:
model_catboost = catboost.CatBoostClassifier(random_state=42, iterations=1500, learning_rate=0.0999, 
                                             depth=5, l2_leaf_reg=0.11, eval_metric='Accuracy', colsample_bylevel=0.1308)
model_catboost.fit(train_pool,
                  eval_set = test_pool,
                  verbose=100,
                  early_stopping_rounds=100)

0:	learn: 0.5733959	test: 0.5672576	best: 0.5672576 (0)	total: 225ms	remaining: 5m 36s
100:	learn: 0.7269212	test: 0.7278415	best: 0.7292318 (95)	total: 7.57s	remaining: 1m 44s
200:	learn: 0.7651505	test: 0.7518248	best: 0.7542579 (195)	total: 14.8s	remaining: 1m 35s
300:	learn: 0.7864373	test: 0.7643379	best: 0.7664234 (255)	total: 22.2s	remaining: 1m 28s
400:	learn: 0.7992962	test: 0.7737226	best: 0.7747654 (391)	total: 29.7s	remaining: 1m 21s
500:	learn: 0.8079847	test: 0.7803267	best: 0.7806743 (495)	total: 37s	remaining: 1m 13s
600:	learn: 0.8157609	test: 0.7817171	best: 0.7834550 (547)	total: 44.3s	remaining: 1m 6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7834549878
bestIteration = 547

Shrink model to first 548 iterations.


<catboost.core.CatBoostClassifier at 0x1a1c010bc90>

In [219]:
accuracy_score(valid['SentimentScore'].astype(int).values, model_catboost.predict(valid_tfidf))

0.7748436414176512

# Попробуем потюнить гиперпараметры catboost с помощью optuna

In [220]:
def objective_catboost(trial):

    params = {
         'iterations': trial.suggest_int('iterations', 500, 2000, step=200),
         'depth': trial.suggest_int('depth', 3, 6),
         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2 , step=0.01),
         'auto_class_weights': 'SqrtBalanced',
         'eval_metric': "Accuracy",
         'loss_function': 'MultiClass',
         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 1,log=True),
         'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.1, 1.0),
         'random_seed': 42
    }


    clf_catboost = catboost.CatBoostClassifier(**params)
    clf_catboost.fit(train_pool,
                      eval_set = test_pool, plot=False, verbose=100,
                    early_stopping_rounds=100)
 
    return accuracy_score(valid['SentimentScore'].astype(int).values, clf_catboost.predict(valid_tfidf))

In [221]:
study_catboost = optuna.create_study(study_name='catboost-seed42',
                                direction='maximize')

[I 2024-04-14 05:52:15,539] A new study created in memory with name: catboost-seed42


In [222]:
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_catboost.optimize(objective_catboost, n_trials=10,show_progress_bar=True)

  0%|          | 0/10 [00:00<?, ?it/s]



0:	learn: 0.5001306	test: 0.4783352	best: 0.4783352 (0)	total: 215ms	remaining: 3m 56s
100:	learn: 0.5614815	test: 0.5495169	best: 0.5502590 (98)	total: 24.6s	remaining: 4m 3s
200:	learn: 0.5983177	test: 0.5872105	best: 0.5872105 (200)	total: 49s	remaining: 3m 39s
300:	learn: 0.6243811	test: 0.6201178	best: 0.6206276 (298)	total: 1m 13s	remaining: 3m 16s
400:	learn: 0.6499665	test: 0.6422530	best: 0.6425706 (364)	total: 1m 40s	remaining: 2m 55s
500:	learn: 0.6543080	test: 0.6439926	best: 0.6445653 (491)	total: 2m 8s	remaining: 2m 33s
600:	learn: 0.6647748	test: 0.6520296	best: 0.6523549 (595)	total: 2m 33s	remaining: 2m 7s
700:	learn: 0.6724939	test: 0.6579511	best: 0.6579511 (684)	total: 2m 59s	remaining: 1m 42s
800:	learn: 0.6784593	test: 0.6644937	best: 0.6645717 (760)	total: 3m 24s	remaining: 1m 16s
900:	learn: 0.6883145	test: 0.6778857	best: 0.6781330 (892)	total: 3m 48s	remaining: 50.5s
1000:	learn: 0.6951881	test: 0.6837259	best: 0.6837259 (998)	total: 4m 14s	remaining: 25.1s
10



0:	learn: 0.5076179	test: 0.4922546	best: 0.4922546 (0)	total: 147ms	remaining: 2m 41s
100:	learn: 0.7252107	test: 0.7028519	best: 0.7040753 (96)	total: 23.8s	remaining: 3m 55s
200:	learn: 0.7719510	test: 0.7296778	best: 0.7306538 (199)	total: 45.1s	remaining: 3m 21s
300:	learn: 0.7970614	test: 0.7391236	best: 0.7391236 (296)	total: 1m 6s	remaining: 2m 56s
400:	learn: 0.8098321	test: 0.7408820	best: 0.7470831 (367)	total: 1m 27s	remaining: 2m 33s
500:	learn: 0.8215744	test: 0.7502373	best: 0.7511855 (494)	total: 1m 49s	remaining: 2m 10s
600:	learn: 0.8299044	test: 0.7525435	best: 0.7539787 (560)	total: 2m 10s	remaining: 1m 48s
700:	learn: 0.8362632	test: 0.7569006	best: 0.7578272 (668)	total: 2m 32s	remaining: 1m 26s
800:	learn: 0.8402210	test: 0.7556851	best: 0.7584477 (746)	total: 2m 53s	remaining: 1m 4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7584477436
bestIteration = 746

Shrink model to first 747 iterations.




0:	learn: 0.4993854	test: 0.4793577	best: 0.4793577 (0)	total: 68.5ms	remaining: 2m 9s
100:	learn: 0.6419032	test: 0.6377060	best: 0.6377060 (100)	total: 6.81s	remaining: 2m 1s
200:	learn: 0.6821897	test: 0.6708889	best: 0.6708889 (200)	total: 13.4s	remaining: 1m 53s
300:	learn: 0.7034279	test: 0.6927091	best: 0.6950227 (283)	total: 20.8s	remaining: 1m 50s
400:	learn: 0.7233298	test: 0.7045264	best: 0.7045264 (400)	total: 27.4s	remaining: 1m 42s
500:	learn: 0.7370714	test: 0.7114952	best: 0.7117426 (497)	total: 33.7s	remaining: 1m 34s
600:	learn: 0.7484052	test: 0.7185641	best: 0.7196543 (594)	total: 39.8s	remaining: 1m 26s
700:	learn: 0.7591119	test: 0.7268940	best: 0.7271414 (692)	total: 46s	remaining: 1m 18s
800:	learn: 0.7687927	test: 0.7292860	best: 0.7301911 (794)	total: 52.6s	remaining: 1m 12s
900:	learn: 0.7759285	test: 0.7316013	best: 0.7316793 (860)	total: 59.6s	remaining: 1m 6s
1000:	learn: 0.7820000	test: 0.7346640	best: 0.7346856 (983)	total: 1m 6s	remaining: 60s
1100:	lea



0:	learn: 0.4844840	test: 0.4715607	best: 0.4715607 (0)	total: 85.4ms	remaining: 1m 50s
100:	learn: 0.6824615	test: 0.6794323	best: 0.6794323 (100)	total: 16.1s	remaining: 3m 10s
200:	learn: 0.7298625	test: 0.7097011	best: 0.7124744 (194)	total: 33.4s	remaining: 3m 2s
300:	learn: 0.7569110	test: 0.7242154	best: 0.7242319 (267)	total: 51.2s	remaining: 2m 49s
400:	learn: 0.7756952	test: 0.7265394	best: 0.7322430 (354)	total: 1m 8s	remaining: 2m 33s
500:	learn: 0.7878435	test: 0.7354554	best: 0.7365728 (428)	total: 1m 26s	remaining: 2m 17s
600:	learn: 0.7994708	test: 0.7385537	best: 0.7438625 (574)	total: 1m 43s	remaining: 2m
700:	learn: 0.8091197	test: 0.7479283	best: 0.7480842 (690)	total: 2m 2s	remaining: 1m 44s
800:	learn: 0.8169065	test: 0.7479633	best: 0.7495970 (785)	total: 2m 20s	remaining: 1m 27s
900:	learn: 0.8231049	test: 0.7506429	best: 0.7513221 (896)	total: 2m 37s	remaining: 1m 9s
1000:	learn: 0.8287445	test: 0.7513356	best: 0.7523116 (972)	total: 2m 54s	remaining: 52s
1100:



0:	learn: 0.4777310	test: 0.4629500	best: 0.4629500 (0)	total: 59.6ms	remaining: 1m 41s
100:	learn: 0.6914280	test: 0.6697465	best: 0.6726950 (98)	total: 6.24s	remaining: 1m 38s
200:	learn: 0.7335890	test: 0.7065198	best: 0.7078501 (176)	total: 12.1s	remaining: 1m 30s
300:	learn: 0.7599923	test: 0.7296911	best: 0.7296911 (300)	total: 17.8s	remaining: 1m 22s
400:	learn: 0.7760616	test: 0.7242300	best: 0.7306957 (303)	total: 23.8s	remaining: 1m 17s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7306956963
bestIteration = 303

Shrink model to first 304 iterations.




0:	learn: 0.4703165	test: 0.4610996	best: 0.4610996 (0)	total: 44.3ms	remaining: 1m 24s
100:	learn: 0.5955340	test: 0.5871799	best: 0.5876816 (96)	total: 4.8s	remaining: 1m 25s
200:	learn: 0.6419540	test: 0.6437445	best: 0.6437445 (199)	total: 10.1s	remaining: 1m 25s
300:	learn: 0.6623049	test: 0.6517967	best: 0.6519246 (296)	total: 15.3s	remaining: 1m 21s
400:	learn: 0.6823839	test: 0.6735754	best: 0.6742330 (399)	total: 20.1s	remaining: 1m 15s
500:	learn: 0.6930340	test: 0.6856851	best: 0.6861582 (492)	total: 24.9s	remaining: 1m 9s
600:	learn: 0.7024612	test: 0.6931236	best: 0.6958597 (586)	total: 29.8s	remaining: 1m 4s
700:	learn: 0.7098353	test: 0.6993683	best: 0.6995721 (695)	total: 34.9s	remaining: 59.7s
800:	learn: 0.7158764	test: 0.7046891	best: 0.7046891 (797)	total: 39.7s	remaining: 54.4s
900:	learn: 0.7267800	test: 0.7074465	best: 0.7079127 (866)	total: 44.5s	remaining: 49.4s
1000:	learn: 0.7329412	test: 0.7103656	best: 0.7111856 (964)	total: 49.2s	remaining: 44.2s
1100:	lea



0:	learn: 0.4919946	test: 0.4759614	best: 0.4759614 (0)	total: 125ms	remaining: 1m 2s
100:	learn: 0.7064500	test: 0.6923211	best: 0.6933821 (98)	total: 12s	remaining: 47.5s
200:	learn: 0.7529236	test: 0.7143593	best: 0.7143593 (200)	total: 23.4s	remaining: 34.8s
300:	learn: 0.7755691	test: 0.7344126	best: 0.7347891 (277)	total: 35s	remaining: 23.2s
400:	learn: 0.7887572	test: 0.7435863	best: 0.7435863 (399)	total: 46.5s	remaining: 11.5s
499:	learn: 0.7995589	test: 0.7421718	best: 0.7488047 (486)	total: 57.8s	remaining: 0us

bestTest = 0.7488047402
bestIteration = 486

Shrink model to first 487 iterations.




0:	learn: 0.4847216	test: 0.4682424	best: 0.4682424 (0)	total: 72.9ms	remaining: 51s
100:	learn: 0.6629730	test: 0.6566255	best: 0.6566255 (100)	total: 9.48s	remaining: 56.2s
200:	learn: 0.7037670	test: 0.6875957	best: 0.6882534 (198)	total: 18.7s	remaining: 46.5s
300:	learn: 0.7281873	test: 0.7054898	best: 0.7058931 (290)	total: 27.9s	remaining: 37s
400:	learn: 0.7478171	test: 0.7141805	best: 0.7148735 (370)	total: 37.2s	remaining: 27.8s
500:	learn: 0.7603097	test: 0.7210600	best: 0.7213853 (497)	total: 46.4s	remaining: 18.4s
600:	learn: 0.7704588	test: 0.7295289	best: 0.7295289 (581)	total: 55.9s	remaining: 9.21s
699:	learn: 0.7793817	test: 0.7325067	best: 0.7328035 (664)	total: 1m 5s	remaining: 0us

bestTest = 0.7328034644
bestIteration = 664

Shrink model to first 665 iterations.




0:	learn: 0.4995109	test: 0.4926757	best: 0.4926757 (0)	total: 680ms	remaining: 16m 59s
100:	learn: 0.7375279	test: 0.7077130	best: 0.7077130 (100)	total: 37.5s	remaining: 8m 40s
200:	learn: 0.7802135	test: 0.7348476	best: 0.7350036 (195)	total: 1m 13s	remaining: 7m 52s
300:	learn: 0.8015806	test: 0.7435920	best: 0.7445966 (299)	total: 1m 48s	remaining: 7m 12s
400:	learn: 0.8141132	test: 0.7474748	best: 0.7490935 (382)	total: 2m 24s	remaining: 6m 36s
500:	learn: 0.8243012	test: 0.7514291	best: 0.7521858 (431)	total: 2m 59s	remaining: 5m 58s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7521858001
bestIteration = 431

Shrink model to first 432 iterations.




0:	learn: 0.5012050	test: 0.4866801	best: 0.4866801 (0)	total: 338ms	remaining: 7m 18s
100:	learn: 0.7091296	test: 0.6972405	best: 0.6995819 (97)	total: 32.4s	remaining: 6m 25s
200:	learn: 0.7632866	test: 0.7284256	best: 0.7284256 (200)	total: 1m 4s	remaining: 5m 51s
300:	learn: 0.7883671	test: 0.7392175	best: 0.7399531 (292)	total: 1m 36s	remaining: 5m 18s
400:	learn: 0.8086442	test: 0.7463711	best: 0.7463711 (400)	total: 2m 7s	remaining: 4m 45s
500:	learn: 0.8213135	test: 0.7502744	best: 0.7525873 (479)	total: 2m 39s	remaining: 4m 13s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7525873244
bestIteration = 479

Shrink model to first 480 iterations.


In [132]:
study_catboost.best_params #Параметры лучшей модели

{'iterations': 500,
 'depth': 5,
 'learning_rate': 0.12,
 'l2_leaf_reg': 0.16960494498640707,
 'colsample_bylevel': 0.23613115193045567}

In [133]:
best_cb_model = catboost.CatBoostClassifier(**study_catboost.best_params) #лучшая модель

In [134]:
best_cb_model.fit(train_pool,
                  eval_set = test_pool,
                  verbose=100,
                  early_stopping_rounds=100)

0:	learn: 1.6741021	test: 1.6680147	best: 1.6680147 (0)	total: 120ms	remaining: 59.7s
100:	learn: 0.9673294	test: 0.9834026	best: 0.9832746 (99)	total: 17.7s	remaining: 1m 10s
200:	learn: 0.8446145	test: 0.9222759	best: 0.9222355 (199)	total: 35.6s	remaining: 52.9s
300:	learn: 0.7704406	test: 0.8988053	best: 0.8985376 (299)	total: 54.3s	remaining: 35.9s
400:	learn: 0.7148313	test: 0.8798525	best: 0.8798525 (400)	total: 1m 13s	remaining: 18.1s
499:	learn: 0.6717635	test: 0.8729500	best: 0.8717626 (475)	total: 1m 32s	remaining: 0us

bestTest = 0.8717625816
bestIteration = 475

Shrink model to first 476 iterations.


<catboost.core.CatBoostClassifier at 0x2b1c1c3b0d0>

In [135]:
best_preds = best_cb_model.predict(valid_tfidf)

In [136]:
accuracy_score(valid['SentimentScore'].values, best_preds)

0.6189451022604952

# Обучение ruBERT-tiny

In [570]:
label2id = {
    'very_negative': 0,
    'negative': 1,
    'neutral': 2,
    'positive': 3,
    'very_positive': 4
}

id2label = {
    0: 'very_negative',
    1: 'negative',
    2: 'neutral',
    3: 'positive',
    4: 'very_positive'
}

In [None]:
sentiment_texts.SentimentScore -= 1 #Вычитаем единицу, т.к. первый класс для rubert-tiny - 0, а не 1

In [232]:
sentiment_texts.SentimentScore.unique()

array([2, 4, 5, 3, 0, 1], dtype=int64)

In [572]:
model = AutoModelForSequenceClassification.from_pretrained(
    "cointegrated/rubert-tiny", num_labels=len(id2label.keys()), label2id=label2id, id2label=id2label
)

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny a

In [573]:
data_list_of_dicts = [] #приводим данные к виду для обучения
for idx, row in sentiment_texts.iterrows():
    text = row['MessageText']
    label = row['SentimentScore']
    data_list_of_dicts.append({'text': text, 'label': label})

In [574]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
def tokenize_data(text):
    return tokenizer(text['text'], padding=True, truncation=True, max_length=256, return_tensors='pt')

In [575]:
shuffle(data_list_of_dicts)
train_bert = data_list_of_dicts[:7300]
test_bert = data_list_of_dicts[7300:]
train_bert = Dataset.from_pandas(pd.DataFrame(data=train_bert))
test_bert = Dataset.from_pandas(pd.DataFrame(data=test_bert))
tokenized_train = train_bert.map(tokenize_data, batched=True)
tokenized_test = test_bert.map(tokenize_data, batched=True)

Map:   0%|          | 0/7300 [00:00<?, ? examples/s]

Map:   0%|          | 0/1826 [00:00<?, ? examples/s]

In [558]:
f1 = evaluate.load("f1")

def compute_metrics(eval_pred): #функция для подсчета метрик
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    f1_score = f1.compute(predictions=predictions, references=labels, average='macro')
    accuracy = accuracy_score(labels, predictions)
    
    return {'f1': f1_score, 'accuracy': accuracy}

In [576]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="akra_model",
    learning_rate=2e-3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=7,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [577]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [588]:
predict(single_test['input_ids'], single_test['token_type_ids'], single_test['attention_mask']).logits.detach().numpy()

array([[-2.829366  , -0.11736619,  0.95662   ,  0.9494317 ,  0.27798185]],
      dtype=float32)