# Импортируем библиотеки


In [1028]:
import pickle
import json
import glob
import re

import pandas as pd

import time
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, precision_score

from langchain.text_splitter import RecursiveCharacterTextSplitter

from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Pooling, Transformer

import uuid

from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct

from catboost import CatBoostClassifier, Pool

import optuna

pd.options.mode.chained_assignment = None 

Чтение и преобразование размеченных данных в pandas-таблицу

In [895]:
path_to_files = 'gemini_predicted_data/'
file_list = glob.glob(path_to_files + "gemini_preds_*.pickle")

In [896]:
file_list

['gemini_predicted_data\\gemini_preds_1925.pickle',
 'gemini_predicted_data\\gemini_preds_389.pickle',
 'gemini_predicted_data\\gemini_preds_400.pickle',
 'gemini_predicted_data\\gemini_preds_700.pickle',
 'gemini_predicted_data\\gemini_preds_from700_705.pickle',
 'gemini_predicted_data\\gemini_preds_from700_720.pickle',
 'gemini_predicted_data\\gemini_preds_from700_735.pickle',
 'gemini_predicted_data\\gemini_preds_from700_750.pickle',
 'gemini_predicted_data\\gemini_preds_from700_765.pickle',
 'gemini_predicted_data\\gemini_preds_from700_780.pickle',
 'gemini_predicted_data\\gemini_preds_from700_795.pickle',
 'gemini_predicted_data\\gemini_preds_from700_810.pickle',
 'gemini_predicted_data\\gemini_preds_from700_825.pickle',
 'gemini_predicted_data\\gemini_preds_from700_840.pickle',
 'gemini_predicted_data\\gemini_preds_from700_855.pickle',
 'gemini_predicted_data\\gemini_preds_from700_870.pickle',
 'gemini_predicted_data\\gemini_preds_from700_885.pickle',
 'gemini_predicted_data\\gem

In [897]:
df = pd.DataFrame()

for file in file_list:
    with open(file, 'rb') as file:
        data = pickle.load(file)
    
    temp_df = pd.DataFrame(data=data)
    df = pd.concat([df, temp_df], axis=0)

In [898]:
df = df.drop_duplicates(subset=['text', 'gigachat_preds']).rename(columns={'gigachat_preds': 'gemini_preds'})

Предобработка данных

In [899]:
def remove_surrounding_newlines_and_backticks(text):
    """
    Проверяет, начинается ли строка с "\n```" и заканчивается ли на "\n```",
    и если да, удаляет эти последовательности.

    Args:
        text (str): Входная строка.

    Returns:
        str: Строка с удаленными начальными и конечными "\n```", 
             если они присутствовали.
    """
    start_sequence = "```\n"
    end_sequence = "\n```"
    
    second_start_sequence = "```json"
    second_end_sequence = "```"
    
    if text.startswith(start_sequence) and text.endswith(end_sequence):
        return text[len(start_sequence):-len(end_sequence)]
    elif text.startswith(second_start_sequence) and text.endswith(second_end_sequence):
        return text[len(second_start_sequence):-len(second_end_sequence)]
    
    else:
        return text

In [900]:
def remove_all_newlines_and_trailing_spaces(text):
    """
    Удаляет все символы новой строки ("\n") и пробелы в конце строки.

    Args:
        text (str): Входная строка.

    Returns:
        str: Строка без символов новой строки и пробелов в конце.
    """
    text = text.replace("\n", "")  # Удаляем символы новой строки
    return text.rstrip()

In [901]:
def extract_values_from_json_strings(df, column_name):
    """
    Извлекает значения из JSON-строк в указанном столбце DataFrame
    и создает новые колонки для каждого ключа.

    Args:
        df (pd.DataFrame): DataFrame с JSON-строками.
        column_name (str): Название столбца с JSON-строками.
    """
    def parse_json_string(text):
        try:
            text = text.replace("'", '"')  # Заменяем одинарные кавычки на двойные
            data = json.loads(text)
            return data.get('prediction'), data.get('agitation'), data.get('emotions'), data.get('politics')
        except json.JSONDecodeError:
            return None, None, None, None
        
    df[['prediction', 'agitation', 'emotions', 'politics']] = df[column_name].apply(parse_json_string).apply(pd.Series)
    
    return df

In [902]:
def convert_list_to_string(text_list):
    if text_list:  # Проверить, не пустой ли список
        return ' '.join(text_list)
    else:
        return ''  # Вернуть пустую строку для пустых списков

In [903]:
%%time

df['gemini_preds'] = df['gemini_preds'].apply(remove_surrounding_newlines_and_backticks)
df['gemini_preds'] = df['gemini_preds'].apply(remove_all_newlines_and_trailing_spaces)

CPU times: total: 15.6 ms
Wall time: 5 ms


In [904]:
%%time

data = extract_values_from_json_strings(df, 'gemini_preds')

CPU times: total: 297 ms
Wall time: 284 ms


In [905]:
data = data.reset_index(drop=True)

In [906]:
data['agitation'] = data['agitation'].apply(convert_list_to_string)
data['emotions'] = data['emotions'].apply(convert_list_to_string)
data['politics'] = data['politics'].apply(convert_list_to_string)

In [910]:
data

Unnamed: 0,text,gemini_preds,index,prediction,agitation,emotions,politics
0,кб двухбуквенная аббревиатура наиболее вероятн...,"{'prediction': 1, 'agitation': [], 'emotions':...",999,1.0,,,
1,марина орлова орлова марина викторовна российс...,"{'prediction': 1, 'agitation': [], 'emotions':...",1000,1.0,,,
2,золотая серп и молот ссср учреждена указом пре...,"{'prediction': 1, 'agitation': [], 'emotions':...",1001,1.0,,,
3,тбилиси страны стран участников спортсмен меда...,"{'prediction': 1, 'agitation': [], 'emotions':...",1002,1.0,,,
4,портрет п а столыпина картина ильи репина напи...,"{'prediction': 1, 'agitation': [], 'emotions':...",1003,1.0,,,
...,...,...,...,...,...,...,...
1808,город в регион нет поселения поселение поселен...,"{'prediction': 0, 'agitation': [], 'emotions':...",984,0.0,,великолепный мягкие,советская власть гражданская война белая армия...
1809,крестовые походы религиозные военные экспедици...,"{'prediction': 1, 'agitation': [], 'emotions':...",985,1.0,,,
1810,флага герб герба страна испания региона автоно...,"{'prediction': 1, 'agitation': [], 'emotions':...",986,1.0,,,
1811,сингл третий сингл эминема с альбома альбом вт...,"{'prediction': 1, 'agitation': [], 'emotions':...",988,1.0,,,


In [911]:
data.prediction.value_counts() # число записей для каждого класса

1.0    1496
0.0     310
Name: prediction, dtype: int64

In [912]:
data.loc[data.prediction.isna()] = 0

In [913]:
data.prediction = data.prediction.astype(int)

# Готовим данные для векторной БД

In [954]:
#дополнительные данные для разделов "политика" и "эмоции"
with open('emotional_data_list.pickle', 'rb') as file:
    politics_list = pickle.load(file)
    
with open('emotional_words_list.pickle', 'rb') as file:
    emotions_list = pickle.load(file)

In [977]:
politics_list = politics_list[:2000]

In [978]:
emotions_list = emotions_list[:2000]

In [979]:
politics_list_string = ' '.join(politics_list)
emotions_list_string = ' '.join(emotions_list)

In [980]:
def merge_columnText_into_string(df: pd.DataFrame, column_name: str):
    df[column_name] = df[column_name].astype(str)
    merged_string = " ".join(df[column_name].tolist())
    
    return merged_string

In [981]:
def text_to_chunks(text: str, sep, chunk_size: int, chunk_overlap: int):
    # Разбиваем текст на чанки
    text_splitter = RecursiveCharacterTextSplitter(
        separators = sep, # разделитель, в нашем случае - пробел ' '
        chunk_size = chunk_size, # размер чанка
        chunk_overlap = chunk_overlap, # наслаивание чанков. Например, первый чанк - от 0 до 100 символа, 2 чанк от 80 до 180. Наслаивание - 20
        length_function = len,
        is_separator_regex = False,
        add_start_index = False
    )
    chunks = text_splitter.split_text(text)
    
    return chunks

In [982]:
merged_emotions_string = merge_columnText_into_string(data, 'emotions') #объединяем найденные слова-маркеры в одну строку
merged_agitation_string = merge_columnText_into_string(data, 'agitation')
merged_politics_string = merge_columnText_into_string(data, 'politics')

In [999]:
merged_string_PAE = ' '.join([merged_emotions_string, merged_agitation_string, merged_politics_string, emotions_list_string, politics_list_string])

In [1000]:
len(merged_string_PAE)

111827

In [1001]:
def delete_extra_spaces(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    
    return text

In [1002]:
merged_string_PAE = delete_extra_spaces(merged_string_PAE)

In [1003]:
token_length = list()
for token in merged_string_PAE.split(' '):
    token_length.append(len(token))

In [1013]:
print(f'Средняя длина слова из конкатенированной строки: {np.mean(token_length)}')

Средняя длина слова из конкатенированной строки: 8.211571526822558


In [1005]:
# теперь после разбиения на чанки, на каждый чанк надо навесить эмбединг, чтобы положить получившиеся вектора в векторную БД
# Подгружаем bi-encoder 
def get_bi_encoder(bi_encoder_name):
    raw_model = Transformer(model_name_or_path=f'{bi_encoder_name}')

    # Вытаскиваем размер векторов
    bi_encoder_dim = raw_model.get_word_embedding_dimension()
    
    pooling_model = Pooling(
        bi_encoder_dim,
        pooling_mode_cls_token = False,
        pooling_mode_mean_tokens = True
    )
    bi_encoder = SentenceTransformer(
        modules = [raw_model, pooling_model]
    )
    
    return bi_encoder, bi_encoder_dim

In [1006]:
bi_encoder, bi_encoder_dim = get_bi_encoder('cointegrated/rubert-tiny2')

In [1007]:
# Формируем из строки вектор
def str_to_vec(bi_encoder, text):
    embeddings = bi_encoder.encode(
        text,
        convert_to_tensor = True
    )
    return embeddings

# Непосредственно подходим к созданию векторной БД

In [1024]:
COLL_NAME = 'znanie_hackathon_db'
SEP = ' '
CHUNK_SIZE = 50
CHUNK_OVERLAP = 10

In [1025]:
# Создаем подключение к векторной БД
qdrant_client = QdrantClient(
    url="https://3bff1843-f3d9-4163-9662-c51ae29efadb.europe-west3-0.gcp.cloud.qdrant.io:6333", 
    api_key="vDtHoKJfQgdmmw9RzjEcaJjIsRIlywXo79tE65enlw2WIywzwHw-dA",
    timeout=240
)


In [1016]:
# Помещаем чанки и доп. информаицю в векторую БД
def save_chunks(bi_encoder, chunks):
    # Конвертируем чанки в векитора
    chunk_embeddings = str_to_vec(bi_encoder, chunks)

    # Создаем объект(ы) для БД
    points = []
    for i in range(len(chunk_embeddings)):
        point = PointStruct(
            id=str(uuid.uuid4()), # генерируем GUID
            vector = chunk_embeddings[i].tolist(),
            payload={'chunk': chunks[i]}
        )
        points.append(point)
    
    # Сохраняем вектора в БД
    operation_info = qdrant_client.upsert(
        collection_name = COLL_NAME,
        wait = True,
        points = points
    )
    
    return operation_info

In [1017]:
def texts_to_vecdb(text, bi_encoder, vec_size, sep, chunk_size, chunk_overlap):    
    # Удаляем и заново создаем коллекцию
    qdrant_client.delete_collection(collection_name=COLL_NAME)
    qdrant_client.create_collection(
        collection_name = COLL_NAME,
        vectors_config = VectorParams(size=vec_size, distance=Distance.COSINE), # size=312 (rubert-tiny2)
    )
    

    # делим на чанки ...
    chunks = text_to_chunks(text, sep, chunk_size, chunk_overlap)
    # помещаем чанки в векторную БД
    operation_status = save_chunks(bi_encoder, chunks)
    
    return operation_status

In [1026]:
%%time

texts_to_vecdb(merged_string_PAE, bi_encoder, bi_encoder_dim, SEP, CHUNK_SIZE, CHUNK_OVERLAP)

CPU times: total: 28.8 s
Wall time: 23.3 s


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

# Поиск векторов по косинусной близости

In [475]:
def vec_search(bi_encoder, query, n_top_cos):
    # Кодируем запрос в вектор
    query_emb = str_to_vec(bi_encoder, query).tolist()

    # Поиск в БД
    search_result = qdrant_client.search(
        collection_name = COLL_NAME,
        query_vector = query_emb,
        limit = n_top_cos,
        with_vectors = False
    )
    
    top_chunks = [[x.payload['chunk'], x.score] for x in search_result]
    
    return top_chunks

# Последняя идея - обучить на дополнительных текстовых фичах градиентный бустинг

In [478]:
data = data[['text', 'agitation', 'emotions', 'politics', 'prediction']]

In [480]:
# заполним пустые поля колонок фиктивным словом EMPTY
data['agitation'] = data['agitation'].apply(lambda x: 'EMPTY' if len(x) == 0 else x)
data['emotions'] = data['emotions'].apply(lambda x: 'EMPTY' if len(x) == 0 else x)
data['politics'] = data['politics'].apply(lambda x: 'EMPTY' if len(x) == 0 else x)

In [481]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['prediction']), 
                                                    data['prediction'],
                                                    test_size=0.2,
                                                    random_state=42)

In [824]:
model_tfidf = TfidfVectorizer(max_features=5000)

train_features = model_tfidf.fit_transform(X_train["text"].astype(str) + " " + X_train["agitation"].astype(str) + " " + X_train["emotions"].astype(str) + " " + X_train["politics"].astype(str))
test_features = model_tfidf.transform(X_test["text"].astype(str) + " " + X_test["agitation"].astype(str) + " " + X_test["emotions"].astype(str) + " " + X_test["politics"].astype(str))

In [825]:
test_features

<363x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 49934 stored elements in Compressed Sparse Row format>

In [826]:
train_pool = Pool(
    train_features,
    y_train
)

test_pool = Pool(
    test_features,
    y_test
)

In [827]:
def objective_catboost(trial):

    params = {
         'iterations': trial.suggest_int('iterations', 500, 2000, step=100),
         'depth': trial.suggest_int('depth', 3, 7),
         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1 , step=0.01),
         'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.2, 0.9),
         #'auto_class_weights': 'Balanced',
         'eval_metric': "F1",
         'loss_function': 'Logloss',
         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 1,log=True),
         'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.1, 1.0),
         'random_seed': 42
    }


    clf_catboost = CatBoostClassifier(**params)
    clf_catboost.fit(train_pool,
                      eval_set = test_pool, plot=False, verbose=False,
                    early_stopping_rounds=100)
 
    return recall_score(y_test, clf_catboost.predict(test_features), pos_label=0)

In [828]:
study_catboost = optuna.create_study(study_name='catboost-seed42',
                                direction='maximize')

In [829]:
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_catboost.optimize(objective_catboost, n_trials=100,show_progress_bar=True)

  0%|          | 0/100 [00:00<?, ?it/s]

In [1036]:
study_catboost.best_params

{'iterations': 1600,
 'depth': 4,
 'learning_rate': 0.09,
 'scale_pos_weight': 0.2324232236144656,
 'l2_leaf_reg': 0.06833908762823766,
 'colsample_bylevel': 0.9076865755180072}

In [1037]:
clf = CatBoostClassifier(**study_catboost.best_params)

In [1038]:
clf.fit(train_pool, eval_set=test_pool, plot=False, verbose=False, early_stopping_rounds=100)

<catboost.core.CatBoostClassifier at 0x228d5721b90>

In [1046]:
preds

array([0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [1051]:
recall_0 = recall_score(y_test, preds, pos_label=0)
precision_score_0 = precision_score(y_test, preds, pos_label=0)
f1_score_value = f1_score(y_test, preds.astype(int), average='weighted')

print(f"Recall для класса 0: {recall_0}")
print(f"Precision для класса 0: {precision_score_0}")
print(f"F1-score: {f1_score_value}")

Recall для класса 0: 0.828125
Precision для класса 0: 0.6309523809523809
F1-score: 0.8901134827526775


# Обучение ruBERT-tiny

In [865]:
label2id = {
    'neutral': 1,
    'not_neutral': 0
}

id2label = {
    1: 'neutral',
    0: 'not_neutral'
}

In [866]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    "cointegrated/rubert-tiny", num_labels=len(id2label.keys()), id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [867]:
data_list_of_dicts = []
for idx, row in data.iterrows():
    text = row['text']
    label = row['prediction']
    data_list_of_dicts.append({'text': str(text), 'label': label})


In [868]:
len(data_list_of_dicts)

1813

In [869]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
def token(text):
    return tokenizer(text['text'], padding=True, truncation=True, max_length=512, return_tensors='pt')

In [870]:
from datasets import Dataset
from random import shuffle


shuffle(data_list_of_dicts)
train = data_list_of_dicts[:1400]
test = data_list_of_dicts[1400:]
train = Dataset.from_pandas(pd.DataFrame(data=train))
test = Dataset.from_pandas(pd.DataFrame(data=test))
tokenized_train = train.map(token, batched=True)
tokenized_test = test.map(token, batched=True)

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/413 [00:00<?, ? examples/s]

In [871]:
tokenized_test

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 413
})

In [1049]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    # Вычисляем взвешенную F1
    f1 = f1_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, pos_label=0)
    return {'f1': f1, 'recall': recall}  # Возвращаем словарь с метрикой

In [873]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
training_args = TrainingArguments(
    output_dir="akra_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=7,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=weighted_loss,
)

In [874]:
trainer.train()

Epoch,Training Loss,Validation Loss,Recall
1,No log,0.393821,0.236533
2,No log,0.33736,0.226946
3,No log,0.332532,0.246556
4,No log,0.316627,0.201663
5,No log,0.317326,0.178307
6,0.348700,0.314068,0.198062
7,0.348700,0.313896,0.195288


TrainOutput(global_step=616, training_loss=0.3357907332383193, metrics={'train_runtime': 3866.6681, 'train_samples_per_second': 2.534, 'train_steps_per_second': 0.159, 'total_flos': 72267228364800.0, 'train_loss': 0.3357907332383193, 'epoch': 7.0})

In [802]:
with open('ruBERT_tiny_model_2.pickle', 'wb') as file:
    pickle.dump(model, file)

In [875]:
predictions = trainer.predict(tokenized_test)

In [864]:
predicted_labels = predictions.predictions.argmax(axis=1)
true_labels = tokenized_test['label']

In [1052]:
f1_score(predicted_labels, true_labels, average='weighted')

0.903054448871182