In [113]:
import pandas as pd
import numpy as np
import json
import torch
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier, Pool
from matplotlib import pyplot as plt

from transformers import AutoModelForCausalLM, AutoTokenizer

tqdm.pandas()

%matplotlib inline

In [2]:
def load_train_data(data_file: str, labels_file: str):
    all_texts = []
    all_labels = []
    all_part_indicies = []

    labels_df = pd.read_csv(labels_file)
    labels_df = labels_df[labels_df["participant_index"] == 0]
    labels_dict = dict(zip(labels_df["dialog_id"], labels_df["is_bot"]))

    with open(data_file, "r", encoding="utf-8") as f:

        data = json.load(f)
        for key in data.keys():
            messages = data[key]

            part_0_texts = [
                m["text"] for m in messages if m["participant_index"] == "0"
            ]
            part_1_texts = [
                m["text"] for m in messages if m["participant_index"] == "1"
            ]

            part_0_label = int(labels_dict[key])
            part_1_label = 1 - part_0_label

            text_0 = " ".join(part_0_texts)
            text_1 = " ".join(part_1_texts)

            all_texts.append(text_0)
            all_labels.append(part_0_label)
            all_part_indicies.append(0)

            all_texts.append(text_1)
            all_labels.append(part_1_label)
            all_part_indicies.append(1)

    df = pd.DataFrame({"text": all_texts, "participant_index": all_part_indicies, "is_bot": all_labels})
    return df


def load_test_data(data_file: str, labels_file: str):
    df_info = pd.read_csv(labels_file)

    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    all_texts = []
    ids = []
    part_indicies = []

    for _, row in df_info.iterrows():
        dialog_id = row["dialog_id"]
        participant_index = str(row["participant_index"])
        messages = data[dialog_id]

        texts = [
            m["text"] for m in messages if m["participant_index"] == participant_index
        ]
        combined_text = " ".join(texts)
        all_texts.append(combined_text)
        ids.append(row["ID"])
        part_indicies.append(participant_index)

    df = pd.DataFrame({"ID": ids, "text": all_texts, "participant_index": part_indicies})
    return df

In [3]:
# MODEL = "openai/whisper-large-v3"
MODEL = "openai/whisper-tiny"
# MODEL = "gpt2"
# MODEL = 'cointegrated/rubert-tiny2'

model_p = AutoModelForCausalLM.from_pretrained(
    # "gpt2", #"microsoft/phi-1.5",
    MODEL,
    torch_dtype="auto",
    trust_remote_code=True,
    output_hidden_states=True
)
tokenizer = AutoTokenizer.from_pretrained(
    # "gpt2", "microsoft/phi-1.5",
    MODEL,
    trust_remote_code=True
)

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

2025-05-08 06:21:10.105536: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746685270.333436      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746685270.392922      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [4]:
device = "cuda"
# device = "cpu"
model_p = model_p.to(device)

In [5]:
model_p.model.decoder.embed_positions

WhisperPositionalEmbedding(448, 384)

In [6]:
def calculate_perplexity(text, model, tokenizer, device):
    # Токенизация
    encodings = tokenizer(text, return_tensors="pt", truncation=True, max_length=448)
    encodings = {k: v.to(device) for k, v in encodings.items()}

    # Вычисление mean negative log-likelihood (nll) per token
    # (стандартный loss для LLM в библиотеке transformers)
    input_ids = encodings["input_ids"]
    # print(encodings)
    with torch.no_grad():
        outputs = model(**encodings, labels=input_ids)
        neg_log_likelihood = outputs.loss.item()

    # return neg_log_likelihood
    perplexity = np.exp(neg_log_likelihood) # Перплексия вычисляется как e ^ nll
    return perplexity

In [7]:
df = load_train_data(
    "/kaggle/input/you-are-bot/train.json",
    "/kaggle/input/you-are-bot/ytrain.csv"
)

In [8]:
df["perplexity"] = df["text"].progress_apply(lambda x: calculate_perplexity(x, model_p, tokenizer, device))

  0%|          | 0/1572 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
100%|██████████| 1572/1572 [00:07<00:00, 216.70it/s]


In [66]:
df

Unnamed: 0,text,participant_index,is_bot,perplexity
0,Hello! Отлично! А твои? Расскажи теорему,0,0,6.417432e+05
1,Как дела? Это круто!,1,1,1.520366e+06
2,Привет никак оооокккееуу оууукккии оооуууллкке...,0,0,4.418490e+06
3,Привет! Как я могу помочь тебе сегодня? Хорошо...,1,1,3.478797e+05
4,Привет Ты бот?,0,0,9.961852e+05
...,...,...,...,...
1567,"привет, как дела? ну, нормально, а у тебя? хз,...",1,1,3.737608e+05
1568,привет 🇬🇳 АХАХАХХАХА,0,0,2.705776e+05
1569,гойда данил иди нах придурок,1,1,1.959228e+06
1570,где что когда почему 20 04 как где сколько теб...,0,0,2.403816e+06


In [10]:
X = df[["text", "participant_index", "perplexity"]]
y = df["is_bot"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [15]:
df_test = load_test_data(
    "/kaggle/input/you-are-bot/test.json",
    "/kaggle/input/you-are-bot/ytest.csv"
)

In [16]:
df_test["perplexity"] = df_test["text"].progress_apply(lambda x: calculate_perplexity(x, model_p, tokenizer, device))

100%|██████████| 676/676 [00:03<00:00, 211.90it/s]


In [17]:
df_test

Unnamed: 0,ID,text,participant_index,perplexity
0,af36ac2aa9734738bbd533db8e5fb43a_0,Привет привет ты бот? Мне запрещено отвечать н...,0,1.133211e+06
1,cdc2c5c605144c8e8dd5e9ea3d1352fc_0,Hi! ты бот? Понял Вас а Вы?,0,4.378838e+05
2,ed19efdedcb24600aea67c968aba5520_0,"не знаю, я устал ya toze",0,5.222963e+06
3,f2ea031960cf4454b4596d94cbee021e_0,"Присет присед? болгарин? от это совпадение, я ...",0,1.078584e+06
4,d948808cda4944cd838f88308a9ecd8b_0,ты кто? ff5969ad3adb,0,1.050394e+05
...,...,...,...,...
671,23ce3b6cf164467386e2b34db908dbc3_1,meow agh a a a,1,7.405932e+04
672,4dad8117d3c946ef9c021aac9e5ded02_1,Мне плохо Помоги мне преодолеть апатию Как бор...,1,4.934309e+06
673,8e822ce1089741febae586c5fef99124_1,"чел, ты о чём? ээ, ну твое какое? расскажи чут...",1,1.585196e+06
674,56201a8ac9c64665aa6d236dbc79daf4_1,"Привет! О, ну я бы, наверное, просто посмеялся...",1,8.278923e+05


In [98]:
data_prep = Pipeline(
    [
        (
            "vectorizer", ColumnTransformer(
                transformers=[
                    (
                        "text",
                        Pipeline(
                            [
                                ("squeez", FunctionTransformer(lambda x: x.squeeze())),
                                ("tfidf", TfidfVectorizer()),
                                ("toarray", FunctionTransformer(lambda x: x.toarray())),
                            ]
                        ),
                        ["text"]
                    ),
                    # ("identity", FunctionTransformer(lambda x: x), ["text", "participant_index", "perplexity"]),
                    # ("scaller", StandardScaler(), ["perplexity"])
                ]
            )
        ),
        # ("model", LogisticRegression(random_state=42)),
        # ("model", CatBoostClassifier(random_seed=42)),
    ]
)


In [99]:
# train_data = Pool(data_prep.fit_transform(X_train), y_train)
# test_data = Pool(data_prep.transform(X_test), y_test)

train_data = Pool(
    X_train,
    # pd.concat([X_train.reset_index(drop=True), pd.DataFrame(data_prep.fit_transform(X_train))], axis=1),
    y_train,
    text_features=["text"]
)
test_data = Pool(
    X_test,
    # pd.concat([X_test.reset_index(drop=True), pd.DataFrame(data_prep.transform(X_test))], axis=1),
    y_test,
    text_features=["text"]
)

In [100]:
clf = CatBoostClassifier(random_seed=42, task_type="GPU", loss_function="Logloss")

In [101]:
clf.fit(train_data, eval_set=test_data)

Learning rate set to 0.069836
0:	learn: 0.6537557	test: 0.6544881	best: 0.6544881 (0)	total: 142ms	remaining: 2m 21s
1:	learn: 0.6232967	test: 0.6274255	best: 0.6274255 (1)	total: 202ms	remaining: 1m 40s
2:	learn: 0.5930661	test: 0.5983329	best: 0.5983329 (2)	total: 261ms	remaining: 1m 26s
3:	learn: 0.5718347	test: 0.5762123	best: 0.5762123 (3)	total: 313ms	remaining: 1m 17s
4:	learn: 0.5464063	test: 0.5525535	best: 0.5525535 (4)	total: 368ms	remaining: 1m 13s
5:	learn: 0.5273298	test: 0.5346557	best: 0.5346557 (5)	total: 416ms	remaining: 1m 8s
6:	learn: 0.5137225	test: 0.5212469	best: 0.5212469 (6)	total: 454ms	remaining: 1m 4s
7:	learn: 0.5010544	test: 0.5086397	best: 0.5086397 (7)	total: 489ms	remaining: 1m
8:	learn: 0.4904529	test: 0.4971363	best: 0.4971363 (8)	total: 522ms	remaining: 57.5s
9:	learn: 0.4780737	test: 0.4884316	best: 0.4884316 (9)	total: 556ms	remaining: 55s
10:	learn: 0.4691044	test: 0.4793552	best: 0.4793552 (10)	total: 589ms	remaining: 52.9s
11:	learn: 0.4631066	t

<catboost.core.CatBoostClassifier at 0x7d7f5b286f10>

In [102]:
val_pred = clf.predict(test_data)
val_proba = clf.predict_proba(test_data)
val_acc = accuracy_score(y_test, val_pred)
val_roc = roc_auc_score(y_test, val_proba[:, 1])
val_logloss = log_loss(y_test, val_proba)
print("Val Accuracy:", val_acc)
print("Val ROC AUC:", val_roc)
print("Val Log Loss:", val_logloss)

Val Accuracy: 0.834920634920635
Val ROC AUC: 0.8752721115859066
Val Log Loss: 0.418876769473214


In [None]:
# Val Accuracy: 0.819047619047619
# Val ROC AUC: 0.877126501652826
# Val Log Loss: 0.418710731938007

In [85]:
# train_data = Pool(data_prep.fit_transform(X), y)
# test_data = Pool(data_prep.transform(df_test[["text", "participant_index", "perplexity"]]))
# train_data = Pool(X, y, text_features=["text"])
t_data = Pool(df_test[["text", "participant_index", "perplexity"]], text_features=["text"])
# clf = CatBoostClassifier(random_seed=42, task_type="GPU", loss_function="Logloss")
# clf.fit(train_data)
preds = clf.predict_proba(t_data)[:, 1]
preds_df = pd.DataFrame({"ID": df_test["ID"], "is_bot": preds})
preds_df.to_csv("sub_cb_3.csv", index=False)