In [1]:
import pandas as pd
import numpy as np
import json
import torch
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier, Pool, cv
from matplotlib import pyplot as plt

from transformers import AutoModelForCausalLM, AutoTokenizer

tqdm.pandas()

%matplotlib inline

In [65]:
DEVICE = "cuda"
PERPLEXITY_MODELS = [
    "openai/whisper-tiny",
    "gpt2",
    "cointegrated/rubert-tiny2",
    "microsoft/phi-1.5",
]

## Погдготовка датасета

In [None]:
train_ = pd.DataFrame([
    {
        "dialog_id": dialog_id,
        "message": data["message"],
        "text": data["text"],
        "participant_index": int(data["participant_index"]),
    }
    for dialog_id, lines in json.load(open("/kaggle/input/you-are-bot/train.json", "r", encoding="utf-8")).items()
    for data in lines
])
labels_df = pd.read_csv("/kaggle/input/you-are-bot/ytrain.csv")
df = train_.merge(labels_df, on=("dialog_id", "participant_index"))
df.head()

Unnamed: 0,dialog_id,message,text,participant_index,is_bot
0,dae9e2dae9f840549764f8d9bbbb80f0,0,Hello!,0,0
1,dae9e2dae9f840549764f8d9bbbb80f0,1,Как дела?,1,0
2,dae9e2dae9f840549764f8d9bbbb80f0,2,Отлично! А твои?,0,0
3,dae9e2dae9f840549764f8d9bbbb80f0,3,Это круто!,1,0
4,dae9e2dae9f840549764f8d9bbbb80f0,4,Расскажи теорему,0,0


In [None]:
def calculate_perplexity(text, model, tokenizer, device):
    # Токенизация
    encodings = tokenizer(
        text
        if text
        else tokenizer.decode(model.config.eos_token_id)
        if model.config.eos_token_id is not None
        else " ",
        return_tensors="pt",
        truncation=True,
        max_length=448
    )
    encodings = {k: v.to(device) for k, v in encodings.items()}

    # Вычисление mean negative log-likelihood (nll) per token
    # (стандартный loss для LLM в библиотеке transformers)
    input_ids = encodings["input_ids"]
    # print(encodings)
    with torch.no_grad():
        outputs = model(**encodings, labels=input_ids)
        neg_log_likelihood = outputs.loss.item()

    # return neg_log_likelihood
    perplexity = np.exp(neg_log_likelihood) # Перплексия вычисляется как e ^ nll
    return perplexity

In [None]:
test_ = pd.DataFrame([
    {
        "dialog_id": dialog_id,
        "message": data["message"],
        "text": data["text"],
        "participant_index": int(data["participant_index"]),
    }
    for dialog_id, lines in json.load(open("/kaggle/input/you-are-bot/test.json", "r", encoding="utf-8")).items()
    for data in lines
])
df_info = pd.read_csv("/kaggle/input/you-are-bot/ytest.csv")
df_test = test_.merge(df_info, on=("dialog_id", "participant_index"))
df_test.head()

Unnamed: 0,dialog_id,message,text,participant_index,ID
0,af36ac2aa9734738bbd533db8e5fb43a,0,Привет,0,af36ac2aa9734738bbd533db8e5fb43a_0
1,af36ac2aa9734738bbd533db8e5fb43a,1,кто ты?,1,af36ac2aa9734738bbd533db8e5fb43a_1
2,af36ac2aa9734738bbd533db8e5fb43a,2,привет,0,af36ac2aa9734738bbd533db8e5fb43a_0
3,af36ac2aa9734738bbd533db8e5fb43a,3,Я - виртуальный помощник,1,af36ac2aa9734738bbd533db8e5fb43a_1
4,af36ac2aa9734738bbd533db8e5fb43a,4,ты бот?,0,af36ac2aa9734738bbd533db8e5fb43a_0


### Добавим перплексии как фичи

In [64]:
for model_name in PERPLEXITY_MODELS:
    print(f"Loading `{model_name}` model...")
    model_p = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        trust_remote_code=True,
        output_hidden_states=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )
    model_p = model_p.to(DEVICE)
    print("Calculating perplexities for train dataset...")
    df[f"{model_name.split('/')[-1]}-perplexity"] = df["text"].progress_apply(lambda x: calculate_perplexity(x, model_p, tokenizer, DEVICE))
    print("Calculating perplexities for test dataset...")
    df_test[f"{model_name.split('/')[-1]}-perplexity"] = df_test["text"].progress_apply(lambda x: calculate_perplexity(x, model_p, tokenizer, DEVICE))
    print("Finished!")

Loading `microsoft/phi-1.5` model...
Calculating perplexities for train dataset...


100%|██████████| 6334/6334 [02:48<00:00, 37.57it/s]


Calculating perplexities for test dataset...


100%|██████████| 2955/2955 [01:18<00:00, 37.81it/s]

Finished!





In [67]:
df.head()

Unnamed: 0,dialog_id,message,text,participant_index,is_bot,whisper-tiny-perplexity,gpt2-perplexity,rubert-tiny2-perplexity,phi-1.5-perplexity
0,dae9e2dae9f840549764f8d9bbbb80f0,0,Hello!,0,0,483218.1,179.017027,846903.1,27.317227
1,dae9e2dae9f840549764f8d9bbbb80f0,1,Как дела?,1,0,501610.8,42.499727,2480286.0,6.928153
2,dae9e2dae9f840549764f8d9bbbb80f0,2,Отлично! А твои?,0,0,1177822.0,29.308909,310961.0,25.466748
3,dae9e2dae9f840549764f8d9bbbb80f0,3,Это круто!,1,0,1429766.0,23.089811,1622778.0,12.437116
4,dae9e2dae9f840549764f8d9bbbb80f0,4,Расскажи теорему,0,0,15537120.0,13.540846,8956642.0,10.326161


## Обучение

### В качестве классификатора возьмем CatBoost

In [None]:
perplexity_cols = [m.split('/')[-1] + "-perplexity" for m in PERPLEXITY_MODELS]
feature_cols = ["message", "text", "participant_index"] + perplexity_cols
label_col = "is_bot"

train_df, val_df = train_test_split(
    df[feature_cols + [label_col]],
    test_size=0.2,
    random_state=42,
    stratify=df[label_col]
)

train_data = Pool(
    train_df[feature_cols],
    train_df[label_col],
    text_features=["text"]
)
val_data = Pool(
    val_df[feature_cols],
    val_df[label_col],
    text_features=["text"]
)

clf = CatBoostClassifier(random_seed=42, task_type="GPU", loss_function="Logloss")
clf.fit(train_data, eval_set=val_data, silent=None, verbose=1)



Learning rate set to 0.062033
0:	learn: 0.6695808	test: 0.6705196	best: 0.6705196 (0)	total: 50.9ms	remaining: 50.9s
1:	learn: 0.6498194	test: 0.6513705	best: 0.6513705 (1)	total: 102ms	remaining: 50.8s
2:	learn: 0.6347085	test: 0.6344358	best: 0.6344358 (2)	total: 151ms	remaining: 50.1s
3:	learn: 0.6178043	test: 0.6174567	best: 0.6174567 (3)	total: 201ms	remaining: 50s
4:	learn: 0.6038323	test: 0.6032705	best: 0.6032705 (4)	total: 250ms	remaining: 49.8s
5:	learn: 0.5912998	test: 0.5903220	best: 0.5903220 (5)	total: 300ms	remaining: 49.6s
6:	learn: 0.5808085	test: 0.5807219	best: 0.5807219 (6)	total: 331ms	remaining: 47s
7:	learn: 0.5723887	test: 0.5724112	best: 0.5724112 (7)	total: 363ms	remaining: 45s
8:	learn: 0.5641985	test: 0.5642079	best: 0.5642079 (8)	total: 395ms	remaining: 43.5s
9:	learn: 0.5563112	test: 0.5565663	best: 0.5565663 (9)	total: 427ms	remaining: 42.3s
10:	learn: 0.5507791	test: 0.5503420	best: 0.5503420 (10)	total: 459ms	remaining: 41.3s
11:	learn: 0.5458100	test: 

<catboost.core.CatBoostClassifier at 0x7f983a3c0490>

## Предсказание

In [None]:
test_data = Pool(df_test[feature_cols], text_features=["text"])
df_test["is_bot"] = clf.predict_proba(test_data)[:, 1]

In [None]:
(
    df_test
    .groupby("ID")
    .mean("is_bot")
    .reset_index()
    [["ID", "is_bot"]]
    .to_csv("submission.csv", index=False)
)

### SCORE: 0.447