In [5]:
import pandas as pd
import numpy as np
import json
import torch
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier, Pool, cv
from matplotlib import pyplot as plt

from transformers import AutoModelForCausalLM, AutoTokenizer

tqdm.pandas()

%matplotlib inline

In [6]:
DEVICE = "cuda"
PERPLEXITY_MODELS = [
    "openai/whisper-tiny",
    "gpt2",
    "cointegrated/rubert-tiny2",
    "microsoft/phi-1.5",
]

In [7]:
def get_encodings(text, tokenizer, model):
    return tokenizer(
        text
        if text
        else tokenizer.decode(model.config.eos_token_id)
        if model.config.eos_token_id is not None
        else " ",
        return_tensors="pt",
        truncation=True,
        max_length=448
    )

In [12]:
def calculate_perplexity(text, context, model, tokenizer, device):
    # Токенизация
    encodings_text = get_encodings(" ".join([context, text]), tokenizer, model)
    encodings_context = get_encodings(context, tokenizer, model)
    tok_limit = 448 - encodings_text["attention_mask"].shape[-1]
    encodings = {
        "input_ids": torch.concat(
            (encodings_context["input_ids"][:, -tok_limit:], encodings_text["input_ids"]), axis=1
        ).to(device) if tok_limit else encodings_text["input_ids"].to(device),
        "attention_mask": torch.concat(
            (
                torch.zeros_like(encodings_context["attention_mask"][:, -tok_limit:]),
                encodings_text["attention_mask"]
            ), axis=1
        ).to(device) if tok_limit else encodings_text["attention_mask"].to(device),
    }

    # Вычисление mean negative log-likelihood (nll) per token
    # (стандартный loss для LLM в библиотеке transformers)
    input_ids = encodings["input_ids"]
    # print(encodings)
    with torch.no_grad():
        outputs = model(**encodings, labels=input_ids)
        neg_log_likelihood = outputs.loss.item()

    # return neg_log_likelihood
    perplexity = np.exp(neg_log_likelihood) # Перплексия вычисляется как e ^ nll
    return perplexity

In [9]:
train_ = pd.DataFrame([
    {
        "dialog_id": dialog_id,
        "message": data["message"],
        "context": " ".join([
            data["text"]
            for data in lines[:data["message"]]
        ]),
        "text": data["text"],
        "participant_index": int(data["participant_index"]),
    }
    for dialog_id, lines in json.load(open("/kaggle/input/you-are-bot/train.json", "r", encoding="utf-8")).items()
    for data in lines
])
labels_df = pd.read_csv("/kaggle/input/you-are-bot/ytrain.csv")
df = train_.merge(labels_df, on=("dialog_id", "participant_index"))
df.head()

Unnamed: 0,dialog_id,message,context,text,participant_index,is_bot
0,dae9e2dae9f840549764f8d9bbbb80f0,0,,Hello!,0,0
1,dae9e2dae9f840549764f8d9bbbb80f0,1,Hello!,Как дела?,1,0
2,dae9e2dae9f840549764f8d9bbbb80f0,2,Hello! Как дела?,Отлично! А твои?,0,0
3,dae9e2dae9f840549764f8d9bbbb80f0,3,Hello! Как дела? Отлично! А твои?,Это круто!,1,0
4,dae9e2dae9f840549764f8d9bbbb80f0,4,Hello! Как дела? Отлично! А твои? Это круто!,Расскажи теорему,0,0


In [10]:
test_ = pd.DataFrame([
    {
        "dialog_id": dialog_id,
        "message": data["message"],
        "context": " ".join([
            data["text"]
            for data in lines[:data["message"]]
        ]),
        "text": data["text"],
        "participant_index": int(data["participant_index"]),
    }
    for dialog_id, lines in json.load(open("/kaggle/input/you-are-bot/test.json", "r", encoding="utf-8")).items()
    for data in lines
])
df_info = pd.read_csv("/kaggle/input/you-are-bot/ytest.csv")
df_test = test_.merge(df_info, on=("dialog_id", "participant_index"))
df_test.head()

Unnamed: 0,dialog_id,message,context,text,participant_index,ID
0,af36ac2aa9734738bbd533db8e5fb43a,0,,Привет,0,af36ac2aa9734738bbd533db8e5fb43a_0
1,af36ac2aa9734738bbd533db8e5fb43a,1,Привет,кто ты?,1,af36ac2aa9734738bbd533db8e5fb43a_1
2,af36ac2aa9734738bbd533db8e5fb43a,2,Привет кто ты?,привет,0,af36ac2aa9734738bbd533db8e5fb43a_0
3,af36ac2aa9734738bbd533db8e5fb43a,3,Привет кто ты? привет,Я - виртуальный помощник,1,af36ac2aa9734738bbd533db8e5fb43a_1
4,af36ac2aa9734738bbd533db8e5fb43a,4,Привет кто ты? привет Я - виртуальный помощник,ты бот?,0,af36ac2aa9734738bbd533db8e5fb43a_0


In [13]:
for model_name in PERPLEXITY_MODELS:
    print(f"Loading `{model_name}` model...")
    model_p = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        trust_remote_code=True,
        output_hidden_states=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )
    model_p = model_p.to(DEVICE)
    print("Calculating perplexities for train dataset...")
    df[f"{model_name.split('/')[-1]}-perplexity"] = (
        df[["context","text"]]
        .progress_apply(
            lambda x: calculate_perplexity(x["text"], x["context"], model_p, tokenizer, DEVICE),
            axis=1
        )
    )
    print("Calculating perplexities for test dataset...")
    df_test[f"{model_name.split('/')[-1]}-perplexity"] = (
        df_test[["context","text"]]
        .progress_apply(
            lambda x: calculate_perplexity(x["text"], x["context"], model_p, tokenizer, DEVICE),
            axis=1
        )
    )
    print("Finished!")

Loading `openai/whisper-tiny` model...




Calculating perplexities for train dataset...


100%|██████████| 6334/6334 [00:35<00:00, 179.58it/s]


Calculating perplexities for test dataset...


100%|██████████| 2955/2955 [00:16<00:00, 176.23it/s]


Finished!
Loading `gpt2` model...


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Calculating perplexities for train dataset...


  0%|          | 0/6334 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
100%|██████████| 6334/6334 [02:09<00:00, 49.06it/s]


Calculating perplexities for test dataset...


100%|██████████| 2955/2955 [01:02<00:00, 47.42it/s]


Finished!
Loading `cointegrated/rubert-tiny2` model...


config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Calculating perplexities for train dataset...


100%|██████████| 6334/6334 [00:33<00:00, 187.96it/s]


Calculating perplexities for test dataset...


100%|██████████| 2955/2955 [00:16<00:00, 182.94it/s]


Finished!
Loading `microsoft/phi-1.5` model...


config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Calculating perplexities for train dataset...


100%|██████████| 6334/6334 [04:39<00:00, 22.69it/s]


Calculating perplexities for test dataset...


100%|██████████| 2955/2955 [02:13<00:00, 22.11it/s]

Finished!





In [14]:
df.head()

Unnamed: 0,dialog_id,message,context,text,participant_index,is_bot,whisper-tiny-perplexity,gpt2-perplexity,rubert-tiny2-perplexity,phi-1.5-perplexity
0,dae9e2dae9f840549764f8d9bbbb80f0,0,,Hello!,0,0,5410.852,51346.526899,9933823.0,5205.352307
1,dae9e2dae9f840549764f8d9bbbb80f0,1,Hello!,Как дела?,1,0,115376.5,2448.170511,1801371.0,19.199167
2,dae9e2dae9f840549764f8d9bbbb80f0,2,Hello! Как дела?,Отлично! А твои?,0,0,314261.0,356.356368,930047.1,9.206183
3,dae9e2dae9f840549764f8d9bbbb80f0,3,Hello! Как дела? Отлично! А твои?,Это круто!,1,0,737716.1,206.758719,745986.5,6.775529
4,dae9e2dae9f840549764f8d9bbbb80f0,4,Hello! Как дела? Отлично! А твои? Это круто!,Расскажи теорему,0,0,1100427.0,180.093679,1543898.0,7.237918


In [15]:
df.merge(df_test, on="dialog_id")

Unnamed: 0,dialog_id,message_x,context_x,text_x,participant_index_x,is_bot,whisper-tiny-perplexity_x,gpt2-perplexity_x,rubert-tiny2-perplexity_x,phi-1.5-perplexity_x,message_y,context_y,text_y,participant_index_y,ID,whisper-tiny-perplexity_y,gpt2-perplexity_y,rubert-tiny2-perplexity_y,phi-1.5-perplexity_y


In [34]:
perplexity_cols = [m.split('/')[-1] + "-perplexity" for m in PERPLEXITY_MODELS]
feature_cols = ["message", "context", "text", "participant_index"] + perplexity_cols
label_col = "is_bot"

train_df, val_df = train_test_split(
    df[feature_cols + [label_col]],
    test_size=0.2,
    random_state=42,
    stratify=df[label_col]
)

train_data = Pool(
    train_df[feature_cols],
    train_df[label_col],
    text_features=["context", "text"]
)
val_data = Pool(
    val_df[feature_cols],
    val_df[label_col],
    text_features=["context", "text"]
)

clf = CatBoostClassifier(random_seed=42, task_type="GPU", loss_function="Logloss")
clf.fit(train_data, eval_set=val_data, silent=None, verbose=1)

Learning rate set to 0.062033
0:	learn: 0.6648989	test: 0.6649683	best: 0.6649683 (0)	total: 52.7ms	remaining: 52.6s
1:	learn: 0.6403470	test: 0.6408375	best: 0.6408375 (1)	total: 109ms	remaining: 54.2s
2:	learn: 0.6213700	test: 0.6210654	best: 0.6210654 (2)	total: 160ms	remaining: 53.3s
3:	learn: 0.6038663	test: 0.6036774	best: 0.6036774 (3)	total: 217ms	remaining: 54s
4:	learn: 0.5886003	test: 0.5878615	best: 0.5878615 (4)	total: 266ms	remaining: 52.9s
5:	learn: 0.5752642	test: 0.5747059	best: 0.5747059 (5)	total: 302ms	remaining: 50.1s
6:	learn: 0.5618929	test: 0.5613200	best: 0.5613200 (6)	total: 338ms	remaining: 48s
7:	learn: 0.5516007	test: 0.5506693	best: 0.5506693 (7)	total: 375ms	remaining: 46.5s
8:	learn: 0.5424275	test: 0.5415107	best: 0.5415107 (8)	total: 407ms	remaining: 44.8s
9:	learn: 0.5336611	test: 0.5330285	best: 0.5330285 (9)	total: 439ms	remaining: 43.5s
10:	learn: 0.5272743	test: 0.5266187	best: 0.5266187 (10)	total: 469ms	remaining: 42.1s
11:	learn: 0.5206095	test

<catboost.core.CatBoostClassifier at 0x7b41fc9a7590>

In [None]:
# 0.4693707088
# 0.4772935249
# 0.4686329397
# 0.4741760013
# 0.460232055
# 0.4651705303
# 0.381961958
# 0.4416268822

In [30]:
test_data = Pool(df_test[feature_cols], text_features=["context", "text"])
df_test["is_bot"] = clf.predict_proba(test_data)[:, 1]

In [31]:
df_test

Unnamed: 0,dialog_id,message,context,text,participant_index,ID,whisper-tiny-perplexity,gpt2-perplexity,rubert-tiny2-perplexity,phi-1.5-perplexity,is_bot
0,af36ac2aa9734738bbd533db8e5fb43a,0,,Привет,0,af36ac2aa9734738bbd533db8e5fb43a_0,6.175675e+03,1.030771e+04,3.051178e+07,23.100184,0.049864
1,af36ac2aa9734738bbd533db8e5fb43a,1,Привет,кто ты?,1,af36ac2aa9734738bbd533db8e5fb43a_1,5.797573e+05,5.655680e+02,6.851906e+05,8.699018,0.267910
2,af36ac2aa9734738bbd533db8e5fb43a,2,Привет кто ты?,привет,0,af36ac2aa9734738bbd533db8e5fb43a_0,1.000991e+06,1.721001e+02,4.207988e+05,5.067787,0.210222
3,af36ac2aa9734738bbd533db8e5fb43a,3,Привет кто ты? привет,Я - виртуальный помощник,1,af36ac2aa9734738bbd533db8e5fb43a_1,1.736655e+06,1.552665e+02,3.394531e+05,7.130683,0.345737
4,af36ac2aa9734738bbd533db8e5fb43a,4,Привет кто ты? привет Я - виртуальный помощник,ты бот?,0,af36ac2aa9734738bbd533db8e5fb43a_0,2.094762e+06,9.038600e+01,6.426753e+05,5.496197,0.093644
...,...,...,...,...,...,...,...,...,...,...,...
2950,56201a8ac9c64665aa6d236dbc79daf4,5,"Представь, что ты зашел в кафе и случайно услы...",Ну и отлично!,1,56201a8ac9c64665aa6d236dbc79daf4_1,2.086036e+05,4.030887e+00,1.146273e+06,3.236871,0.670011
2951,a1abc4a69bb84c11804ce189966d967d,0,,щ,0,a1abc4a69bb84c11804ce189966d967d_0,6.753298e+03,1.480271e+07,6.559050e+07,3090.655658,0.048383
2952,a1abc4a69bb84c11804ce189966d967d,1,щ,привет,1,a1abc4a69bb84c11804ce189966d967d_1,5.503270e+04,7.039658e+03,6.519813e+06,15.547140,0.263671
2953,a1abc4a69bb84c11804ce189966d967d,2,щ привет,🇲🇬,0,a1abc4a69bb84c11804ce189966d967d_0,1.924842e+05,6.633192e+02,6.626939e+06,11.434559,0.119524


In [33]:
(
    df_test
    .groupby("ID")
    .mean("is_bot")
    .reset_index()
    [["ID", "is_bot"]]
    .to_csv("final_submission.csv", index=False)
)