In [1]:
import json
import pandas as pd

with open('train.json') as f:
    train_json = json.load(f)

rows = []

for chat_id, messages in train_json.items():
    for message in messages:
        rows.append({
            'chat_id': chat_id,
            'participant_index': message['participant_index'],
            'text': message['text']
        })

df = pd.DataFrame(rows)
grouped_df = df.groupby(['chat_id', 'participant_index'])['text'].apply(lambda x: ' '.join(x)).reset_index()
grouped_df.rename(columns={'text': 'combined_text'}, inplace=True)

grouped_df.head()

Unnamed: 0,chat_id,participant_index,combined_text
0,000c0bd4f6554034ae2a050b5d8f6dea,0,че как? жопа жопа/ жопич жопич
1,000c0bd4f6554034ae2a050b5d8f6dea,1,Жопка жопич
2,009952d565384a69b5ddf1ec76608391,0,"чио рио? да это отсылка, но почему именно на это"
3,009952d565384a69b5ddf1ec76608391,1,"""Чио Рио"" – это, вероятно, отсылка к популярно..."
4,00a66ccfbb7f42f39da2b641072d5ef6,0,I am OpenAI Assistant! опенэйаю ну то и имею


In [2]:
ytrain = pd.read_csv('ytrain.csv')

y_df = pd.DataFrame(ytrain)

y_df.head()

Unnamed: 0,dialog_id,participant_index,is_bot
0,dae9e2dae9f840549764f8d9bbbb80f0,0,0
1,159da0d7937c4c1e84a51f0df7e3ade6,0,0
2,1aed86f082234446951360d00979f0d9,0,0
3,ab3dbd121828403ba30d0ed4008fbea4,0,0
4,08ce7e4224cc411ba629f1983eba031f,0,1


In [3]:
grouped_df["chat_id"] = grouped_df["chat_id"].astype(str)
grouped_df["participant_index"] = grouped_df["participant_index"].astype(str)

y_df["dialog_id"] = y_df["dialog_id"].astype(str)
y_df["participant_index"] = y_df["participant_index"].astype(str)

y_df_renamed = y_df.rename(columns={"dialog_id": "chat_id"})

merged_df = grouped_df.merge(
    y_df_renamed,
    on=["chat_id", "participant_index"],
    how="inner"
)

merged_df.head()

Unnamed: 0,chat_id,participant_index,combined_text,is_bot
0,000c0bd4f6554034ae2a050b5d8f6dea,0,че как? жопа жопа/ жопич жопич,1
1,000c0bd4f6554034ae2a050b5d8f6dea,1,Жопка жопич,0
2,009952d565384a69b5ddf1ec76608391,0,"чио рио? да это отсылка, но почему именно на это",0
3,009952d565384a69b5ddf1ec76608391,1,"""Чио Рио"" – это, вероятно, отсылка к популярно...",1
4,00a66ccfbb7f42f39da2b641072d5ef6,0,I am OpenAI Assistant! опенэйаю ну то и имею,0


In [4]:
from datasets import Dataset
import pandas as pd

merged_df['combined_text'] = merged_df['combined_text'].astype(str)
merged_df['is_bot'] = merged_df['is_bot'].astype(int)

dataset = Dataset.from_pandas(merged_df[['combined_text', 'is_bot']])

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(example):
    return tokenizer(example['combined_text'], truncation=True, padding='max_length', max_length=128)

tokenized_dataset = dataset.map(preprocess, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("is_bot", "labels")
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map: 100%|██████████| 1572/1572 [00:00<00:00, 2161.45 examples/s]


In [6]:
from transformers import AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [7]:
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)

lora_config_dict = {
    'r': 8,
    'lora_alpha': 16,
    'target_modules': ["q_lin", "v_lin"],
    'lora_dropout': 0.1,
    'bias': "none",
    'task_type': TaskType.SEQ_CLS
}

lora_config = LoraConfig(**lora_config_dict)

model = get_peft_model(base_model, lora_config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.


In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./lora-bot-detector",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    fp16=False,
    report_to="none"
)

split = tokenized_dataset.train_test_split(test_size=0.2)

train_samples = len(split["train"])
val_samples = len(split["test"])

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [9]:
import mlflow
import mlflow.pytorch
import numpy as np

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("bot-detector-experiment")

run = mlflow.start_run()
run_id = run.info.run_id

mlflow.log_param("train_samples", train_samples)
mlflow.log_param("val_samples", val_samples)
mlflow.log_param("base_model", model_name)
mlflow.log_param("tokenizer", model_name)

for key, value in lora_config_dict.items():
    mlflow.log_param(f"lora_{key}", str(value) if key == "task_type" else value)

mlflow.end_run()

🏃 View run nebulous-sow-877 at: http://localhost:5000/#/experiments/1/runs/4df4a44fdbd64caeb9a43fc7779cc069
🧪 View experiment at: http://localhost:5000/#/experiments/1


In [10]:
trainer.train()



Step,Training Loss
20,0.6171
40,0.5665
60,0.6985
80,0.5996
100,0.6123
120,0.6346
140,0.6742
160,0.598
180,0.6145
200,0.6199


TrainOutput(global_step=315, training_loss=0.6165945507231213, metrics={'train_runtime': 372.4123, 'train_samples_per_second': 3.375, 'train_steps_per_second': 0.846, 'total_flos': 42341858601984.0, 'train_loss': 0.6165945507231213, 'epoch': 1.0})

In [11]:
mlflow.start_run(run_id=run_id)

train_metrics = trainer.evaluate(split["train"])
val_metrics = trainer.evaluate(split["test"])

for key, value in train_metrics.items():
    mlflow.log_metric(f"train_{key}", value)

for key, value in val_metrics.items():
    mlflow.log_metric(f"val_{key}", value)

output_model_path = "./lora-bot-detector"
model.save_pretrained(output_model_path)
tokenizer.save_pretrained(output_model_path)

mlflow.pytorch.log_model(
    pytorch_model=model,
    artifact_path="lora_model",
    registered_model_name="lora-bot-detector"
)

mlflow.end_run()



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Registered model 'lora-bot-detector' already exists. Creating a new version of this model...
2025/07/17 22:13:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lora-bot-detector, version 2


🏃 View run nebulous-sow-877 at: http://localhost:5000/#/experiments/1/runs/4df4a44fdbd64caeb9a43fc7779cc069
🧪 View experiment at: http://localhost:5000/#/experiments/1


Created version '2' of model 'lora-bot-detector'.


In [12]:
# Load model
# model_name = "lora-bot-detector"
# model_version = "latest"

# model_uri = f"models:/{model_name}/{model_version}"
# model = mlflow.pytorch.load_model(model_uri)