In [None]:
# Установка необходимых библиотек
# !pip install transformers datasets evaluate scikit-learn matplotlib seaborn
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install transformers datasets evaluate
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


#pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118



Looking in indexes: https://download.pytorch.org/whl/cu118


In [6]:
import inspect

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from datasets import Dataset
from sklearn.metrics import roc_auc_score, roc_curve, r2_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

# Apply the seaborn "whitegrid" theme to the figures drawn in this notebook.
sns.set(style="whitegrid")

# Use the CUDA device when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Используемое устройство:", device)


Используемое устройство: cpu


In [None]:
# Load the data and prepare the train / test frames.
text_column = "comment"
target_columns = [
    "Вопрос решен",
    "Нравится качество выполнения заявки",
    "Нравится качество работы сотрудников",
    "Нравится скорость отработки заявок",
    "Понравилось выполнение заявки",
    "Ничего из перечисленного"
]

df = pd.read_csv("processed_data_new.csv")
# Missing comments are replaced by empty strings; label columns are left as read.
df[text_column] = df[text_column].fillna("")

# 80 / 20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[text_column],
    df[target_columns],
    test_size=0.2,
    random_state=42,
)


def _join_text_and_labels(texts, labels):
    """Build one frame holding the text column followed by every target column.

    Args:
        texts: Series of comments; its index becomes the index of the result.
        labels: DataFrame containing every column listed in ``target_columns``.

    Returns:
        DataFrame with ``text_column`` first and the target columns after it,
        in ``target_columns`` order.
    """
    frame = pd.DataFrame({text_column: texts})
    for col in target_columns:
        # ``.values`` assigns positionally, ignoring the index of ``labels``.
        frame[col] = labels[col].values
    return frame


train_df = _join_text_and_labels(X_train, y_train)
test_df = _join_text_and_labels(X_test, y_test)


NameError: name 'train_test_split' is not defined

In [None]:
# Checkpoint name used to load the tokenizer below.
# NOTE(review): "distilbert-base-uncased" is an English checkpoint, while the
# target labels are Russian, so the comments are presumably Russian as well.
# Confirm whether a multilingual checkpoint would be a better fit.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(example):
    """Tokenize the text column of ``example`` with truncation enabled.

    No padding argument is passed here; batches are padded later by
    ``data_collator``.

    Args:
        example: mapping that contains ``text_column`` (a batch of rows when
            called through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the selected text.
    """
    texts = example[text_column]
    return tokenizer(texts, truncation=True)


# Convert each pandas frame to a Dataset and tokenize it in batches.
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
# Sequence-classification model with one output per target column, configured
# for the multi-label problem type.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=len(target_columns),
)

def compute_metrics(p):
    """Compute macro ROC-AUC and R² for a set of multi-label predictions.

    Args:
        p: prediction object exposing ``predictions`` (passed through a sigmoid
            here, so presumably raw logits) and ``label_ids`` (indexed as a 2-D
            array with one column per entry of ``target_columns``).

    Returns:
        dict with ``"macro_roc_auc"`` (mean of the per-column ROC-AUC values,
        ignoring columns where it is undefined) and ``"r2_score"``.
    """
    preds = torch.sigmoid(torch.tensor(p.predictions)).numpy()
    y_true = p.label_ids

    aucs = []
    for i in range(len(target_columns)):
        column_true = y_true[:, i]
        if len(np.unique(column_true)) > 1:
            aucs.append(roc_auc_score(column_true, preds[:, i]))
        else:
            # Only one class present: record NaN so nanmean skips this column.
            aucs.append(np.nan)

    r2 = r2_score(y_true, preds)
    metrics = {}
    metrics["macro_roc_auc"] = np.nanmean(aucs)
    metrics["r2_score"] = r2
    return metrics

def _with_label_vector(dataset):
    """Pack the per-category target columns into a single float ``labels`` column.

    A model loaded with ``problem_type="multi_label_classification"`` expects
    one ``labels`` entry per example: a float vector with one value per
    category. Columns named ``label_0`` ... ``label_N`` are not recognised by
    ``Trainer``, which drops them as unused, so the model receives no labels
    and training fails because no loss is returned.

    Args:
        dataset: a ``datasets.Dataset`` that contains every column listed in
            ``target_columns`` (or that was already converted by this helper).

    Returns:
        A dataset in which the target columns are replaced by one ``labels``
        column holding a list of floats per row, ordered as ``target_columns``.
    """
    if "labels" in dataset.column_names:
        # Already converted (e.g. the cell was re-run): nothing left to pack.
        return dataset

    def pack(batch):
        # Transpose the column-wise batch into one row of targets per example.
        rows = zip(*(batch[col] for col in target_columns))
        # Floats are required because the multi-label loss works on float targets.
        return {"labels": [[float(value) for value in row] for row in rows]}

    return dataset.map(pack, batched=True, remove_columns=target_columns)


train_dataset = _with_label_vector(train_dataset).with_format("torch")
test_dataset = _with_label_vector(test_dataset).with_format("torch")

# ``evaluation_strategy`` was renamed to ``eval_strategy`` in newer transformers
# releases and the old keyword is no longer accepted there. Pick whichever
# name the installed version exposes so the cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Training configuration: evaluate once per epoch, 2 epochs, batch size 8.
args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    **{_eval_strategy_key: "epoch"},
)

# Newer transformers releases take the tokenizer through ``processing_class``
# and deprecate the ``tokenizer`` keyword. Use the name the installed version
# supports so the cell works on both.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

# Wire the model, datasets, collator and metric function into one Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)


In [None]:
trainer.train()

In [None]:
# Predict on the test split and turn the raw outputs into probabilities.
raw_pred, _, _ = trainer.predict(test_dataset)
probs = torch.sigmoid(torch.tensor(raw_pred)).numpy()
y_true = y_test[target_columns].values
# Binary decisions at a 0.5 probability threshold.
y_pred_bin = (probs >= 0.5).astype(int)

# ROC curves
fig, ax = plt.subplots(figsize=(12, 8))
roc_scores = {}
for i, label in enumerate(target_columns):
    # Skip categories where the test split contains a single class only.
    if len(np.unique(y_true[:, i])) <= 1:
        continue
    fpr, tpr, _ = roc_curve(y_true[:, i], probs[:, i])
    score = roc_auc_score(y_true[:, i], probs[:, i])
    roc_scores[label] = score
    ax.plot(fpr, tpr, label=f"{label} (AUC = {score:.2f})")
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC-кривые по категориям")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# R2
print("\nR² (коэффициент детерминации):", r2_score(y_true, probs))
