# In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
from collections import Counter
from captum.attr import IntegratedGradients  # Для интерпретации

# Silence all library warnings for the whole process.
warnings.filterwarnings("ignore")

# Central configuration: feature extraction, model and training settings.
CONFIG = {
    # Keyword arguments forwarded to TfidfVectorizer.
    "tfidf": {
        "max_features": 10000,
        "ngram_range": (1, 2),
    },
    # Keyword arguments forwarded to TruncatedSVD.
    "svd": {
        "n_components": 300,
    },
    "model": {
        "hidden_dim": 256,  # increased
        "dropout": 0.5,
        "bert_train_layers": 4,  # number of trailing BERT layers left trainable
        # Separate learning rates for the classifier head and for BERT.
        "lr": {
            "model": 1e-4,
            "bert": 2e-5,
        },
    },
    "training": {
        "batch_size": 32,
        "epochs": 10,  # increased
        "warmup_steps": 100,
        "val_size": 0.2,
        "random_state": 42,
        "n_folds": 5,
        "patience": 3,  # epochs without improvement before early stopping
    },
    "logging": {
        "project": "ai_text_detection",
        "enabled": True,
    },
}

# ==================== МОДЕЛЬ ====================
class CrossModalAttention(nn.Module):
    """Fuse the TF-IDF and BERT projections with scaled dot-product attention.

    The BERT projection supplies the query. Both projections are stacked into
    a two-token sequence that supplies the keys and values, so the softmax
    distributes weight between the two modalities.

    A single query attending to a single key would make the softmax run over
    one element; the weight would then always be 1.0 and the query (and with
    it the BERT input) would have no influence on the output and receive no
    gradient. Attending over both tokens avoids that degenerate case.

    The learnable layers (``query``, ``key``, ``value``) keep their names and
    shapes, so existing state dicts still load.

    Args:
        hidden_dim: Width of both input projections and of the output.
    """

    def __init__(self, hidden_dim=256):
        super().__init__()
        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, hidden_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, tfidf_proj, bert_proj):
        """Return the attended representation.

        Args:
            tfidf_proj: Tensor of shape (batch, hidden_dim).
            bert_proj: Tensor of shape (batch, hidden_dim).

        Returns:
            Tensor of shape (batch, hidden_dim).
        """
        # Two-token "sequence" holding one token per modality: (batch, 2, hidden).
        tokens = torch.stack((tfidf_proj, bert_proj), dim=1)

        q = self.query(bert_proj).unsqueeze(1)  # (batch, 1, hidden)
        k = self.key(tokens)                    # (batch, 2, hidden)
        v = self.value(tokens)                  # (batch, 2, hidden)

        # Scaled dot-product scores of the query against both tokens: (batch, 1, 2).
        scores = torch.bmm(q, k.transpose(1, 2)) / (self.key.out_features ** 0.5)
        attention = self.softmax(scores)

        # Weighted sum of the two value vectors: (batch, 1, hidden) -> (batch, hidden).
        return torch.bmm(attention, v).squeeze(1)

class HybridModel(nn.Module):
    """Binary classifier over reduced TF-IDF features and a BERT embedding.

    Each input is projected to ``hidden_dim`` (Linear -> BatchNorm1d -> GELU
    -> Dropout), the two projections are combined by ``CrossModalAttention``
    and a small MLP ending in a sigmoid produces one probability per sample.

    Args:
        tfidf_dim: Width of the TF-IDF feature vector.
        bert_dim: Width of the BERT feature vector.
        hidden_dim: Shared projection width.
        dropout: Dropout probability used in the projections and classifier.
    """

    def __init__(self, tfidf_dim, bert_dim, hidden_dim=256, dropout=0.5):
        super().__init__()
        # Sub-modules are created in a fixed order so that seeded
        # initialisation stays reproducible.
        self.tfidf_proj = self._projection(tfidf_dim, hidden_dim, dropout)
        self.bert_proj = self._projection(bert_dim, hidden_dim, dropout)
        self.cross_attn = CrossModalAttention(hidden_dim)

        half_dim = hidden_dim // 2
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, half_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(half_dim, 1),
            nn.Sigmoid(),
        )

    @staticmethod
    def _projection(in_dim, out_dim, dropout):
        """Build one Linear -> BatchNorm1d -> GELU -> Dropout branch."""
        return nn.Sequential(
            nn.Linear(in_dim, out_dim),
            nn.BatchNorm1d(out_dim),
            nn.GELU(),
            nn.Dropout(dropout),
        )

    def forward(self, tfidf_features, bert_features):
        """Return sigmoid outputs of shape (batch, 1)."""
        fused = self.cross_attn(
            self.tfidf_proj(tfidf_features),
            self.bert_proj(bert_features),
        )
        return self.classifier(fused)

# ==================== ДАННЫЕ ====================
class TextDataset(Dataset):
    """Dataset yielding reduced TF-IDF features and tokenizer output per text.

    Args:
        texts: Indexable collection of raw strings.
        tfidf_vectorizer: Fitted object whose ``transform([text])`` returns a
            one-row feature matrix.
        svd: Fitted reducer whose ``transform(matrix)`` returns a 2-D array.
            It receives the vectorizer output unchanged, so it must accept
            whatever matrix type the vectorizer produces (sparse input is
            supported by scikit-learn's TruncatedSVD).
        tokenizer: Callable invoked with ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors='pt'``; must return a mapping
            with ``input_ids`` and ``attention_mask``.
        max_length: Sequence length passed to the tokenizer.
        labels: Optional indexable collection of targets, one per text.

    Raises:
        ValueError: If ``labels`` is given but its length differs from
            ``texts``.
    """

    def __init__(self, texts, tfidf_vectorizer, svd, tokenizer, max_length=128, labels=None):
        # Fail early instead of hitting an IndexError in the middle of an epoch.
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels differ in length: {len(texts)} != {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tfidf_vectorizer = tfidf_vectorizer
        self.svd = svd
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``tfidf_features``, ``input_ids``,
        ``attention_mask`` and, when labels were supplied, ``labels``."""
        text = self.texts[idx]

        # Hand the vectorizer output straight to the reducer: converting the
        # row to a dense array first would allocate a full vocabulary-sized
        # vector for every sample without changing the projection.
        tfidf_row = self.tfidf_vectorizer.transform([text])
        tfidf_features = self.svd.transform(tfidf_row)[0]

        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        item = {
            'tfidf_features': torch.FloatTensor(tfidf_features),
            # The tokenizer returns a leading batch axis of size 1; drop it so
            # the DataLoader can stack samples itself.
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0)
        }

        if self.labels is not None:
            item['labels'] = torch.FloatTensor([self.labels[idx]])

        return item

# ==================== УТИЛИТЫ ====================
def analyze_data(train, test):
    """Print and plot basic statistics of the training frame.

    Adds a ``length`` column (character count of ``text``) to ``train`` in
    place, shows a per-label box plot of that length and prints the
    normalised label distribution.

    Args:
        train: DataFrame with ``text`` and ``label`` columns; it is modified
            in place.
        test: Accepted for interface compatibility; not read here.

    Returns:
        The same ``train`` object, now carrying the ``length`` column.
    """
    print("\n=== АНАЛИЗ ДАННЫХ ===")

    # Text length per row, stored on the frame so seaborn can plot it.
    text_lengths = train['text'].apply(len)
    train['length'] = text_lengths

    plt.figure(figsize=(10, 4))
    sns.boxplot(data=train, x='label', y='length')
    plt.title("Распределение длины текстов по классам")
    plt.show()

    # Class balance as fractions of the whole training set.
    label_shares = train['label'].value_counts(normalize=True)
    print(f"\nРаспределение классов:\n{label_shares}")

    return train

def evaluate_model(model, data_loader, bert_model, device):
    """Score the hybrid model on a labelled loader and plot a confusion matrix.

    ``model`` is switched to eval mode; the train/eval mode of ``bert_model``
    is left as the caller set it. Gradients are disabled for the whole pass.

    Args:
        model: Hybrid classifier called as ``model(tfidf_features, embedding)``.
        data_loader: Iterable of dict batches containing ``input_ids``,
            ``attention_mask``, ``tfidf_features`` and ``labels`` tensors.
        bert_model: Encoder whose output exposes ``last_hidden_state``.
        device: Device every batch tensor is moved to.

    Returns:
        Dict with ``auc``, ``f1``, ``accuracy`` and the text ``report``.
        Hard predictions use a 0.5 threshold on the model output.
    """
    model.eval()
    probabilities, hard_predictions, targets = [], [], []

    with torch.no_grad():
        for raw_batch in data_loader:
            batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}

            encoded = bert_model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
            )
            # Hidden state of the first token is used as the text embedding.
            first_token_state = encoded.last_hidden_state[:, 0, :]
            outputs = model(batch['tfidf_features'], first_token_state)

            probabilities.extend(outputs.cpu().numpy().flatten())
            thresholded = (outputs > 0.5).float()
            hard_predictions.extend(thresholded.cpu().numpy().flatten())
            targets.extend(batch['labels'].cpu().numpy().flatten())

    metrics = {
        'auc': roc_auc_score(targets, probabilities),
        'f1': f1_score(targets, hard_predictions),
        'accuracy': accuracy_score(targets, hard_predictions),
        'report': classification_report(targets, hard_predictions),
    }

    # Confusion matrix heat map.
    matrix = confusion_matrix(targets, hard_predictions)
    plt.figure(figsize=(6, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.show()

    return metrics

def predict_test(model, test_loader, bert_model, device):
    """Return the model outputs for every sample of an unlabelled loader.

    ``model`` is switched to eval mode and gradients are disabled.

    Args:
        model: Hybrid classifier called as ``model(tfidf_features, embedding)``.
        test_loader: Iterable of dict batches containing ``input_ids``,
            ``attention_mask`` and ``tfidf_features`` tensors.
        bert_model: Encoder whose output exposes ``last_hidden_state``.
        device: Device every batch tensor is moved to.

    Returns:
        Flat list of output values in loader order.
    """
    model.eval()
    probabilities = []

    with torch.no_grad():
        for raw_batch in tqdm(test_loader, desc='Predicting on test'):
            batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}

            encoded = bert_model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
            )
            # Hidden state of the first token is used as the text embedding.
            first_token_state = encoded.last_hidden_state[:, 0, :]
            outputs = model(batch['tfidf_features'], first_token_state)

            probabilities.extend(outputs.cpu().numpy().flatten())

    return probabilities

# ==================== ОБУЧЕНИЕ ====================
def train_epoch(model, train_loader, optimizer, scheduler, bert_model, device, epoch):
    """Run one optimisation pass over ``train_loader``.

    ``model`` is put into train mode; the mode of ``bert_model`` is not
    changed here. The scheduler is stepped after every batch.

    Args:
        model: Hybrid classifier producing probabilities.
        train_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask``, ``tfidf_features`` and ``labels`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        bert_model: Encoder whose output exposes ``last_hidden_state``.
        device: Device every batch tensor is moved to.
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        Mean binary cross-entropy loss over the batches of this epoch.
    """
    model.train()
    running_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}')

    for raw_batch in progress_bar:
        batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        optimizer.zero_grad()

        encoded = bert_model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )
        # Hidden state of the first token is used as the text embedding.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        outputs = model(batch['tfidf_features'], first_token_state)

        loss = F.binary_cross_entropy(outputs, batch['labels'])
        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        progress_bar.set_postfix({'loss': batch_loss})

    return running_loss / len(train_loader)

def _freeze_bert_except_last_layers(bert_model, n_trainable_layers):
    """Freeze all BERT parameters, then unfreeze the last N encoder layers."""
    for param in bert_model.parameters():
        param.requires_grad = False

    # Guard the zero case: the slice [-0:] would select every layer.
    if n_trainable_layers > 0:
        for layer in bert_model.encoder.layer[-n_trainable_layers:]:
            for param in layer.parameters():
                param.requires_grad = True


def _build_loader(texts, tfidf_vectorizer, svd, tokenizer, labels=None,
                  shuffle=False, drop_last=False):
    """Wrap texts (and optional labels) in a TextDataset and a DataLoader."""
    dataset = TextDataset(texts, tfidf_vectorizer, svd, tokenizer, labels=labels)
    return DataLoader(
        dataset,
        batch_size=CONFIG['training']['batch_size'],
        shuffle=shuffle,
        drop_last=drop_last
    )


def train_and_validate():
    """Run stratified K-fold training and write averaged test predictions.

    For every fold: fit TF-IDF and SVD on the training part only, fine-tune
    the last BERT layers together with the hybrid head, early-stop on
    validation AUC, restore the best checkpoint and predict the test set.
    The per-fold test predictions are averaged and saved to
    ``submission.csv``.

    Side effects:
        Reads the two CSV files and the BERT weights from ``/kaggle/input``.
        Writes ``best_model_fold{k}.pt`` (head weights) and
        ``best_bert_fold{k}.pt`` (encoder weights) per fold, plus
        ``submission.csv``. Shows plots and prints metrics.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load the data.
    train = pd.read_csv("/kaggle/input/train-dataset/train_v2_drcat_02.csv")
    test = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")

    # Exploratory analysis.
    train = analyze_data(train, test)

    # Test texts are shared by all folds.
    test_texts = test['text'].tolist()

    training_cfg = CONFIG['training']
    bert_path = "/kaggle/input/bert-base-uncased"

    # The tokenizer does not depend on the fold, so load it once instead of
    # three times per fold.
    tokenizer = AutoTokenizer.from_pretrained(bert_path)

    # Cross-validation splitter.
    skf = StratifiedKFold(n_splits=training_cfg['n_folds'], shuffle=True,
                          random_state=training_cfg['random_state'])

    fold_metrics = []
    test_preds = np.zeros(len(test))

    for fold, (train_idx, val_idx) in enumerate(skf.split(train['text'], train['label'])):
        print(f"\n=== Fold {fold + 1}/{training_cfg['n_folds']} ===")

        train_df, val_df = train.iloc[train_idx], train.iloc[val_idx]

        # Fit TF-IDF and SVD on this fold's training part only, so nothing
        # from the validation part leaks into the features.
        tfidf_vectorizer = TfidfVectorizer(**CONFIG['tfidf'])
        tfidf_matrix = tfidf_vectorizer.fit_transform(train_df['text'])
        svd = TruncatedSVD(**CONFIG['svd']).fit(tfidf_matrix)

        # BatchNorm1d in training mode rejects a batch holding a single
        # sample, so drop the trailing batch only when it would be exactly
        # one sample long.
        trailing_single = len(train_df) % training_cfg['batch_size'] == 1

        train_loader = _build_loader(
            train_df['text'].tolist(), tfidf_vectorizer, svd, tokenizer,
            labels=train_df['label'].tolist(), shuffle=True,
            drop_last=trailing_single
        )
        val_loader = _build_loader(
            val_df['text'].tolist(), tfidf_vectorizer, svd, tokenizer,
            labels=val_df['label'].tolist()
        )
        # The test loader uses this fold's TF-IDF/SVD.
        test_loader = _build_loader(test_texts, tfidf_vectorizer, svd, tokenizer)

        # Fresh encoder and head for every fold.
        bert_model = AutoModel.from_pretrained(bert_path).to(device)
        _freeze_bert_except_last_layers(bert_model, CONFIG['model']['bert_train_layers'])

        model = HybridModel(
            tfidf_dim=CONFIG['svd']['n_components'],
            bert_dim=768,
            hidden_dim=CONFIG['model']['hidden_dim'],
            dropout=CONFIG['model']['dropout']
        ).to(device)

        optimizer = torch.optim.AdamW([
            {'params': model.parameters(), 'lr': CONFIG['model']['lr']['model']},
            {'params': bert_model.parameters(), 'lr': CONFIG['model']['lr']['bert']}
        ])

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=training_cfg['warmup_steps'],
            num_training_steps=len(train_loader) * training_cfg['epochs']
        )

        head_checkpoint = f'best_model_fold{fold}.pt'
        bert_checkpoint = f'best_bert_fold{fold}.pt'

        # Start below any reachable AUC so the first epoch always writes a
        # checkpoint that can be restored afterwards.
        best_auc = float('-inf')
        patience_counter = 0

        for epoch in range(training_cfg['epochs']):
            train_loss = train_epoch(model, train_loader, optimizer, scheduler, bert_model, device, epoch)

            val_metrics = evaluate_model(model, val_loader, bert_model, device)
            print(f"Epoch {epoch + 1} | Train Loss: {train_loss:.4f} | Val AUC: {val_metrics['auc']:.4f}")
            print(val_metrics['report'])

            if val_metrics['auc'] > best_auc:
                best_auc = val_metrics['auc']
                patience_counter = 0
                # The encoder is fine-tuned too, so it must be checkpointed
                # with the head; otherwise the restored head would run on top
                # of encoder weights from a later, worse epoch.
                torch.save(model.state_dict(), head_checkpoint)
                torch.save(bert_model.state_dict(), bert_checkpoint)
            else:
                patience_counter += 1
                if patience_counter >= training_cfg['patience']:
                    print(f"Early stopping at epoch {epoch + 1}")
                    break

        # Restore the best head and the matching encoder for this fold.
        model.load_state_dict(torch.load(head_checkpoint, map_location=device))
        bert_model.load_state_dict(torch.load(bert_checkpoint, map_location=device))

        # Average the test predictions across folds.
        fold_test_preds = predict_test(model, test_loader, bert_model, device)
        test_preds += np.array(fold_test_preds) / training_cfg['n_folds']

        fold_metrics.append(best_auc)

        # Drop the references so this fold's models can be freed before the
        # next fold allocates its own.
        del model, bert_model, optimizer, scheduler

    # Final summary.
    print("\n=== ИТОГОВЫЕ МЕТРИКИ ===")
    print(f"Средний AUC по фолдам: {np.mean(fold_metrics):.4f} (±{np.std(fold_metrics):.4f})")
    print(f"Лучший AUC: {np.max(fold_metrics):.4f}")

    # Save the predictions.
    submission = pd.DataFrame({
        'id': test['id'],
        'generated': test_preds
    })
    submission.to_csv('submission.csv', index=False)
    print("\nПредсказания сохранены в submission.csv")

# Run the full cross-validation pipeline only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    train_and_validate()