In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# ==================== CONFIGURATION ====================
# All pipeline settings in one place, grouped by the stage that reads them.
CONFIG = {
    # Input locations and data-handling settings.
    'data': {
        'train_path': "/kaggle/input/train-dataset/train_v2_drcat_02.csv",
        'test_path': "/kaggle/input/llm-detect-ai-generated-text/test_essays.csv",
        'bert_path': "/kaggle/input/bert-base-uncased",
        'max_length': 128,      # forwarded to TextDataset as max_length
        'val_size': 0.2,        # not read by the code in this section
        'random_state': 42,     # seed for the StratifiedKFold shuffle
        'n_folds': 5,           # number of cross-validation splits
    },
    # Arguments for TfidfVectorizer.
    'tfidf': {
        'max_features': 10000,
        'ngram_range': (1, 2),
    },
    # Arguments for TruncatedSVD; n_components is also the head's tfidf_dim.
    'svd': {
        'n_components': 300,
    },
    # HybridModel hyper-parameters and per-parameter-group learning rates.
    'model': {
        'hidden_dim': 128,
        'dropout': 0.3,
        'lr': {'model': 1e-4, 'bert': 2e-5},
    },
    # Training-loop settings.
    'training': {
        'batch_size': 32,
        'epochs': 5,
        'warmup_steps': 100,    # num_warmup_steps of the linear scheduler
        'patience': 2,          # epochs without AUC improvement before stopping
    },
}

# ==================== 1. ПОДГОТОВКА И АНАЛИЗ ДАННЫХ ====================
def clean_text(text):
    """Strip generator markers and normalise whitespace in one text.

    Args:
        text: Raw text. Missing values (``None`` or a float NaN, which is how
            pandas represents an empty CSV cell) are treated as an empty
            text; any other non-string value is converted with ``str``.

    Returns:
        The text with bracketed markers that start with ``ai``, ``generated``
        or ``model`` (case-insensitive) removed, every run of whitespace
        collapsed to a single space, and surrounding whitespace stripped.
    """
    if not isinstance(text, str):
        # Series.apply passes missing cells as None/NaN; re.sub would raise
        # TypeError on them and abort the whole preprocessing step.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Drop markers such as "[AI ...]", "[generated ...]", "[model ...]".
    text = re.sub(r'\[(ai|generated|model).*?\]', '', text, flags=re.IGNORECASE)
    # Collapse repeated whitespace (including newlines and tabs).
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def detect_artifacts(texts, labels, n=3):
    """Return the most frequent word n-grams seen only in AI-labelled texts.

    Every text is lower-cased and split on whitespace. An n-gram is kept when
    it occurs in at least one text labelled 1 and in no text carrying any
    other label.

    Args:
        texts: Iterable of strings.
        labels: Iterable of labels aligned with ``texts``; 1 marks AI text.
        n: Number of words per n-gram.

    Returns:
        At most 20 ``(ngram, count)`` tuples ordered by descending count;
        ties keep the order in which the n-grams were first encountered.
    """
    ai_counts = defaultdict(int)
    human_seen = set()

    for text, label in zip(texts, labels):
        tokens = text.lower().split()
        window_starts = range(len(tokens) - n + 1)
        grams = (' '.join(tokens[start:start + n]) for start in window_starts)

        if label == 1:
            for gram in grams:
                ai_counts[gram] += 1
        else:
            # Only membership matters for non-AI texts, not the counts.
            human_seen.update(grams)

    ai_only = [
        (gram, count)
        for gram, count in ai_counts.items()
        if gram not in human_seen
    ]
    # list.sort is stable, also with reverse=True, so ties stay in first-seen order.
    ai_only.sort(key=lambda pair: pair[1], reverse=True)
    return ai_only[:20]

def analyze_dataset(train_df, test_df):
    """Print and plot a quick exploratory summary of the train/test data.

    Reports the normalised label distribution of the training set, the
    n-grams returned by ``detect_artifacts`` for the training texts, the
    number of texts shared by train and test (exact match after
    lower-casing), and shows a box plot of text length per label.

    Args:
        train_df: DataFrame with ``text`` and ``label`` columns.
        test_df: DataFrame with a ``text`` column.

    Returns:
        None. Neither input frame is modified.
    """
    print("\n=== АНАЛИЗ ДАННЫХ ===")

    # 1. Label distribution
    print("\nРаспределение меток в train:")
    print(train_df['label'].value_counts(normalize=True))

    # 2. N-grams that occur only in AI-labelled texts
    artifacts = detect_artifacts(train_df['text'], train_df['label'])
    print("\nТоп-20 уникальных AI-паттернов:")
    for ngram, count in artifacts:
        print(f"{ngram}: {count}")

    # 3. Train/test overlap (case-insensitive exact matches)
    train_texts = set(train_df['text'].str.lower())
    test_texts = set(test_df['text'].str.lower())
    print(f"\nПересекающихся текстов: {len(train_texts & test_texts)}")

    # 4. Text-length distribution per label.
    # The helper column lives on a copy: assigning it on train_df itself
    # would leak a 'length' column into the frame the caller trains on.
    length_df = train_df.assign(length=train_df['text'].apply(len))
    plt.figure(figsize=(10, 4))
    sns.boxplot(x='label', y='length', data=length_df)
    plt.title("Распределение длины текстов")
    plt.show()

# ==================== 2. ПОДГОТОВКА МОДЕЛИ ====================
class HybridModel(nn.Module):
    """Binary classifier that fuses TF-IDF/SVD features with BERT features.

    Each modality is projected to ``hidden_dim``. A small scoring network
    assigns one weight per modality (softmax over the two), and the weighted
    sum of the projections is fed to an MLP that ends in a sigmoid.

    Args:
        tfidf_dim: Size of the TF-IDF (SVD-reduced) feature vector.
        bert_dim: Size of the BERT feature vector.
        hidden_dim: Common projection size for both modalities.
        dropout: Dropout probability used in the projections and classifier.
    """
    def __init__(self, tfidf_dim, bert_dim, hidden_dim=128, dropout=0.3):
        super().__init__()
        # Per-modality projections into a shared hidden space
        self.tfidf_proj = nn.Sequential(
            nn.Linear(tfidf_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout)
        )
        self.bert_proj = nn.Sequential(
            nn.Linear(bert_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout))

        # Attention scorer. forward() stacks the two projections to
        # (batch, 2, hidden_dim), so the scorer sees hidden_dim features per
        # modality; an input size of hidden_dim*2 would fail with a shape
        # mismatch. Softmax over dim=1 normalises across the two modalities.
        self.attention = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 1),
            nn.Softmax(dim=1))

        # Classifier head producing a probability
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim//2, 1),
            nn.Sigmoid())

    def forward(self, tfidf_features, bert_features):
        """Return probabilities of shape ``(batch, 1)``.

        Args:
            tfidf_features: Tensor of shape ``(batch, tfidf_dim)``.
            bert_features: Tensor of shape ``(batch, bert_dim)``.
        """
        # Project both modalities to (batch, hidden_dim)
        tfidf_proj = self.tfidf_proj(tfidf_features)
        bert_proj = self.bert_proj(bert_features)

        # Stack to (batch, 2, hidden_dim), weight each modality, then sum
        combined = torch.cat([tfidf_proj.unsqueeze(1), bert_proj.unsqueeze(1)], dim=1)
        attn_weights = self.attention(combined)  # (batch, 2, 1)
        attended = (combined * attn_weights).sum(dim=1)

        return self.classifier(attended)

# ==================== 3. ОБУЧЕНИЕ И ВАЛИДАЦИЯ ====================
def _predict(model, bert_model, loader, device):
    """Run the encoder and the fusion head over ``loader`` without gradients.

    Both networks are switched to eval mode first.

    Returns:
        ``(predictions, labels)`` as flat lists; ``labels`` stays empty when
        the batches carry no ``'labels'`` entry.
    """
    model.eval()
    bert_model.eval()
    predictions, labels = [], []

    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            bert_outputs = bert_model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'])

            # The first token's hidden state is used as the text embedding.
            outputs = model(
                batch['tfidf_features'],
                bert_outputs.last_hidden_state[:, 0, :])

            predictions.extend(outputs.cpu().numpy().flatten())
            if 'labels' in batch:
                labels.extend(batch['labels'].cpu().numpy().flatten())

    return predictions, labels


def train_and_validate():
    """Train the hybrid detector with stratified K-fold CV and write a submission.

    For every fold: fit TF-IDF + SVD on the training split, fine-tune BERT
    together with a fresh ``HybridModel`` head, track validation ROC AUC with
    early stopping, then predict the test set with the weights of the best
    epoch. Fold predictions are averaged and written to ``submission.csv``;
    the mean and standard deviation of the per-fold best AUC are printed.

    Side effects: reads the CSV files named in ``CONFIG['data']``, writes
    ``best_model_fold{fold}.pt`` for each fold and ``submission.csv`` to the
    working directory, and shows the plots produced by ``analyze_dataset``.
    """
    # Setup
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load data
    train = pd.read_csv(CONFIG['data']['train_path'])
    test = pd.read_csv(CONFIG['data']['test_path'])

    # Clean texts
    train['text'] = train['text'].apply(clean_text)
    test['text'] = test['text'].apply(clean_text)

    # Exploratory analysis
    analyze_dataset(train, test)

    # TF-IDF + SVD (refit inside every fold on that fold's training split)
    tfidf = TfidfVectorizer(
        max_features=CONFIG['tfidf']['max_features'],
        ngram_range=CONFIG['tfidf']['ngram_range'])
    svd = TruncatedSVD(n_components=CONFIG['svd']['n_components'])

    # The tokenizer does not depend on the fold, so load it once.
    tokenizer = AutoTokenizer.from_pretrained(CONFIG['data']['bert_path'])

    # Cross-validation
    skf = StratifiedKFold(
        n_splits=CONFIG['data']['n_folds'],
        shuffle=True,
        random_state=CONFIG['data']['random_state'])

    fold_metrics = []
    test_preds = np.zeros(len(test))

    for fold, (train_idx, val_idx) in enumerate(skf.split(train['text'], train['label'])):
        print(f"\n=== Fold {fold+1}/{CONFIG['data']['n_folds']} ===")

        # Split
        train_df, val_df = train.iloc[train_idx], train.iloc[val_idx]

        # Fit the text features on the training split only
        tfidf_matrix = tfidf.fit_transform(train_df['text'])
        svd.fit(tfidf_matrix)

        # Datasets and loaders (TextDataset is defined outside this block)
        train_dataset = TextDataset(
            train_df['text'].tolist(),
            tfidf, svd, tokenizer,
            max_length=CONFIG['data']['max_length'],
            labels=train_df['label'].tolist())

        val_dataset = TextDataset(
            val_df['text'].tolist(),
            tfidf, svd, tokenizer,
            max_length=CONFIG['data']['max_length'],
            labels=val_df['label'].tolist())

        test_dataset = TextDataset(
            test['text'].tolist(),
            tfidf, svd, tokenizer,
            max_length=CONFIG['data']['max_length'])

        train_loader = DataLoader(
            train_dataset,
            batch_size=CONFIG['training']['batch_size'],
            shuffle=True)

        val_loader = DataLoader(
            val_dataset,
            batch_size=CONFIG['training']['batch_size'])

        test_loader = DataLoader(
            test_dataset,
            batch_size=CONFIG['training']['batch_size'])

        # Fresh encoder and head for every fold
        bert_model = AutoModel.from_pretrained(CONFIG['data']['bert_path']).to(device)
        model = HybridModel(
            tfidf_dim=CONFIG['svd']['n_components'],
            bert_dim=768,
            hidden_dim=CONFIG['model']['hidden_dim'],
            dropout=CONFIG['model']['dropout']).to(device)

        # Optimizer (separate learning rates for head and encoder) and scheduler
        optimizer = torch.optim.AdamW([
            {'params': model.parameters(), 'lr': CONFIG['model']['lr']['model']},
            {'params': bert_model.parameters(), 'lr': CONFIG['model']['lr']['bert']}
        ])

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=CONFIG['training']['warmup_steps'],
            num_training_steps=len(train_loader)*CONFIG['training']['epochs'])

        criterion = nn.BCELoss()

        # Training
        best_auc = 0
        patience = 0
        # CPU copy of the encoder weights from the best epoch. The encoder is
        # fine-tuned too, so restoring only the head would pair best-epoch
        # head weights with last-epoch encoder weights at test time.
        best_bert_state = None

        for epoch in range(CONFIG['training']['epochs']):
            model.train()
            bert_model.train()
            total_loss = 0

            for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
                batch = {k: v.to(device) for k, v in batch.items()}
                optimizer.zero_grad()

                bert_outputs = bert_model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'])

                outputs = model(
                    batch['tfidf_features'],
                    bert_outputs.last_hidden_state[:, 0, :])

                # The head returns (batch, 1). BCELoss needs float targets of
                # the same shape, so flatten both sides; this works whether
                # the dataset yields labels as (batch,) or (batch, 1).
                loss = criterion(
                    outputs.reshape(-1),
                    batch['labels'].float().reshape(-1))
                loss.backward()
                optimizer.step()
                scheduler.step()

                total_loss += loss.item()

            # Validation
            val_preds, val_labels = _predict(model, bert_model, val_loader, device)

            auc = roc_auc_score(val_labels, val_preds)
            print(f"Epoch {epoch+1}: Loss = {total_loss/len(train_loader):.4f}, Val AUC = {auc:.4f}")

            # Early stopping
            if auc > best_auc:
                best_auc = auc
                patience = 0
                torch.save(model.state_dict(), f'best_model_fold{fold}.pt')
                best_bert_state = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in bert_model.state_dict().items()
                }
            else:
                patience += 1
                if patience >= CONFIG['training']['patience']:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

        # Record the fold metric
        fold_metrics.append(best_auc)

        # Restore the best epoch (head and encoder) and predict the test set
        model.load_state_dict(torch.load(f'best_model_fold{fold}.pt'))
        if best_bert_state is not None:
            bert_model.load_state_dict(best_bert_state)

        fold_test_preds, _ = _predict(model, bert_model, test_loader, device)

        test_preds += np.array(fold_test_preds) / CONFIG['data']['n_folds']

    # Save the averaged predictions
    submission = pd.DataFrame({
        'id': test['id'],
        'generated': test_preds
    })
    submission.to_csv('submission.csv', index=False)

    print("\n=== ИТОГОВЫЕ МЕТРИКИ ===")
    print(f"Средний AUC по фолдам: {np.mean(fold_metrics):.4f} (±{np.std(fold_metrics):.4f})")

# Run the full training/prediction pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    train_and_validate()