# In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch.nn.functional as F
import matplotlib.pyplot as plt
import warnings

# Configuration
# Central hyper-parameter dictionary read by train_fold() and main().
CONFIG = {
    # Keyword arguments unpacked into TfidfVectorizer in train_fold().
    'tfidf': {
        'max_features': 10000,
        'ngram_range': (1, 2)
    },
    # Keyword arguments unpacked into TruncatedSVD in train_fold();
    # n_components is also passed to HybridModel as its tfidf_dim.
    'svd': {
        'n_components': 300
    },
    'model': {
        # hidden_dim and dropout are passed straight to HybridModel.
        'hidden_dim': 256,
        'dropout': 0.5,
        # Number of trailing BERT encoder layers that train_fold() leaves
        # trainable; all other BERT parameters are frozen.
        'bert_train_layers': 4,
        # Per-parameter-group learning rates for the AdamW optimizer:
        # 'model' for HybridModel, 'bert' for the BERT encoder.
        'lr': {
            'model': 1e-4,
            'bert': 2e-5
        }
    },
    'training': {
        # Batch size used for the train, validation and test DataLoaders.
        'batch_size': 32,
        # Number of training epochs; also used to compute the scheduler's
        # total number of training steps.
        'epochs': 10,
        # Warm-up steps for the linear learning-rate schedule.
        'warmup_steps': 100,
        # NOTE(review): val_size is not referenced by the code shown in this
        # file; validation splits come from StratifiedKFold instead.
        'val_size': 0.2,
        # Seed and fold count for StratifiedKFold in main().
        'random_state': 42,
        'n_folds': 5,
        # Presumably the early-stopping patience in epochs -- verify how
        # train_fold() uses it.
        'patience': 3       # New parameter
    }
}

class CrossModalAttention(nn.Module):
    """Fuse the TF-IDF and BERT projections with scaled dot-product attention.

    The BERT projection supplies the query. The TF-IDF and BERT projections
    are stacked into a two-token sequence that supplies the keys and values,
    so the softmax distributes weight between the two modalities.

    A previous formulation scored a single query against a single key, which
    yields one score per sample; a softmax over one element is always 1.0, so
    the output collapsed to ``value(tfidf_proj)`` and the BERT branch neither
    influenced the result nor received any gradient. Attending over two
    tokens avoids that degenerate case while keeping the same three linear
    layers (and therefore the same state-dict keys) and the same output
    shape.

    Args:
        hidden_dim: Width of both input projections and of the output.
    """

    def __init__(self, hidden_dim=256):
        super().__init__()
        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, tfidf_proj, bert_proj):
        """Return the attention-weighted mix of the two modalities.

        Args:
            tfidf_proj: Tensor of shape (batch, hidden_dim).
            bert_proj: Tensor of shape (batch, hidden_dim).

        Returns:
            Tensor of shape (batch, hidden_dim).
        """
        # (batch, 2, hidden): token 0 is TF-IDF, token 1 is BERT.
        tokens = torch.stack((tfidf_proj, bert_proj), dim=1)

        q = self.query(bert_proj).unsqueeze(1)   # (batch, 1, hidden)
        k = self.key(tokens)                     # (batch, 2, hidden)
        v = self.value(tokens)                   # (batch, 2, hidden)

        # (batch, 1, 2): one score per modality, scaled by sqrt(hidden_dim).
        scores = torch.bmm(q, k.transpose(1, 2)) / (self.key.out_features ** 0.5)
        attention = F.softmax(scores, dim=-1)

        # (batch, 1, hidden) -> (batch, hidden)
        return torch.bmm(attention, v).squeeze(1)

class HybridModel(nn.Module):
    """Binary classifier combining reduced TF-IDF features with BERT features.

    Each input is projected to ``hidden_dim``, the two projections are fused
    by ``CrossModalAttention``, and a small MLP ending in a sigmoid maps the
    fused vector to a single value in (0, 1) per sample.

    Args:
        tfidf_dim: Width of the TF-IDF feature vector.
        bert_dim: Width of the BERT feature vector.
        hidden_dim: Shared projection width.
        dropout: Dropout probability used in the projections and classifier.
    """

    def __init__(self, tfidf_dim, bert_dim, hidden_dim=256, dropout=0.5):
        super().__init__()
        self.tfidf_proj = self._make_projection(tfidf_dim, hidden_dim, dropout)
        self.bert_proj = self._make_projection(bert_dim, hidden_dim, dropout)
        self.cross_attn = CrossModalAttention(hidden_dim)

        half_dim = hidden_dim // 2
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, half_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(half_dim, 1),
            nn.Sigmoid(),
        )

    @staticmethod
    def _make_projection(in_dim, out_dim, dropout):
        """Build the Linear -> BatchNorm1d -> GELU -> Dropout input block."""
        return nn.Sequential(
            nn.Linear(in_dim, out_dim),
            nn.BatchNorm1d(out_dim),
            nn.GELU(),
            nn.Dropout(dropout),
        )

    def forward(self, tfidf_features, bert_features):
        """Return the sigmoid output for a batch, one value per sample."""
        fused = self.cross_attn(
            self.tfidf_proj(tfidf_features),
            self.bert_proj(bert_features),
        )
        return self.classifier(fused)

class TextDataset(Dataset):
    """Dataset yielding reduced TF-IDF features and tokenizer output per text.

    Args:
        texts: Indexable collection of raw strings.
        tfidf_vectorizer: Fitted object whose ``transform`` returns something
            with ``toarray()``.
        svd: Fitted object whose ``transform`` reduces the TF-IDF row.
        tokenizer: Callable producing ``input_ids`` and ``attention_mask``.
        labels: Optional indexable collection of targets aligned with
            ``texts``; when omitted, items carry no ``'labels'`` entry.
    """

    def __init__(self, texts, tfidf_vectorizer, svd, tokenizer, labels=None):
        self.texts = texts
        self.labels = labels
        self.tfidf_vectorizer = tfidf_vectorizer
        self.svd = svd
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def _reduced_tfidf(self, text):
        """Vectorize one text and pass the dense row through the SVD."""
        dense_row = self.tfidf_vectorizer.transform([text]).toarray()[0]
        return self.svd.transform(dense_row.reshape(1, -1))[0]

    def __getitem__(self, idx):
        text = self.texts[idx]
        reduced = self._reduced_tfidf(text)

        # Every sample is padded/truncated to 128 tokens so batches stack.
        encoded = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt',
        )

        sample = {
            'tfidf_features': torch.FloatTensor(reduced),
            # The tokenizer output has a leading dimension of 1; drop it.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
        }

        if self.labels is None:
            return sample

        sample['labels'] = torch.FloatTensor([self.labels[idx]])
        return sample

def evaluate(model, val_loader, bert_model, device):
    """Score ``model`` on a labelled loader and return AUC, F1 and accuracy.

    Args:
        model: Module called as ``model(tfidf_features, bert_features)``;
            it is switched to eval mode here.
        val_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask``, ``tfidf_features`` and ``labels`` tensors.
        bert_model: Encoder whose output exposes ``last_hidden_state``.
        device: Device every batch tensor is moved to.

    Returns:
        Dict with keys ``'auc'``, ``'f1'`` and ``'acc'``. F1 and accuracy
        use predictions thresholded at 0.5.
    """
    model.eval()
    scores, targets = [], []

    with torch.no_grad():
        for raw_batch in val_loader:
            batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}

            # Use the hidden state at sequence position 0 as the text feature.
            first_token_state = bert_model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
            ).last_hidden_state[:, 0, :]

            probs = model(batch['tfidf_features'], first_token_state)
            scores.extend(probs.cpu().numpy().flatten())
            targets.extend(batch['labels'].cpu().numpy().flatten())

    hard_preds = (np.array(scores) > 0.5).astype(int)
    return {
        'auc': roc_auc_score(targets, scores),
        'f1': f1_score(targets, hard_preds),
        'acc': accuracy_score(targets, hard_preds),
    }

def train_fold(train_df, val_df, test_texts, fold, device):
    """Train one cross-validation fold and predict the test set.

    The TF-IDF vectorizer and the SVD are fitted on ``train_df`` only. The
    HybridModel head and the last ``CONFIG['model']['bert_train_layers']``
    BERT encoder layers are optimized with AdamW and a linear warm-up
    schedule. After every epoch the fold is scored on ``val_df``; the weights
    with the best validation AUC are used for the test predictions.

    Training stops early once the validation AUC has not improved for
    ``CONFIG['training']['patience']`` consecutive epochs (no early stopping
    if that key is absent).

    Side effect: writes the best head weights to ``best_fold{fold}.pt``.

    Args:
        train_df: DataFrame with ``'text'`` and ``'label'`` columns.
        val_df: DataFrame with ``'text'`` and ``'label'`` columns.
        test_texts: List of raw test strings.
        fold: Fold index, used in the progress bar and checkpoint file name.
        device: Torch device to run on.

    Returns:
        Tuple ``(best_auc, test_preds)`` where ``test_preds`` is a 1-D numpy
        array with one predicted probability per test text.
    """
    train_cfg = CONFIG['training']
    model_cfg = CONFIG['model']
    batch_size = train_cfg['batch_size']
    checkpoint_path = f'best_fold{fold}.pt'

    tfidf_vectorizer = TfidfVectorizer(**CONFIG['tfidf'])
    tfidf_matrix = tfidf_vectorizer.fit_transform(train_df['text'])
    svd = TruncatedSVD(**CONFIG['svd']).fit(tfidf_matrix)

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # HybridModel uses BatchNorm1d, which raises in training mode when a
    # batch holds a single sample. Drop the trailing batch only in that case.
    n_train = len(train_df)
    drop_last = n_train > batch_size and n_train % batch_size == 1

    train_loader = DataLoader(
        TextDataset(train_df['text'].tolist(), tfidf_vectorizer, svd, tokenizer, train_df['label'].tolist()),
        batch_size=batch_size,
        shuffle=True,
        drop_last=drop_last
    )

    val_loader = DataLoader(
        TextDataset(val_df['text'].tolist(), tfidf_vectorizer, svd, tokenizer, val_df['label'].tolist()),
        batch_size=batch_size
    )

    test_loader = DataLoader(
        TextDataset(test_texts, tfidf_vectorizer, svd, tokenizer),
        batch_size=batch_size
    )

    bert_model = AutoModel.from_pretrained("bert-base-uncased").to(device)
    for param in bert_model.parameters():
        param.requires_grad = False
    # Guard against 0: ``layers[-0:]`` would select every layer, not none.
    n_trainable_layers = model_cfg['bert_train_layers']
    if n_trainable_layers > 0:
        for layer in bert_model.encoder.layer[-n_trainable_layers:]:
            for param in layer.parameters():
                param.requires_grad = True

    # Only these parameters can change, so only they are optimized and
    # snapshotted.
    trainable_bert = {
        name: param
        for name, param in bert_model.named_parameters()
        if param.requires_grad
    }

    model = HybridModel(
        tfidf_dim=CONFIG['svd']['n_components'],
        bert_dim=768,
        hidden_dim=model_cfg['hidden_dim'],
        dropout=model_cfg['dropout']
    ).to(device)

    param_groups = [
        {'params': model.parameters(), 'lr': model_cfg['lr']['model']}
    ]
    if trainable_bert:
        param_groups.append(
            {'params': list(trainable_bert.values()), 'lr': model_cfg['lr']['bert']}
        )
    optimizer = torch.optim.AdamW(param_groups)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=train_cfg['warmup_steps'],
        num_training_steps=len(train_loader) * train_cfg['epochs']
    )

    patience = train_cfg.get('patience')
    best_auc = 0
    best_bert_state = None
    checkpoint_saved = False
    epochs_without_improvement = 0

    for epoch in range(train_cfg['epochs']):
        model.train()
        for batch in tqdm(train_loader, desc=f'Fold {fold} Epoch {epoch}'):
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()

            bert_outputs = bert_model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask']
            )
            outputs = model(
                batch['tfidf_features'],
                bert_outputs.last_hidden_state[:, 0, :]
            )
            loss = F.binary_cross_entropy(outputs, batch['labels'])
            loss.backward()
            optimizer.step()
            scheduler.step()

        val_metrics = evaluate(model, val_loader, bert_model, device)
        # The first evaluated epoch always counts as an improvement so that
        # a checkpoint exists even if its AUC is exactly 0.
        if not checkpoint_saved or val_metrics['auc'] > best_auc:
            best_auc = val_metrics['auc']
            epochs_without_improvement = 0
            torch.save(model.state_dict(), checkpoint_path)
            # Keep the matching fine-tuned BERT weights; otherwise the best
            # head would be paired with last-epoch BERT at prediction time.
            best_bert_state = {
                name: param.detach().cpu().clone()
                for name, param in trainable_bert.items()
            }
            checkpoint_saved = True
        else:
            epochs_without_improvement += 1
            if patience is not None and epochs_without_improvement >= patience:
                break

    # Restore the best epoch. Skipped when no epoch ran, so a stale file
    # from an earlier run is never loaded.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        # strict=False: the snapshot holds only the trainable subset.
        bert_model.load_state_dict(best_bert_state, strict=False)

    model.eval()
    test_preds = []
    with torch.no_grad():
        for batch in test_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            bert_outputs = bert_model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask']
            )
            outputs = model(
                batch['tfidf_features'],
                bert_outputs.last_hidden_state[:, 0, :]
            )
            test_preds.extend(outputs.cpu().numpy().flatten())

    return best_auc, np.array(test_preds)

def main():
    """Run stratified k-fold training and write the averaged submission.

    Reads ``train.csv`` and ``test.csv`` from the working directory, trains
    one model per fold with ``train_fold``, averages the per-fold test
    predictions, prints the mean of the per-fold best validation AUCs, and
    writes ``submission.csv`` with ``id`` and ``generated`` columns.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")

    training_cfg = CONFIG['training']
    n_folds = training_cfg['n_folds']
    splitter = StratifiedKFold(
        n_splits=n_folds,
        shuffle=True,
        random_state=training_cfg['random_state'],
    )

    fold_aucs = []
    averaged_preds = np.zeros(len(test))

    splits = splitter.split(train['text'], train['label'])
    for fold, (train_idx, val_idx) in enumerate(splits):
        fold_train = train.iloc[train_idx]
        fold_val = train.iloc[val_idx]
        fold_auc, fold_preds = train_fold(
            fold_train, fold_val, test['text'].tolist(), fold, device
        )
        fold_aucs.append(fold_auc)
        # Each fold contributes an equal share of the final prediction.
        averaged_preds += fold_preds / n_folds

    print(f"Mean AUC: {np.mean(fold_aucs):.4f}")
    submission = pd.DataFrame({'id': test['id'], 'generated': averaged_preds})
    submission.to_csv('submission.csv', index=False)


if __name__ == "__main__":
    main()