In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Suppress TensorFlow warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Transformers and PyTorch
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    DistilBertTokenizer, 
    DistilBertForSequenceClassification,
    get_linear_schedule_with_warmup
)
from tqdm.auto import tqdm

# Environment report: confirm the import cell ran to completion and record
# the PyTorch build and CUDA visibility in the run log.
print('Libraries imported successfully!')
print(f'PyTorch version: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')

In [None]:
# Competition files (train, test, submission template) from the Kaggle input
# directory.
train_df = pd.read_csv('/kaggle/input/mercor-ai-detection/train.csv')
test_df = pd.read_csv('/kaggle/input/mercor-ai-detection/test.csv')
sample_sub = pd.read_csv('/kaggle/input/mercor-ai-detection/sample_submission.csv')

# External AI-vs-human corpus; the `text` and `generated` columns are the ones
# consumed by the balancing cell.
dataset2 = pd.read_csv("/kaggle/input/ai-vs-human-text/AI_Human.csv")

# Shapes of everything that was just loaded, one per line.
print(
    f"Competition train shape: {train_df.shape}",
    f"Competition test shape: {test_df.shape}",
    f"AI/Human dataset shape: {dataset2.shape}",
    sep="\n",
)

# First rows of the external corpus.
print("\nAI/Human dataset preview:", dataset2.head(), sep="\n")

# Class balance of the external corpus: absolute counts, then proportions.
generated_flags = dataset2['generated']
print(
    "\nAI/Human label distribution:",
    generated_flags.value_counts(),
    generated_flags.value_counts(normalize=True),
    sep="\n",
)

In [None]:
from sklearn.utils import resample

# Prepare large dataset: raw texts plus their integer `generated` flag
# (0 = human-written, 1 = AI-generated, as used by the masks below).
X_large = dataset2['text'].values
y_large = dataset2['generated'].values.astype(int)

n_total = len(y_large)
if n_total == 0:
    # Fail early with a clear message instead of a ZeroDivisionError in the
    # percentage report below.
    raise ValueError("AI/Human dataset is empty; nothing to balance.")

n_ai_total = int(y_large.sum())
n_human_total = n_total - n_ai_total

print("="*60)
print("Original Dataset Distribution")
print("="*60)
print(f"Total samples: {len(X_large)}")
print(f"AI-generated: {n_ai_total} ({n_ai_total/n_total*100:.2f}%)")
print(f"Human-written: {n_human_total} ({n_human_total/n_total*100:.2f}%)")

print("\n" + "="*60)
print("Balancing Dataset - Undersampling")
print("="*60)

# Separate by class
human_mask = y_large == 0
ai_mask = y_large == 1

X_human = X_large[human_mask]
X_ai = X_large[ai_mask]

print(f"\nBefore balancing:")
print(f"Human: {len(X_human)} samples")
print(f"AI: {len(X_ai)} samples")

# Target size is the smaller class. Sampling without replacement cannot draw
# more rows than a class has, so the larger class (whichever it is) is the one
# that gets cut down.
n_per_class = min(len(X_human), len(X_ai))
if n_per_class == 0:
    raise ValueError(
        "Cannot balance the dataset: at least one class has no samples "
        f"(human={len(X_human)}, AI={len(X_ai)})."
    )

# Only the larger class is resampled; the smaller one is kept untouched and in
# its original order, so results match the previous behaviour whenever human
# text is the majority class.
X_human_downsampled = (
    resample(X_human, replace=False, n_samples=n_per_class, random_state=42)
    if len(X_human) > n_per_class
    else X_human
)
X_ai_downsampled = (
    resample(X_ai, replace=False, n_samples=n_per_class, random_state=42)
    if len(X_ai) > n_per_class
    else X_ai
)
y_human_downsampled = np.zeros(n_per_class, dtype=int)
y_ai = np.ones(n_per_class, dtype=int)

# Combine balanced data
X_large = np.concatenate([X_human_downsampled, X_ai_downsampled])
y_large = np.concatenate([y_human_downsampled, y_ai])

# Shuffle with a fixed permutation so texts and labels stay aligned and the
# order is repeatable across runs.
shuffle_idx = np.random.RandomState(42).permutation(len(X_large))
X_large = X_large[shuffle_idx]
y_large = y_large[shuffle_idx]

n_human_balanced = int((y_large == 0).sum())
n_ai_balanced = int((y_large == 1).sum())

print(f"\nAfter balancing:")
print(f"Total samples: {len(X_large)}")
print(f"Human: {n_human_balanced} ({n_human_balanced/len(y_large)*100:.2f}%)")
print(f"AI: {n_ai_balanced} ({n_ai_balanced/len(y_large)*100:.2f}%)")
print("="*60)

In [None]:
# Load tokenizer
# `model_path` points at a Kaggle input directory; the same path is passed to
# `DistilBertForSequenceClassification.from_pretrained` in the training cell,
# so tokenizer and model are loaded from the same location.
model_path = r"/kaggle/input/distillbert-base-uncased/transformers/default/1/distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
print(f"Tokenizer loaded from {model_path}")

In [None]:
class CheatingDetectionDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of texts; each entry is converted with
            ``str()`` before tokenization.
        labels: Optional indexable collection of integer class labels aligned
            with ``texts``. When ``None`` the items carry no ``'labels'`` key.
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of 1-D tensors.

        The dict always has ``'input_ids'`` and ``'attention_mask'``; a
        ``'labels'`` long tensor is added only when labels were supplied.
        """
        encoded = self.tokenizer(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each field is a flat vector.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }

        if self.labels is None:
            return sample

        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

print("Dataset class created successfully!")

In [None]:
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one training pass over ``dataloader`` and report loss and AUC.

    For every batch: gradients are reset, the model is run with labels so it
    returns a loss, gradients are clipped to a norm of 1.0, and both the
    optimizer and the scheduler are stepped (the scheduler advances per batch).

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keywords; its output must expose ``.loss`` and ``.logits``.
        dataloader: Sized iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors).
        optimizer: Optimizer over the model parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, auc, predictions)``: mean batch loss, ROC-AUC of
        the class-1 probabilities against the labels, and the list of those
        probabilities in iteration order.
    """
    model.train()
    running_loss = 0
    predictions, true_labels = [], []

    batches = tqdm(dataloader, desc='Training')
    for batch in batches:
        optimizer.zero_grad()

        labels = batch['labels'].to(device)
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=labels,
        )

        batch_loss = outputs.loss.item()
        running_loss += batch_loss

        outputs.loss.backward()
        # Clip before stepping so a single bad batch cannot blow up the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Probability of class 1, computed from the pre-update forward pass.
        positive_probs = torch.softmax(outputs.logits, dim=1)[:, 1]
        predictions.extend(positive_probs.detach().cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

        batches.set_postfix({'loss': batch_loss})

    mean_loss = running_loss / len(dataloader)
    epoch_auc = roc_auc_score(true_labels, predictions)
    return mean_loss, epoch_auc, predictions

def eval_model(model, dataloader, device):
    """Score ``dataloader`` with ``model`` in eval mode, without gradients.

    Batches may or may not contain a ``'labels'`` entry. Labelled batches are
    forwarded with labels so the model returns a loss; unlabelled batches are
    forwarded without them.

    Args:
        model: Module called with ``input_ids``/``attention_mask`` (and
            ``labels`` when present); its output must expose ``.logits`` and,
            for labelled batches, ``.loss``.
        dataloader: Sized iterable of batch dicts.
        device: Device the batch tensors are moved to.

    Returns:
        ``(predictions, auc, avg_loss)`` when at least one label was seen,
        otherwise ``(predictions, None, None)``. ``predictions`` is the list of
        class-1 probabilities in iteration order.
    """
    model.eval()
    predictions, true_labels = [], []
    loss_sum = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating'):
            forward_kwargs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            labelled = 'labels' in batch
            if labelled:
                forward_kwargs['labels'] = batch['labels'].to(device)

            outputs = model(**forward_kwargs)

            if labelled:
                loss_sum += outputs.loss.item()
                true_labels.extend(forward_kwargs['labels'].cpu().numpy())

            positive_probs = torch.softmax(outputs.logits, dim=1)[:, 1]
            predictions.extend(positive_probs.cpu().numpy())

    # Inference-only run: there is nothing to score against.
    if not true_labels:
        return predictions, None, None

    mean_loss = loss_sum / len(dataloader)
    return predictions, roc_auc_score(true_labels, predictions), mean_loss

print("Training and evaluation functions created successfully!")

In [None]:
# Training configuration
MAX_LENGTH = 512          # tokens per sample after padding/truncation
BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 2e-5
VAL_SPLIT = 0.1           # fraction of the balanced data held out for validation
WARMUP_FRACTION = 0.1     # share of optimisation steps used for LR warm-up
SEED = 42                 # same value the balancing/shuffling cell uses

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Seed torch before the model and loaders are created: the classification head
# initialisation, dropout and the training DataLoader's shuffle order all draw
# from torch's RNG, so without this each run trains on a different trajectory.
torch.manual_seed(SEED)

# X_large and y_large are already prepared and balanced from previous cell
print(f"\nUsing balanced dataset:")
print(f"Total samples: {len(X_large)}")
print(f"Human: {sum(y_large == 0)} ({sum(y_large == 0)/len(y_large)*100:.2f}%)")
print(f"AI: {sum(y_large == 1)} ({sum(y_large == 1)/len(y_large)*100:.2f}%)")

# Split large dataset (stratified so both splits keep the class balance)
X_train_large, X_val_large, y_train_large, y_val_large = train_test_split(
    X_large, y_large, test_size=VAL_SPLIT, random_state=SEED, stratify=y_large
)

print(f"\nTrain size: {len(X_train_large)}")
print(f"Val size: {len(X_val_large)}")

# Create datasets
train_dataset = CheatingDetectionDataset(X_train_large, y_train_large, tokenizer, MAX_LENGTH)
val_dataset = CheatingDetectionDataset(X_val_large, y_val_large, tokenizer, MAX_LENGTH)

# Only the training loader shuffles; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# Initialize model (two output classes: human vs. AI-generated)
model = DistilBertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2
)
model.to(device)

# Optimizer and scheduler. The scheduler is stepped once per batch in
# `train_epoch`, so the schedule length is batches-per-epoch times epochs.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(WARMUP_FRACTION * total_steps),
    num_training_steps=total_steps
)

# Training
print("\n" + "="*60)
print("STAGE 1: Training on Large Balanced AI/Human Dataset")
print("="*60)

CHECKPOINT_PATH = 'best_model_stage1.pth'

best_val_loss = float('inf')
# Start below any achievable AUC so the first completed epoch always writes a
# checkpoint; starting at 0 with a strict ">" could leave no file (or a stale
# one from an earlier run) for the reload below.
best_val_auc = float('-inf')

for epoch in range(EPOCHS):
    print(f'\n--- Epoch {epoch + 1}/{EPOCHS} ---')

    train_loss, train_auc, _ = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_preds, val_auc, val_loss = eval_model(model, val_loader, device)

    print(f'Train Loss: {train_loss:.4f}, Train AUC: {train_auc:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val AUC: {val_auc:.4f}')

    # Model selection is by validation AUC; the matching loss is only recorded.
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        best_val_loss = val_loss
        torch.save(model.state_dict(), CHECKPOINT_PATH)
        print(f'✓ Model saved! Best Val AUC: {best_val_auc:.4f}')

if best_val_auc == float('-inf'):
    # No epoch ran, so nothing was saved in this run. Stop here rather than
    # load whatever file may be left over from a previous session.
    raise RuntimeError(
        f"Stage 1 did not produce a checkpoint (EPOCHS={EPOCHS}); "
        f"refusing to load '{CHECKPOINT_PATH}'."
    )

print(f"\n{'='*60}")
print(f"Stage 1 Complete - Best Val AUC: {best_val_auc:.4f}")
print(f"{'='*60}")

# Load best model. map_location keeps the reload working when the checkpoint
# was written on a different device than the one currently selected.
model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))
print("Loaded best model from Stage 1")

In [None]:
# Plot prediction distribution
# NOTE(review): `oof_predictions`, `y` and `test_predictions` are not assigned
# in any cell visible in this notebook; confirm an upstream cell produces them,
# otherwise this cell fails on Restart & Run All.
fig, (ax_oof, ax_test) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: out-of-fold scores, one overlaid histogram per true class.
for class_value, class_name in ((0, 'Not Cheating'), (1, 'Cheating')):
    ax_oof.hist(oof_predictions[y == class_value], bins=30, alpha=0.5, label=class_name)
ax_oof.set_xlabel('Prediction Probability')
ax_oof.set_ylabel('Count')
ax_oof.set_title('OOF Predictions Distribution')
ax_oof.legend()

# Right panel: scores on the test set (no labels, so a single histogram).
ax_test.hist(test_predictions, bins=30, alpha=0.7)
ax_test.set_xlabel('Prediction Probability')
ax_test.set_ylabel('Count')
ax_test.set_title('Test Predictions Distribution')

fig.tight_layout()
plt.show()

In [None]:
# Create submission file
# NOTE(review): `test_predictions` is not assigned in any cell visible in this
# notebook; confirm an upstream cell produces one score per row of `test_df`.
submission = test_df[['id']].assign(is_cheating=test_predictions)

# Check submission format: shape, first rows, and score summary statistics.
print("Submission shape:", submission.shape)
print("\nSubmission preview:", submission.head(10), sep="\n")
print("\nPrediction statistics:", submission['is_cheating'].describe(), sep="\n")

# Save submission (index is dropped so only `id` and `is_cheating` are written)
submission.to_csv('submission50.csv', index=False)
print("\nSubmission saved to 'submission50.csv'")

In [None]:
# Classification metrics at threshold 0.5
# NOTE(review): `oof_predictions` and `y` are not assigned in any cell visible
# in this notebook; confirm an upstream cell produces them (scores and true
# labels, aligned row by row).
threshold = 0.5
oof_binary = (oof_predictions > threshold).astype(int)

print("Classification Report (OOF Predictions):")
print("="*50)
print(classification_report(y, oof_binary, target_names=['Not Cheating', 'Cheating']))

# Confusion Matrix (rows: true label, columns: predicted label)
cm = confusion_matrix(y, oof_binary)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix (OOF Predictions)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Compute the headline AUC here, from the same labels and scores used above,
# instead of relying on an `overall_auc` variable left behind by another cell.
# AUC is threshold-free, so it uses the raw scores rather than `oof_binary`.
overall_auc = roc_auc_score(y, oof_predictions)
print(f"\nFinal ROC-AUC Score: {overall_auc:.4f}")

In [None]:
# The AI/Human corpus is already loaded by the data-loading cell near the top
# of the notebook. Reuse that frame and only read the CSV when this cell is run
# on its own, so a full run does not parse the large file twice.
if 'dataset2' not in globals():
    dataset2 = pd.read_csv("/kaggle/input/ai-vs-human-text/AI_Human.csv")
print(f"Training set shape: {dataset2.shape}")

# Display first few rows
dataset2.head()