In [1]:
import re
import unicodedata
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from bs4 import BeautifulSoup
import spacy
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import os
from transformers import AutoModel, AutoTokenizer, LongformerModel, LongformerTokenizer
from sklearn.decomposition import PCA
import random

def set_all_seeds(seed=42):
    """Set seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # For multi-GPU setups
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    print(f"All random seeds set to {seed}")

# Call this function at the beginning of your code
set_all_seeds(42)  # You can change 42 to any other seed value

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EMBED_DIM=30
# -------------------------------
# Setup SpaCy with GPU support if available
def setup_spacy():
    if spacy.prefer_gpu():
        print("Using GPU for SpaCy")
    else:
        print("Using CPU for SpaCy")
    return spacy.load("en_core_web_sm")

# -------------------------------
# Text Preprocessor
class TextPreprocessor:
    def __init__(self):
        self.nlp = setup_spacy()

    def clean_text(self, text: str) -> str:
        text = BeautifulSoup(text, "lxml").get_text()
        text = re.sub(r"http\S+|www\.\S+", "", text)
        text = re.sub(r"\S+@\S+", "", text)
        text = re.sub(r"[^A-Za-z0-9\s.,;:?!'-]", "", text)
        return text

    def normalize_unicode(self, text: str) -> str:
        return unicodedata.normalize('NFKD', text)

    def normalize_case(self, text: str) -> str:
        return text.lower()

    def process_text(self, text: str):
        if not isinstance(text, str) or not text.strip():
            return {"sentences": [""], "full_text": ""}  # Return placeholder for empty text
        cleaned = self.clean_text(text)
        normalized = self.normalize_case(self.normalize_unicode(cleaned))
        doc = self.nlp(normalized)
        sentences = [sent.text.strip() for sent in doc.sents]
        
        # Ensure at least one sentence exists
        if not sentences:
            sentences = [""]  # Add empty placeholder
            
        return {"sentences": sentences, "full_text": normalized}





class EmbeddingGenerator:
    def __init__(self, model_name='roberta-base', output_dim=50):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device).eval()
        self.output_dim = output_dim
        self.model_name = model_name
        
        for param in self.model.parameters():
            param.requires_grad = False

        # Add projection layer to reduce from RoBERTa's 768 dimensions to output_dim
        self.projection = nn.Linear(768, output_dim).to(self.device)

        # Initialize the projection weights - optional but can help with convergence
        nn.init.xavier_uniform_(self.projection.weight)
        nn.init.zeros_(self.projection.bias)

    def get_embedding(self, text, pooling='cls'):
        if not text or not isinstance(text, str) or not text.strip():
            return np.zeros(self.output_dim)  # Return zero vector for empty text

        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True).to(self.device)
        with torch.no_grad():  # This ensures no gradients are tracked
            outputs = self.model(**inputs)

            if pooling == 'cls':
                embedding = outputs.last_hidden_state[:, 0, :]
            else:
                embedding = outputs.last_hidden_state.mean(dim=1)

            # Project down to output_dim
            reduced_embedding = self.projection(embedding)

            # Use detach() before converting to numpy
            return reduced_embedding.detach().cpu().numpy()[0]

    def batch_generate_embeddings(self, texts, pooling="cls", batch_size=8):
        """Generate embeddings in smaller batches to avoid memory issues"""
        all_embeddings = []

        # Convert to list if it's a pandas Series
        if isinstance(texts, pd.Series):
            texts = texts.tolist()

        # Process in smaller batches
        for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
            batch_texts = texts[i : i + batch_size]

            # Skip empty batches
            if not batch_texts:
                continue

            # Filter out empty texts
            valid_indices = []
            valid_texts = []
            for j, text in enumerate(batch_texts):
                if isinstance(text, str) and text.strip():
                    valid_indices.append(j)
                    valid_texts.append(text)

            if not valid_texts:
                # Add zeros for empty batch
                batch_embeddings = np.zeros((len(batch_texts), self.output_dim))
            else:
                # Process valid texts
                inputs = self.tokenizer(
                    valid_texts,
                    return_tensors="pt",
                    truncation=True,
                    max_length=512,
                    padding=True,
                ).to(self.device)

                with torch.no_grad():
                    outputs = self.model(**inputs)

                # Pooling
                if pooling == "cls":
                    embeddings = outputs.last_hidden_state[:, 0, :]
                else:
                    embeddings = outputs.last_hidden_state.mean(dim=1)

                # Project down to output_dim - ADD DETACH HERE
                reduced_embeddings = self.projection(embeddings).detach().cpu().numpy()

                # Create batch embeddings with zeros for invalid texts
                batch_embeddings = np.zeros((len(batch_texts), self.output_dim))
                for k, idx in enumerate(valid_indices):
                    batch_embeddings[idx] = reduced_embeddings[k]

            all_embeddings.append(batch_embeddings)

            # Free up memory
            torch.cuda.empty_cache()

        # Concatenate all batches
        if all_embeddings:
            return np.vstack(all_embeddings)
        else:
            return np.empty((0, self.output_dim))
            
    def save(self, save_path):
        """Save the embedding generator to disk"""
        save_dict = {
            "model_name": self.model_name,
            "output_dim": self.output_dim,
            "projection_state": self.projection.state_dict()
        }
        torch.save(save_dict, save_path)
        print(f"EmbeddingGenerator saved to {save_path}")
    
    @classmethod
    def load(cls, save_path):
        """Load an embedding generator from disk"""
        save_dict = torch.load(save_path, map_location=torch.device('cpu'))
        
        # Create a new instance
        instance = cls(
            model_name=save_dict["model_name"],
            output_dim=save_dict["output_dim"]
        )
        
        # Load the projection weights
        instance.projection.load_state_dict(save_dict["projection_state"])
        
        print(f"EmbeddingGenerator loaded from {save_path}")
        return instance


# -------------------------------
# Dataset for training
class TextDataset(Dataset):
    def __init__(self, sentence_embeds, full_text_embeds, labels):
        self.sentence_embeds = sentence_embeds  # List of tensors with variable lengths
        self.full_text_embeds = full_text_embeds  # Single tensor
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Ensure consistent float32 dtype
        sent_embed = self.sentence_embeds[idx]
        if sent_embed.dtype != torch.float32:
            sent_embed = sent_embed.to(torch.float32)
            
        full_embed = self.full_text_embeds[idx]
        if full_embed.dtype != torch.float32:
            full_embed = full_embed.to(torch.float32)
            
        return {
            "sentence_embeds": sent_embed,
            "full_text_embeds": full_embed,
            "label": torch.tensor(self.labels[idx], dtype=torch.float32)
        }

# Updated collate function with consistent data types
def collate_fn(batch):
    # Extract components from the batch
    sentence_embeds = [item["sentence_embeds"] for item in batch]
    full_text_embeds = torch.stack([item["full_text_embeds"] for item in batch])
    labels = torch.tensor([item["label"] for item in batch], dtype=torch.float32)
    
    # Get the maximum number of sentences in this batch
    max_sentences = max(embed.size(0) for embed in sentence_embeds)
    
    # Create a padded tensor for sentence embeddings with explicit dtype
    batch_size = len(batch)
    padded_sentence_embeds = torch.zeros(batch_size, max_sentences, EMBED_DIM, dtype=torch.float32)
    
    # Fill the padded tensor
    for i, embed in enumerate(sentence_embeds):
        num_sentences = embed.size(0)
        padded_sentence_embeds[i, :num_sentences, :] = embed.to(torch.float32)
    
    return {
        "sentence_embeds": padded_sentence_embeds.to(device),
        "full_text_embeds": full_text_embeds.to(device),
        "label": labels.to(device)
    }

class BiLSTMModel(nn.Module):
    def __init__(self, input_dim=EMBED_DIM, hidden_dim=32, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, 
                          batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 1)
        
    def forward(self, x):
        # Avoid boolean tensor checks and use size() directly
        if x.size(0) == 0:  # Empty batch
            return torch.zeros(0, 1, device=x.device)
            
        if x.size(1) == 0:  # No sentences
            return torch.zeros(x.size(0), 1, device=x.device)
            
        # Use .shape for clarity and safe checking
        batch_size = x.shape[0]
        seq_len = x.shape[1]
        
        if seq_len == 0:
            return torch.zeros(batch_size, 1, device=x.device)
            
        outputs, (hidden, _) = self.lstm(x)
        
        # Explicitly reshape for safety
        h = torch.cat((hidden[-2], hidden[-1]), dim=1)
        out = self.fc(h)
        return torch.sigmoid(out)

# Add the missing MLPModel class
class MLPModel(nn.Module):
    def __init__(self, input_dim=EMBED_DIM, hidden_dim=32):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, 1)
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        return torch.sigmoid(self.fc3(x))

# Fix the WeightedEnsemble class
class WeightedEnsemble(nn.Module):
    def __init__(self):
        super().__init__()
        self.mlp = MLPModel()
        self.bilstm = BiLSTMModel()
        self.weight = nn.Parameter(torch.tensor(0.5))
        
    def forward(self, sentence_embeds, full_text_embeds):
        # Process through MLP
        mlp_out = self.mlp(full_text_embeds)
        
        # Process through BiLSTM
        bilstm_out = self.bilstm(sentence_embeds)
        
        # Ensure consistent shapes
        batch_size = full_text_embeds.size(0)
        mlp_out = mlp_out.view(batch_size)
        bilstm_out = bilstm_out.view(batch_size)
        
        # Weighted combination
        w = torch.sigmoid(self.weight)
        return w * mlp_out + (1-w) * bilstm_out

def train_model(model, dataloader, epochs=10, lr=0.001):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()
    device = next(model.parameters()).device
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0
        
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()
            sentence_input = batch["sentence_embeds"].to(device)     # (B, num_sentences, EMBED_DIM)
            full_text_input = batch["full_text_embeds"].to(device)   # (B, EMBED_DIM)
            labels = batch["label"].to(device).view(-1)              # (B,)
            
            # Handle potential empty batches safely
            if sentence_input.shape[0] == 0:
                continue  # Skip empty batches
                
            outputs = model(sentence_input, full_text_input).squeeze()
            
            # Ensure output and label shapes match
            if outputs.shape != labels.shape:
                outputs = outputs.view(-1)
                
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            # Calculate accuracy - don't use boolean operations on tensors
            predicted = torch.zeros_like(outputs)
            predicted[outputs > 0.5] = 1.0
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            
        epoch_loss = total_loss / len(dataloader)
        epoch_acc = correct / total * 100
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%")
        
        # Save checkpoint after each epoch
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': epoch_loss,
        }, f"model_checkpoint_epoch_{epoch+1}.pt")
    
    print("Training complete.")

# -------------------------------
# Evaluation Function
def evaluate_model(model, dataloader):
    model.eval()
    preds, true_labels = [], []
    device = next(model.parameters()).device
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            sentence_input = batch["sentence_embeds"].to(device)
            full_text_input = batch["full_text_embeds"].to(device)
            labels = batch["label"].to(device)
            
            outputs = model(sentence_input, full_text_input).squeeze()
            
            # Fix: Convert to numpy and ensure it's always a list/array
            if outputs.ndim == 0:  # If output is a scalar
                preds.append(outputs.cpu().item())  # Use item() for scalar values
            else:
                preds.extend(outputs.cpu().numpy())
                
            # Do the same for labels
            if labels.ndim == 0:
                true_labels.append(labels.cpu().item())
            else:
                true_labels.extend(labels.cpu().numpy())
    
    # Binarize predictions with threshold 0.5
    preds_binary = [1 if p > 0.5 else 0 for p in preds]
    acc = accuracy_score(true_labels, preds_binary)
    f1 = f1_score(true_labels, preds_binary)
    auc = roc_auc_score(true_labels, preds)
    
    print("Test Results:")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC-ROC: {auc:.4f}")
    
    return {"accuracy": acc, "f1": f1, "auc": auc}

# -------------------------------
# Testing Function for a single text sample
def test_model(model, text_data, preprocessor, embedding_generator):
    model.eval()
    device = next(model.parameters()).device
    
    with torch.no_grad():
        processed = preprocessor.process_text(text_data)
        
        # Handle empty text
        if not processed["sentences"]:
            print("Empty text provided.")
            return 0.0
            
        # Sentence embeddings: list of embeddings for each sentence
        sentence_embeds = [embedding_generator.get_embedding(sent, pooling='cls') for sent in processed["sentences"]]
        sentence_embeds = torch.tensor(sentence_embeds, dtype=torch.float32).unsqueeze(0).to(device)
        
        # Full text embedding
        full_text_embeds = torch.tensor(embedding_generator.get_embedding(processed["full_text"], pooling='cls'),
                                      dtype=torch.float32).unsqueeze(0).to(device)
        
        prob = model(sentence_embeds, full_text_embeds).item()
        print(f"Probability of AI-generated text: {prob:.4f}")
        print("Prediction:", "AI-Generated" if prob > 0.5 else "Human-Written")
        
        return prob

All random seeds set to 42


In [2]:


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
embedding_generator = EmbeddingGenerator(model_name='roberta-base',output_dim=EMBED_DIM)

preprocessor = TextPreprocessor()
try:
    df = pd.read_csv("/kaggle/input/combined-dataset-ai-human/combined_data.csv")
    print(f"Loaded dataset with {len(df)} samples")
except FileNotFoundError:
    print("train.csv file not found.")
    exit()
# Process texts using the two columns 'text' and 'ai'
print("Processing texts...")
processed_data = [preprocessor.process_text(text) for text in tqdm(df['text'], desc="Processing texts")]

# Check for empty document counts
empty_count = sum(1 for p in processed_data if not p["sentences"] or p["sentences"] == [""]) 
print(f"Found {empty_count} documents with no sentences out of {len(processed_data)} total")

df["processed_sentences"] = [p["sentences"] for p in processed_data]
df["processed_full_text"] = [p["full_text"] for p in processed_data]
df.to_csv("train_processed.csv", index=False)
print("Preprocessing complete. Saved to 'train_processed.csv'.")
# Generate sentence embeddings with memory-efficient approach
print("Generating sentence embeddings...")
sentence_embeddings = []
for sentences in tqdm(df["processed_sentences"], desc="Sentence Embeddings"):
    # Generate embeddings for each sentence in this document
    doc_embeds = []
    for sent in sentences:
        if sent:  # Skip empty sentences
            embed = embedding_generator.get_embedding(sent, pooling='cls')
            doc_embeds.append(embed)
    
    # Ensure at least one embedding for empty documents
    if not doc_embeds:
        doc_embeds = [np.zeros(EMBED_DIM)]  # Add a dummy embedding
        
    sentence_embeddings.append(doc_embeds)
    
    # Periodically save to avoid losing progress
    # if len(sentence_embeddings) % 100 == 0:
    #     torch.save(sentence_embeddings, "sentence_embeddings_partial.pt")

# Save final sentence embeddings
torch.save(sentence_embeddings, "sentence_embeddings.pt")
print("Sentence embeddings saved to 'sentence_embeddings.pt'")


# Generate full text embeddings with memory-efficient batching
print("Generating full text embeddings...")
BATCH_SIZE = 64  # Adjust based on your GPU memory capacity
df = pd.read_csv("/kaggle/input/train-processed-data/train_processed.csv")
# Process full text embeddings in small batches
full_text_embeddings = embedding_generator.batch_generate_embeddings(
    df["processed_full_text"], 
    pooling='cls',
    batch_size=BATCH_SIZE
)

torch.save(full_text_embeddings, "full_text_embeddings.pt")
print("Full text embeddings saved to 'full_text_embeddings.pt'")
# Labels are assumed to be in the "ai" column (1 for AI, 0 for human)

embedding_generator.save("embedding_generator.pt")


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using GPU for SpaCy
Loaded dataset with 47026 samples
Processing texts...


Processing texts:   0%|          | 0/47026 [00:00<?, ?it/s]

Found 0 documents with no sentences out of 47026 total
Preprocessing complete. Saved to 'train_processed.csv'.
Generating sentence embeddings...


Sentence Embeddings:   0%|          | 0/47026 [00:00<?, ?it/s]

Sentence embeddings saved to 'sentence_embeddings.pt'
Generating full text embeddings...


Processing batches:   0%|          | 0/735 [00:00<?, ?it/s]

Full text embeddings saved to 'full_text_embeddings.pt'
EmbeddingGenerator saved to embedding_generator.pt


In [3]:
def load_and_convert_embeddings( device, batch_size=1000):
    """Load and convert embeddings in memory-efficient batches with consistent dtype"""
    # 1. First load data to CPU
    # sentence_embeddings = torch.load('/kaggle/input/sentence-linear-embedding/sentence_embeddings.pt', map_location='cpu')
    # full_text_embeddings = torch.load(fulltext_path, map_location='cpu')
    
    global sentence_embeddings, full_text_embeddings
    # global full_text_embeddings
    
    num_docs = len(sentence_embeddings)
    
    # 2. Convert full text embeddings with explicit dtype
    full_tensor = torch.tensor(full_text_embeddings, dtype=torch.float32, device='cpu')
    
    # 3. Process sentence embeddings without padding
    sentence_list = []  # Will store variable-length tensors
    
    for i in tqdm(range(0, num_docs, batch_size), desc="Processing documents"):
        batch_end = min(i + batch_size, num_docs)
        
        # Process batch documents
        batch_sentences = []
        for j in range(i, batch_end):
            doc = sentence_embeddings[j]
            if len(doc) > 0:
                # Convert each document to a tensor with explicit dtype
                doc_tensor = torch.stack([
                    (sent if torch.is_tensor(sent) else torch.tensor(sent, dtype=torch.float32))
                    for sent in doc
                ])
                # Ensure float32
                doc_tensor = doc_tensor.to(torch.float32)
            else:
                # Empty document - create zero tensor with explicit dtype
                doc_tensor = torch.zeros(1, EMBED_DIM, dtype=torch.float32)
            
            batch_sentences.append(doc_tensor)
        
        sentence_list.extend(batch_sentences)
    
    return sentence_list, full_tensor

# Usage
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 500  # Adjust based on your GPU memory

sentence_list, full_tensor = load_and_convert_embeddings(
    device=device,
    batch_size=batch_size
)

# Example access:
print(f"Number of documents: {len(sentence_list)}")
print(f"First document shape: {sentence_list[0].shape}")  # [num_sentences, EMBED_DIM]
print(f"Full text embeddings shape: {full_tensor.shape}")  # [num_docs, EMBED_DIM]

Processing documents:   0%|          | 0/95 [00:00<?, ?it/s]

Number of documents: 47026
First document shape: torch.Size([19, 30])
Full text embeddings shape: torch.Size([47026, 30])


In [4]:
# def load_and_convert_embeddings(sentence_path, fulltext_path, device, batch_size=1000):
#     """Load and convert embeddings in memory-efficient batches with consistent dtype"""
#     # 1. First load data to CPU
#     sentence_embeddings = torch.load(sentence_path, map_location='cpu')
#     full_text_embeddings = torch.load(fulltext_path, map_location='cpu')
    
#     num_docs = len(sentence_embeddings)
    
#     # 2. Convert full text embeddings with explicit dtype
#     full_tensor = torch.tensor(full_text_embeddings, dtype=torch.float32, device='cpu')
    
#     # 3. Process sentence embeddings without padding
#     sentence_list = []  # Will store variable-length tensors
    
#     for i in tqdm(range(0, num_docs, batch_size), desc="Processing documents"):
#         batch_end = min(i + batch_size, num_docs)
        
#         # Process batch documents
#         batch_sentences = []
#         for j in range(i, batch_end):
#             doc = sentence_embeddings[j]
#             if len(doc) > 0:
#                 # Convert each document to a tensor with explicit dtype
#                 doc_tensor = torch.stack([
#                     (sent if torch.is_tensor(sent) else torch.tensor(sent, dtype=torch.float32))
#                     for sent in doc
#                 ])
#                 # Ensure float32
#                 doc_tensor = doc_tensor.to(torch.float32)
#             else:
#                 # Empty document - create zero tensor with explicit dtype
#                 doc_tensor = torch.zeros(1, EMBED_DIM, dtype=torch.float32)
            
#             batch_sentences.append(doc_tensor)
        
#         sentence_list.extend(batch_sentences)
    
#     return sentence_list, full_tensor

# # Usage
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# batch_size = 500  # Adjust based on your GPU memory

# sentence_list, full_tensor = load_and_convert_embeddings(
#     "/kaggle/input/sentence-linear-embedding/sentence_embeddings.pt",
#     "/kaggle/input/full-text-embedding/full_text_embeddings.pt",
#     device=device,
#     batch_size=batch_size
# )

# # Example access:
# print(f"Number of documents: {len(sentence_list)}")
# print(f"First document shape: {sentence_list[0].shape}")  # [num_sentences, EMBED_DIM]
# print(f"Full text embeddings shape: {full_tensor.shape}")  # [num_docs, EMBED_DIM]

In [5]:
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device count: {torch.cuda.device_count()}")
print(f"Current device: {torch.cuda.current_device()}")
torch.autograd.set_detect_anomaly(True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
df = pd.read_csv("/kaggle/input/combined-dataset-ai-human/combined_data.csv")
labels = df["ai"].values.astype(np.float32)
# sentence_embeddings = torch.load("/kaggle/input/sentence-embed/sentence_embeddings.pt", map_location="cuda")
# full_text_embeddings = torch.load("/kaggle/input/full-test-emd/full_text_embeddings.pt", map_location="cuda")
sentence_embeddings=sentence_list
full_text_embeddings=full_tensor
# Create the dataset
dataset = TextDataset(sentence_embeddings, full_text_embeddings, labels)
# Perform an 80:20 train/test split (using stratification for balanced classes)
indices = list(range(len(dataset)))
train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42, stratify=labels)
train_dataset = Subset(dataset, train_idx)
test_dataset = Subset(dataset, test_idx)

# Configure data loaders with custom collate function to handle variable-length sentence embeddings
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=collate_fn)
# Initialize and train the ensemble model
try:
    model = WeightedEnsemble().to(device)
    print("Model successfully moved to device")  
except Exception as e:
    print(f"Error initializing model: {e}")


try:
    print("Starting model training...")
    train_model(model, train_dataloader, epochs=50, lr=0.001)
    # Save the trained model
    torch.save(model.state_dict(), "ai_detector_model.pt")
    print("Model saved to 'ai_detector_model.pt'")
except Exception as e:
    print(f"Error in Training the model: {e}")
    


CUDA available: True
Device count: 2
Current device: 0
Model successfully moved to device
Starting model training...


Epoch 1/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 1/50, Loss: 0.5900, Accuracy: 70.39%


Epoch 2/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 2/50, Loss: 0.5388, Accuracy: 73.87%


Epoch 3/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 3/50, Loss: 0.4763, Accuracy: 77.34%


Epoch 4/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 4/50, Loss: 0.4425, Accuracy: 79.47%


Epoch 5/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 5/50, Loss: 0.4208, Accuracy: 81.07%


Epoch 6/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 6/50, Loss: 0.4142, Accuracy: 81.25%


Epoch 7/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 7/50, Loss: 0.4071, Accuracy: 81.87%


Epoch 8/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 8/50, Loss: 0.4011, Accuracy: 82.24%


Epoch 9/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 9/50, Loss: 0.3925, Accuracy: 82.81%


Epoch 10/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 10/50, Loss: 0.3913, Accuracy: 82.82%


Epoch 11/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 11/50, Loss: 0.3872, Accuracy: 83.05%


Epoch 12/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 12/50, Loss: 0.3782, Accuracy: 83.62%


Epoch 13/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 13/50, Loss: 0.3805, Accuracy: 83.35%


Epoch 14/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 14/50, Loss: 0.3754, Accuracy: 83.96%


Epoch 15/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 15/50, Loss: 0.3743, Accuracy: 83.72%


Epoch 16/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 16/50, Loss: 0.3754, Accuracy: 83.62%


Epoch 17/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 17/50, Loss: 0.3727, Accuracy: 83.80%


Epoch 18/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 18/50, Loss: 0.3696, Accuracy: 84.11%


Epoch 19/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 19/50, Loss: 0.3680, Accuracy: 84.26%


Epoch 20/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 20/50, Loss: 0.3653, Accuracy: 84.45%


Epoch 21/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 21/50, Loss: 0.3677, Accuracy: 84.25%


Epoch 22/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 22/50, Loss: 0.3653, Accuracy: 84.28%


Epoch 23/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 23/50, Loss: 0.3627, Accuracy: 84.35%


Epoch 24/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 24/50, Loss: 0.3667, Accuracy: 84.17%


Epoch 25/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 25/50, Loss: 0.3624, Accuracy: 84.71%


Epoch 26/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 26/50, Loss: 0.3624, Accuracy: 84.38%


Epoch 27/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 27/50, Loss: 0.3608, Accuracy: 84.62%


Epoch 28/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 28/50, Loss: 0.3639, Accuracy: 84.38%


Epoch 29/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 29/50, Loss: 0.3609, Accuracy: 84.64%


Epoch 30/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 30/50, Loss: 0.3658, Accuracy: 84.31%


Epoch 31/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 31/50, Loss: 0.3559, Accuracy: 84.89%


Epoch 32/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 32/50, Loss: 0.3550, Accuracy: 84.95%


Epoch 33/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 33/50, Loss: 0.3545, Accuracy: 84.95%


Epoch 34/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 34/50, Loss: 0.3572, Accuracy: 84.79%


Epoch 35/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 35/50, Loss: 0.3538, Accuracy: 84.73%


Epoch 36/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 36/50, Loss: 0.3546, Accuracy: 84.96%


Epoch 37/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 37/50, Loss: 0.3543, Accuracy: 84.75%


Epoch 38/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 38/50, Loss: 0.3518, Accuracy: 85.02%


Epoch 39/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 39/50, Loss: 0.3561, Accuracy: 84.87%


Epoch 40/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 40/50, Loss: 0.3517, Accuracy: 85.14%


Epoch 41/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 41/50, Loss: 0.3533, Accuracy: 84.91%


Epoch 42/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 42/50, Loss: 0.3494, Accuracy: 85.34%


Epoch 43/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 43/50, Loss: 0.3496, Accuracy: 85.16%


Epoch 44/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 44/50, Loss: 0.3462, Accuracy: 85.42%


Epoch 45/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 45/50, Loss: 0.3485, Accuracy: 85.23%


Epoch 46/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 46/50, Loss: 0.3474, Accuracy: 85.32%


Epoch 47/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 47/50, Loss: 0.3437, Accuracy: 85.42%


Epoch 48/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 48/50, Loss: 0.3466, Accuracy: 85.32%


Epoch 49/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 49/50, Loss: 0.3486, Accuracy: 85.27%


Epoch 50/50:   0%|          | 0/294 [00:00<?, ?it/s]

Epoch 50/50, Loss: 0.3455, Accuracy: 85.45%
Training complete.
Model saved to 'ai_detector_model.pt'


In [6]:
# Evaluate the model on the test set
print("Evaluating model on test set...")
results = evaluate_model(model, test_dataloader)



Evaluating model on test set...


Evaluating:   0%|          | 0/74 [00:00<?, ?it/s]

Test Results:
Accuracy: 0.8609
F1 Score: 0.8610
AUC-ROC: 0.9325


In [7]:
loaded_embedding_gen = EmbeddingGenerator.load("/kaggle/working/embedding_generator.pt")

# Generate embedding for a single sentence
test_sentence = "This is a sample sentence for testing."
embedding = loaded_embedding_gen.get_embedding(test_sentence)

  save_dict = torch.load(save_path, map_location=torch.device('cpu'))
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EmbeddingGenerator loaded from /kaggle/working/embedding_generator.pt
