# Experiment 3: Cresci-2017 Data Loading & Neural Network Architecture
Integration of real dataset with optimized transformer architecture

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
import math
from pathlib import Path
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


## Cresci-2017 Dataset Structure Analysis

In [2]:
# Dataset metadata from the paper
CRESCI_DATASETS = {
    'genuine_accounts': {'accounts': 3474, 'tweets': 8377522, 'year': 2011, 'label': 0},
    'social_spambots_1': {'accounts': 991, 'tweets': 1610176, 'year': 2012, 'label': 1},
    'social_spambots_2': {'accounts': 3457, 'tweets': 428542, 'year': 2014, 'label': 2}, 
    'social_spambots_3': {'accounts': 464, 'tweets': 1418626, 'year': 2011, 'label': 3},
    'traditional_spambots_1': {'accounts': 1000, 'tweets': 145094, 'year': 2009, 'label': 4},
    'traditional_spambots_2': {'accounts': 100, 'tweets': 74957, 'year': 2014, 'label': 5},
    'traditional_spambots_3': {'accounts': 433, 'tweets': 5794931, 'year': 2013, 'label': 6},
    'traditional_spambots_4': {'accounts': 1128, 'tweets': 133311, 'year': 2009, 'label': 7},
    'fake_followers': {'accounts': 3351, 'tweets': 196027, 'year': 2012, 'label': 8}
}

# Calculate distribution
total_accounts = sum(d['accounts'] for d in CRESCI_DATASETS.values())
total_tweets = sum(d['tweets'] for d in CRESCI_DATASETS.values())

print("CRESCI-2017 DATASET BREAKDOWN")
print("=" * 50)
print(f"Total accounts: {total_accounts:,}")
print(f"Total tweets: {total_tweets:,}")
print(f"\nClass distribution:")

for name, data in CRESCI_DATASETS.items():
    acc_pct = data['accounts'] / total_accounts * 100
    tweet_pct = data['tweets'] / total_tweets * 100
    print(f"  {name:<22}: {data['accounts']:>5} acc ({acc_pct:>4.1f}%) | {data['tweets']:>8} tweets ({tweet_pct:>4.1f}%)")

print(f"\nClass imbalance issues:")
print(f"  Largest: genuine_accounts (8.3M tweets, 47.1%)")
print(f"  Smallest: traditional_spambots_2 (75K tweets, 0.4%)")
print(f"  Imbalance ratio: {8377522/74957:.1f}:1")

CRESCI-2017 DATASET BREAKDOWN
Total accounts: 14,398
Total tweets: 18,179,186

Class distribution:
  genuine_accounts      :  3474 acc (24.1%) |  8377522 tweets (46.1%)
  social_spambots_1     :   991 acc ( 6.9%) |  1610176 tweets ( 8.9%)
  social_spambots_2     :  3457 acc (24.0%) |   428542 tweets ( 2.4%)
  social_spambots_3     :   464 acc ( 3.2%) |  1418626 tweets ( 7.8%)
  traditional_spambots_1:  1000 acc ( 6.9%) |   145094 tweets ( 0.8%)
  traditional_spambots_2:   100 acc ( 0.7%) |    74957 tweets ( 0.4%)
  traditional_spambots_3:   433 acc ( 3.0%) |  5794931 tweets (31.9%)
  traditional_spambots_4:  1128 acc ( 7.8%) |   133311 tweets ( 0.7%)
  fake_followers        :  3351 acc (23.3%) |   196027 tweets ( 1.1%)

Class imbalance issues:
  Largest: genuine_accounts (8.3M tweets, 47.1%)
  Smallest: traditional_spambots_2 (75K tweets, 0.4%)
  Imbalance ratio: 111.8:1


## Strategic Data Loading Pipeline

In [None]:
class CresciDataLoader:
    def __init__(self, data_root="../datasets/datasets_full.csv/"):
        self.data_root = Path(data_root)
        self.datasets = CRESCI_DATASETS
        
    def load_dataset_tweets(self, dataset_name, sample_size=None):
        """Load tweets from a specific dataset with optional sampling"""
        tweet_path = self.data_root / f"{dataset_name}.csv" / "tweets.csv"
        
        if not tweet_path.exists():
            print(f"Warning: {tweet_path} not found")
            return pd.DataFrame()
        
        try:
            df = pd.read_csv(tweet_path)
            
            # Sample if specified (for managing class imbalance)
            if sample_size and len(df) > sample_size:
                df = df.sample(n=sample_size, random_state=42)
            
            # Add metadata
            df['dataset'] = dataset_name
            df['binary_label'] = 0 if dataset_name == 'genuine_accounts' else 1  # Human=0, Bot=1
            df['multiclass_label'] = self.datasets[dataset_name]['label']
            df['year'] = self.datasets[dataset_name]['year']
            
            print(f"Loaded {len(df):,} tweets from {dataset_name}")
            return df
            
        except Exception as e:
            print(f"Error loading {dataset_name}: {e}")
            return pd.DataFrame()
    
    def create_balanced_dataset(self, strategy='undersample', max_per_class=50000):
        """Create balanced dataset using different strategies"""
        all_data = []
        
        print(f"\nCreating balanced dataset with strategy: {strategy}")
        print(f"Max samples per class: {max_per_class:,}")
        
        for dataset_name in self.datasets.keys():
            # Load with sampling to balance classes
            if strategy == 'undersample':
                sample_size = min(max_per_class, self.datasets[dataset_name]['tweets'])
            else:  # 'full'
                sample_size = None
                
            df = self.load_dataset_tweets(dataset_name, sample_size)
            
            if not df.empty:
                all_data.append(df)
        
        if not all_data:
            print("No data loaded!")
            return pd.DataFrame()
        
        # Combine all datasets
        combined_df = pd.concat(all_data, ignore_index=True)
        
        # Clean text column
        if 'text' not in combined_df.columns:
            print("Available columns:", combined_df.columns.tolist())
            return combined_df
        
        # Remove empty tweets
        combined_df = combined_df.dropna(subset=['text'])
        combined_df = combined_df[combined_df['text'].str.strip() != '']
        
        print(f"\nFinal dataset: {len(combined_df):,} tweets")
        print(f"Binary distribution:")
        print(combined_df['binary_label'].value_counts().sort_index())
        print(f"\nMulticlass distribution:")
        print(combined_df.groupby(['dataset', 'multiclass_label']).size())
        
        return combined_df

# Initialize data loader
data_loader = CresciDataLoader()

# Create balanced dataset (start small for testing)
cresci_df = data_loader.create_balanced_dataset(strategy='undersample', max_per_class=10000)

## Optimized Transformer Architecture

In [None]:
class OptimalBotConfig:
    # Architecture (from our optimization)
    d_model = 512
    num_layers = 9
    num_heads = 12
    d_ff = 2048
    dropout = 0.15
    max_seq_length = 128
    
    # Tasks
    num_binary_classes = 2      # Human vs Bot
    num_multiclass_classes = 9  # All categories
    
    # Training
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config = OptimalBotConfig()

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        
        self.qkv_projection = nn.Linear(d_model, 3 * d_model)
        self.output_projection = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask=None):
        batch_size, seq_len, d_model = x.size()
        
        # Project and reshape for multi-head attention
        qkv = self.qkv_projection(x)
        qkv = qkv.reshape(batch_size, seq_len, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        
        q, k, v = qkv.chunk(3, dim=-1)
        
        # Scaled dot-product attention
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
            
        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        
        attended_values = torch.matmul(attention_weights, v)
        attended_values = attended_values.permute(0, 2, 1, 3).contiguous()
        attended_values = attended_values.reshape(batch_size, seq_len, d_model)
        
        output = self.output_projection(attended_values)
        return output

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        return self.linear2(self.dropout(F.relu(self.linear1(x))))

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask=None):
        # Self-attention with residual connection
        attn_output = self.self_attention(x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        # Feed forward with residual connection
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        
        return x

print(f"Architecture components defined for {config.d_model}d model with {config.num_heads} heads")

## Complete Bot Detection Model

In [None]:
class OptimizedBotDetector(nn.Module):
    def __init__(self, config, vocab_size=50265):  # RoBERTa vocab size
        super().__init__()
        self.config = config
        
        # Embeddings (we'll replace with pre-trained later)
        self.token_embedding = nn.Embedding(vocab_size, config.d_model)
        self.position_embedding = nn.Embedding(config.max_seq_length, config.d_model)
        
        # Transformer layers
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(
                config.d_model, 
                config.num_heads, 
                config.d_ff, 
                config.dropout
            ) for _ in range(config.num_layers)
        ])
        
        # Classification heads
        self.layer_norm = nn.LayerNorm(config.d_model)
        self.dropout = nn.Dropout(config.dropout)
        
        # Binary classification (Human vs Bot)
        self.binary_classifier = nn.Linear(config.d_model, config.num_binary_classes)
        
        # Multiclass classification (Bot types) - for future use
        self.multiclass_classifier = nn.Linear(config.d_model, config.num_multiclass_classes)
        
        # Initialize weights
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)
    
    def forward(self, input_ids, attention_mask=None, task='binary'):
        batch_size, seq_len = input_ids.size()
        device = input_ids.device
        
        # Embeddings
        token_embeddings = self.token_embedding(input_ids)
        position_ids = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, -1)
        position_embeddings = self.position_embedding(position_ids)
        
        embeddings = token_embeddings + position_embeddings
        embeddings = embeddings * math.sqrt(self.config.d_model)  # Scale
        
        # Create attention mask for padding
        if attention_mask is not None:
            # Convert to format expected by attention layers
            extended_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            extended_mask = (1.0 - extended_mask) * -1e9
        else:
            extended_mask = None
        
        # Pass through transformer layers
        hidden_states = embeddings
        for layer in self.encoder_layers:
            hidden_states = layer(hidden_states, extended_mask)
        
        # Get [CLS] token representation (first token)
        cls_representation = hidden_states[:, 0, :]
        cls_representation = self.layer_norm(cls_representation)
        cls_representation = self.dropout(cls_representation)
        
        # Classification based on task
        if task == 'binary':
            logits = self.binary_classifier(cls_representation)
        elif task == 'multiclass':
            logits = self.multiclass_classifier(cls_representation)
        else:
            # Return both for multi-task training (future)
            return {
                'binary_logits': self.binary_classifier(cls_representation),
                'multiclass_logits': self.multiclass_classifier(cls_representation)
            }
        
        return logits
    
    def get_num_parameters(self):
        total_params = sum(p.numel() for p in self.parameters())
        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return total_params, trainable_params

# Create model
model = OptimizedBotDetector(config)
model = model.to(config.device)

total_params, trainable_params = model.get_num_parameters()
print(f"\nModel created:")
print(f"  Parameters: {total_params:,} total ({trainable_params:,} trainable)")
print(f"  Architecture: {config.d_model}d × {config.num_layers}L × {config.num_heads}H")
print(f"  Device: {config.device}")

# Test forward pass
test_input = torch.randint(0, 1000, (2, config.max_seq_length), device=config.device)
test_mask = torch.ones_like(test_input, device=config.device)

with torch.no_grad():
    test_output = model(test_input, test_mask, task='binary')
    print(f"\nTest forward pass: {test_input.shape} → {test_output.shape}")
    print(f"Output range: [{test_output.min().item():.3f}, {test_output.max().item():.3f}]")

## Integration with Subword Tokenizer

In [None]:
# Use Twitter-RoBERTa tokenizer from experiment2
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base")

class CresciDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128, task='binary'):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.task = task
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        
        if self.task == 'binary':
            label = self.labels['binary'][idx]
        else:
            label = self.labels['multiclass'][idx]
        
        # Tokenize
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Test with real data if available
if not cresci_df.empty and 'text' in cresci_df.columns:
    sample_texts = cresci_df['text'].head(100).tolist()
    sample_labels = {
        'binary': cresci_df['binary_label'].head(100).tolist(),
        'multiclass': cresci_df['multiclass_label'].head(100).tolist()
    }
    
    # Create dataset
    test_dataset = CresciDataset(sample_texts, sample_labels, tokenizer, task='binary')
    test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=True)
    
    # Test batch
    sample_batch = next(iter(test_dataloader))
    print(f"\nReal data batch test:")
    print(f"  Input shape: {sample_batch['input_ids'].shape}")
    print(f"  Labels: {sample_batch['labels'][:5].tolist()}")
    
    # Test model prediction
    with torch.no_grad():
        batch_input = sample_batch['input_ids'].to(config.device)
        batch_mask = sample_batch['attention_mask'].to(config.device)
        predictions = model(batch_input, batch_mask, task='binary')
        probabilities = F.softmax(predictions, dim=1)
        
    print(f"  Predictions shape: {predictions.shape}")
    print(f"  Sample probabilities: {probabilities[:3].cpu().numpy()}")
else:
    print("\nNo real data available - using synthetic test")
    # Synthetic test
    test_texts = [
        "Just had amazing coffee this morning ☕",
        "Follow for follow! F4F! #followback #follow",
        "Beautiful weather today, perfect for a walk", 
        "URGENT! Free money! Click now! #scam #fake"
    ]
    test_labels = {'binary': [0, 1, 0, 1], 'multiclass': [0, 2, 0, 5]}
    
    synthetic_dataset = CresciDataset(test_texts, test_labels, tokenizer)
    synthetic_loader = DataLoader(synthetic_dataset, batch_size=2)
    
    sample_batch = next(iter(synthetic_loader))
    print(f"Synthetic batch test successful: {sample_batch['input_ids'].shape}")