In [7]:
%load_ext autoreload
%autoreload 2

experiment1 implements a BERT encoder-only architecture

below is the sample multi-head attention mechanism

In [8]:
# Transformer Bot Detection System - Cresci-2017 Dataset
# Implementation of BERT-style encoder-only architecture for Twitter bot detection

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
import math
import json
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


FileNotFoundError: [Errno 2] No such file or directory

Attention(Q, K, V) = softmax((QK^T)/sqrt(d_K)) * V

Q, K, V are queries, keys, values derived from the same source in self-attention.
Results in valuesvalues: the weighted sum for each position and head.
Softmax ensures the attention weights sum to 1.
If masking, irrelevant positions (like future tokens or padding) get large negative values in logits, so after softmax attention there is 0.

In [4]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    # (batch, heads, seq_len, head_dim) @ (batch, heads, head_dim, seq_len) --> (batch, heads, seq_len, seq_len)
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    if mask is not None:
        scaled += mask
    attention = F.softmax(scaled, dim=-1)
    # (batch, heads, seq_len, seq_len) @ (batch, heads, seq_len, head_dim) --> (batch, heads, seq_len, head_dim)
    values = torch.matmul(attention, v)
    return values, attention

Multi-Head Attention Class
Every step mimics the original Transformer:

Project to QKV,
Reshape for multiple heads,
Split into Q, K, V,
Compute attention,
Concatenate heads,
Linear output.

In [None]:
# Updated Multi-Head Attention (cleaned up for production use)
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        
        self.qkv_projection = nn.Linear(d_model, 3 * d_model)
        self.output_projection = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask=None):
        batch_size, seq_len, d_model = x.size()
        
        # Project to Q, K, V
        qkv = self.qkv_projection(x)
        qkv = qkv.reshape(batch_size, seq_len, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)  # (batch, heads, seq_len, 3*head_dim)
        
        q, k, v = qkv.chunk(3, dim=-1)  # Each: (batch, heads, seq_len, head_dim)
        
        # Scaled dot-product attention
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
            
        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        
        # Apply attention to values
        attended_values = torch.matmul(attention_weights, v)
        
        # Concatenate heads
        attended_values = attended_values.permute(0, 2, 1, 3).contiguous()
        attended_values = attended_values.reshape(batch_size, seq_len, d_model)
        
        # Final projection
        output = self.output_projection(attended_values)
        return output

In [None]:
# Feed Forward Network
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        return self.linear2(self.dropout(F.relu(self.linear1(x))))

# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length=5000):
        super().__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * 
                           (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        return x + self.pe[:x.size(0), :]

# Transformer Encoder Layer
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask=None):
        # Self-attention with residual connection and layer norm
        attn_output = self.self_attention(x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        # Feed forward with residual connection and layer norm
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        
        return x

In [None]:
# Simple Tokenizer for Twitter Text
class TwitterTokenizer:
    def __init__(self, vocab_size=30000):
        self.vocab_size = vocab_size
        self.word_to_idx = {}
        self.idx_to_word = {}
        self.word_counts = Counter()
        
        # Special tokens
        self.PAD_TOKEN = '[PAD]'
        self.UNK_TOKEN = '[UNK]'
        self.CLS_TOKEN = '[CLS]'
        self.SEP_TOKEN = '[SEP]'
        self.URL_TOKEN = '[URL]'
        self.MENTION_TOKEN = '[MENTION]'
        
        self.special_tokens = [
            self.PAD_TOKEN, self.UNK_TOKEN, self.CLS_TOKEN, 
            self.SEP_TOKEN, self.URL_TOKEN, self.MENTION_TOKEN
        ]
        
    def preprocess_text(self, text):
        """Preprocess Twitter text"""
        if not isinstance(text, str):
            return ""
            
        # Convert to lowercase
        text = text.lower()
        
        # Replace URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 
                     self.URL_TOKEN, text)
        
        # Replace mentions
        text = re.sub(r'@\w+', self.MENTION_TOKEN, text)
        
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    def tokenize(self, text):
        """Simple whitespace tokenization"""
        processed_text = self.preprocess_text(text)
        return processed_text.split()
    
    def build_vocab(self, texts):
        """Build vocabulary from texts"""
        print("Building vocabulary...")
        
        # Count words
        for text in tqdm(texts):
            tokens = self.tokenize(text)
            self.word_counts.update(tokens)
        
        # Add special tokens first
        for token in self.special_tokens:
            self.word_to_idx[token] = len(self.word_to_idx)
            self.idx_to_word[len(self.idx_to_word)] = token
        
        # Add most frequent words
        most_common = self.word_counts.most_common(self.vocab_size - len(self.special_tokens))
        for word, count in most_common:
            if word not in self.word_to_idx:
                self.word_to_idx[word] = len(self.word_to_idx)
                self.idx_to_word[len(self.idx_to_word)] = word
        
        print(f"Vocabulary size: {len(self.word_to_idx)}")
        return self
    
    def encode(self, text, max_length=256):
        """Convert text to token ids"""
        tokens = self.tokenize(text)
        
        # Add CLS token at beginning
        token_ids = [self.word_to_idx[self.CLS_TOKEN]]
        
        # Add text tokens
        for token in tokens[:max_length-2]:  # Reserve space for CLS and SEP
            token_ids.append(self.word_to_idx.get(token, self.word_to_idx[self.UNK_TOKEN]))
        
        # Add SEP token
        token_ids.append(self.word_to_idx[self.SEP_TOKEN])
        
        # Pad to max_length
        while len(token_ids) < max_length:
            token_ids.append(self.word_to_idx[self.PAD_TOKEN])
            
        return token_ids[:max_length]
    
    def create_attention_mask(self, token_ids):
        """Create attention mask (1 for real tokens, 0 for padding)"""
        return [1 if token_id != self.word_to_idx[self.PAD_TOKEN] else 0 for token_id in token_ids]

In [None]:
# Complete Transformer Model for Bot Detection
class BotDetectionTransformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        
        # Embedding layers
        self.token_embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.position_encoding = PositionalEncoding(config.d_model, config.max_seq_length)
        
        # Transformer encoder layers
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(
                config.d_model, 
                config.num_heads, 
                config.d_ff, 
                config.dropout
            ) for _ in range(config.num_layers)
        ])
        
        # Classification head
        self.layer_norm = nn.LayerNorm(config.d_model)
        self.dropout = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(config.d_model, config.num_classes)
        
        # Initialize weights
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        """Initialize weights following BERT-style initialization"""
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)
    
    def create_padding_mask(self, input_ids, pad_token_id=0):
        """Create padding mask for attention"""
        return (input_ids != pad_token_id).unsqueeze(1).unsqueeze(2)
    
    def forward(self, input_ids, attention_mask=None):
        batch_size, seq_len = input_ids.size()
        
        # Token embeddings
        embeddings = self.token_embedding(input_ids)
        embeddings = embeddings * math.sqrt(self.config.d_model)  # Scale embeddings
        
        # Add positional encoding
        embeddings = self.position_encoding(embeddings.transpose(0, 1)).transpose(0, 1)
        
        # Create attention mask if not provided
        if attention_mask is not None:
            # Convert to proper format for attention
            mask = attention_mask.unsqueeze(1).unsqueeze(2).float()
            mask = (1.0 - mask) * -1e9
        else:
            mask = None
        
        # Pass through encoder layers
        hidden_states = embeddings
        for layer in self.encoder_layers:
            hidden_states = layer(hidden_states, mask)
        
        # Get [CLS] token representation for classification
        cls_representation = hidden_states[:, 0, :]  # First token is [CLS]
        
        # Classification
        cls_representation = self.layer_norm(cls_representation)
        cls_representation = self.dropout(cls_representation)
        logits = self.classifier(cls_representation)
        
        return logits

# Test the model
print("Testing model architecture...")
test_model = BotDetectionTransformer(config).to(config.device)
test_input = torch.randint(0, config.vocab_size, (2, config.max_seq_length)).to(config.device)
test_mask = torch.ones_like(test_input).to(config.device)

with torch.no_grad():
    test_output = test_model(test_input, test_mask)
    print(f"Model output shape: {test_output.shape}")
    print(f"Total parameters: {sum(p.numel() for p in test_model.parameters()):,}")

del test_model, test_input, test_mask  # Clean up memory

In [None]:
# Dataset Class for Cresci-2017
class TwitterBotDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # Tokenize
        token_ids = self.tokenizer.encode(text, self.max_length)
        attention_mask = self.tokenizer.create_attention_mask(token_ids)
        
        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Data Loading and Preprocessing Functions
def load_cresci_data_demo():
    \"\"\"
    Demo function to create synthetic data in Cresci-2017 format.
    Replace this with actual Cresci-2017 data loading.
    \"\"\"
    print("Creating demo dataset (replace with actual Cresci-2017 data loading)")
    
    # Synthetic bot tweets (common bot patterns)
    bot_tweets = [
        "Follow me for amazing deals! #sponsored #ad #promotion",
        "Click here for free money! Link in bio #scam #fake",
        "RT @randomuser: Buy now! Limited time offer!!!",
        "Amazing product! Everyone should buy this! #ad #promotion",
        "Free followers! Click the link! #followers #fake",
        "Best deals ever! Don't miss out! RT if you agree!",
        "Automatic retweet service available! DM for details",
        "Buy cheap followers and likes! Fast delivery guaranteed!",
        "Promoting amazing products! Check my timeline! #ad",
        "RT @sponsor: Limited time sale! Buy now or regret later!"
    ] * 100  # Repeat to create more samples
    
    # Synthetic human tweets (more natural patterns)
    human_tweets = [
        "Just had an amazing coffee at my local cafe ☕",
        "Working from home today, feeling productive!",
        "Can't wait for the weekend! Anyone have fun plans?",
        "Just finished reading a great book, highly recommend it",
        "Weather is beautiful today, perfect for a walk",
        "Cooking dinner for my family tonight, trying new recipe",
        "Great conversation with friends over lunch today",
        "Learning something new every day, love continuous growth",
        "Watching a documentary about ocean life, so fascinating",
        "Planning my next vacation, so many places to explore"
    ] * 100  # Repeat to create more samples
    
    # Create labels (0 = human, 1 = bot)
    texts = human_tweets + bot_tweets
    labels = [0] * len(human_tweets) + [1] * len(bot_tweets)
    
    # Create DataFrame
    df = pd.DataFrame({
        'text': texts,
        'label': labels,
        'account_type': ['human' if l == 0 else 'bot' for l in labels]
    })
    
    print(f"Created dataset with {len(df)} samples")
    print(f"Label distribution: {df['label'].value_counts().to_dict()}")
    
    return df

def create_account_level_splits(df, test_size=0.2, val_size=0.1, random_state=42):
    \"\"\"
    Create account-level splits to prevent data leakage.
    In real Cresci-2017, you would group by account_id and split at account level.
    \"\"\"
    print("Creating data splits...")
    
    # For demo, we'll simulate account-level splitting
    # In real implementation, group by account_id first
    unique_indices = df.index.tolist()
    
    # Split indices (simulating account-level split)
    train_idx, temp_idx = train_test_split(
        unique_indices, test_size=test_size + val_size, 
        random_state=random_state, stratify=df.loc[unique_indices, 'label']
    )
    
    val_idx, test_idx = train_test_split(
        temp_idx, test_size=test_size / (test_size + val_size),
        random_state=random_state, stratify=df.loc[temp_idx, 'label']
    )
    
    train_df = df.loc[train_idx].reset_index(drop=True)
    val_df = df.loc[val_idx].reset_index(drop=True)
    test_df = df.loc[test_idx].reset_index(drop=True)
    
    print(f"Train size: {len(train_df)}")
    print(f"Validation size: {len(val_df)}")
    print(f"Test size: {len(test_df)}")
    
    return train_df, val_df, test_df

# Load and prepare data
print("Loading Cresci-2017 dataset...")
df = load_cresci_data_demo()

# Create splits
train_df, val_df, test_df = create_account_level_splits(df, config.test_size, config.val_size)

# Build tokenizer on training data
tokenizer = TwitterTokenizer(vocab_size=config.vocab_size)
tokenizer.build_vocab(train_df['text'].tolist())

print(f"\\nVocabulary statistics:")
print(f"- Total unique words in training: {len(tokenizer.word_counts)}")
print(f"- Vocabulary size: {len(tokenizer.word_to_idx)}")
print(f"- Most common words: {list(tokenizer.word_counts.most_common(10))}")

# Create datasets
train_dataset = TwitterBotDataset(
    train_df['text'].tolist(), 
    train_df['label'].tolist(), 
    tokenizer, 
    config.max_seq_length
)

val_dataset = TwitterBotDataset(
    val_df['text'].tolist(), 
    val_df['label'].tolist(), 
    tokenizer, 
    config.max_seq_length
)

test_dataset = TwitterBotDataset(
    test_df['text'].tolist(), 
    test_df['label'].tolist(), 
    tokenizer, 
    config.max_seq_length
)

# Create data loaders
train_loader = DataLoader(
    train_dataset, 
    batch_size=config.batch_size, 
    shuffle=True,
    num_workers=0  # Set to 0 for compatibility
)

val_loader = DataLoader(
    val_dataset, 
    batch_size=config.batch_size, 
    shuffle=False,
    num_workers=0
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=config.batch_size, 
    shuffle=False,
    num_workers=0
)

print(f"\\nData loaders created:")
print(f"- Train batches: {len(train_loader)}")
print(f"- Val batches: {len(val_loader)}")
print(f"- Test batches: {len(test_loader)}")

In [None]:
# Configuration and Hyperparameters
class Config:
    # Model Architecture
    d_model = 512          # Embedding dimension
    num_layers = 10         # Number of transformer layers
    num_heads = 12          # Number of attention heads
    d_ff = 2048           # Feed forward hidden dimension
    dropout = 0.1         # Dropout rate
    max_seq_length = 256  # Maximum sequence length
    vocab_size = 30000    # Vocabulary size
    
    # Training Parameters
    batch_size = 32
    learning_rate = 2e-5
    warmup_steps = 500
    max_epochs = 10
    gradient_clip_norm = 1.0
    
    # Data
    num_classes = 2       # Binary: bot vs human
    test_size = 0.2
    val_size = 0.1
    
    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config = Config()
print(f"Using device: {config.device}")
print(f"Model configuration: d_model={config.d_model}, layers={config.num_layers}, heads={config.num_heads}")

example:

In [6]:
# Model/inputs setup
input_dim = 1024   # Input feature size per token
d_model = 512      # Embedding/model size (must divide num_heads)
num_heads = 8
batch_size = 30
sequence_length = 5

# Create random input
x = torch.randn((batch_size, sequence_length, input_dim))

# Instantiate MultiheadAttention class and run
model = MultiheadAttention(input_dim, d_model, num_heads)
output = model.forward(x)
output

x.size(): torch.Size([30, 5, 1024])
qkv.size(): torch.Size([30, 5, 1536])
qkv.size(): torch.Size([30, 5, 8, 192])
qkv.size(): torch.Size([30, 8, 5, 192])
q size: torch.Size([30, 8, 5, 64]), k size: torch.Size([30, 8, 5, 64]), v size: torch.Size([30, 8, 5, 64])
values.size(): torch.Size([30, 8, 5, 64]), attention.size: torch.Size([30, 8, 5, 5])
values.size(): torch.Size([30, 5, 512])
out.size(): torch.Size([30, 5, 512])


tensor([[[ 0.0041, -0.1982,  0.0871,  ...,  0.1250, -0.1515, -0.0741],
         [ 0.0665, -0.0146, -0.2594,  ...,  0.1805,  0.1107,  0.0254],
         [-0.2314, -0.1776,  0.3157,  ..., -0.1198,  0.1174,  0.1548],
         [-0.0467, -0.0217, -0.1965,  ...,  0.0164,  0.0309,  0.3131],
         [-0.3640,  0.2524,  0.1880,  ..., -0.3888,  0.2219, -0.2219]],

        [[ 0.0195, -0.1414,  0.0139,  ..., -0.1696,  0.1596,  0.4023],
         [ 0.1171,  0.1215,  0.0429,  ...,  0.2675,  0.1349, -0.2392],
         [-0.0793,  0.0033,  0.2842,  ..., -0.0579, -0.1722, -0.3303],
         [ 0.0121, -0.1976, -0.3343,  ...,  0.0189,  0.2305, -0.1511],
         [-0.2417, -0.2875,  0.2639,  ..., -0.2358,  0.2205,  0.0966]],

        [[ 0.0671, -0.0536, -0.1049,  ...,  0.1113, -0.0619,  0.3027],
         [ 0.1294,  0.0100,  0.0183,  ...,  0.0975,  0.0381, -0.1794],
         [-0.0598, -0.0061, -0.0928,  ..., -0.3921, -0.2349,  0.3383],
         [-0.1280,  0.2926,  0.0364,  ..., -0.0779, -0.2042,  0.2748],
  