# model architecture
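An optimized transformer for bot detection: model configuration, subword tokenization, a multi-task classification head, and a parameter-count comparison.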

In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## configuration

In [None]:
class ModelConfig:
    """Hyperparameters for the bot-detection transformer.

    Every setting is a class attribute, so it can be read either from the
    class itself or from an instance (``config = ModelConfig()``).
    """

    d_model = 512  # hidden / embedding width
    num_layers = 9  # number of stacked transformer layers
    # Multi-head attention splits d_model evenly across heads, so num_heads
    # must divide d_model. The earlier value of 12 does not divide 512;
    # 16 heads gives head_dim = 512 / 16 = 32.
    num_heads = 16
    d_ff = d_model * 4  # feed-forward inner width
    # NOTE(review): this must be >= the vocabulary size of the tokenizer that
    # is actually used, otherwise embedding lookups index out of range.
    # Confirm against len(tokenizer).
    vocab_size = 50000  # subword tokenization
    max_seq_length = 128  # tweet-length inputs
    dropout = 0.15  # on the high side, to limit overfitting
    num_classes = 2  # bot or person
    # Planned to run on EC2; the GPU is picked automatically when visible.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Fail at definition time if someone edits the two values out of sync.
    if d_model % num_heads != 0:
        raise ValueError(
            f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
        )

# Shared configuration object used by the cells below, plus a short summary
config = ModelConfig()
print(
    f"Device: {config.device}",
    f"Model: {config.d_model}d, {config.num_layers}L, {config.num_heads}H",
    sep="\n",
)

Device: cpu
Model: 512d, 9L, 12H


## option 1: subword tokenization

In [None]:
# Load the pre-trained, Twitter-aware subword tokenizer
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base")

# Sample tweet with a hashtag, a mention and a link
test_text = "Follow me for amazing deals! #sponsored @user http://bit.ly/spam"

# Subword pieces of the sample text
tokens = tokenizer.tokenize(test_text)

# Integer IDs, truncated / padded to config.max_seq_length
token_ids = tokenizer.encode(
    test_text,
    max_length=config.max_seq_length,
    truncation=True,
    padding='max_length',
)

# Report the text, its pieces, and the length of the encoded ID list
print(
    f"Original: {test_text}",
    f"Tokens: {tokens}",
    f"IDs shape: {len(token_ids)}",
    sep="\n",
)

Original: Follow me for amazing deals! #sponsored @user http://bit.ly/spam
Tokens: ['Follow', 'Ġme', 'Ġfor', 'Ġamazing', 'Ġdeals', '!', 'Ġ#', 'sponsored', 'Ġ@', 'user', 'Ġhttp', '://', 'bit', '.', 'ly', '/', 'sp', 'am']
[0, 18622, 162, 13, 2770, 2656, 328, 849, 16032, 787, 12105, 2054, 640, 5881, 4, 352, 73, 4182, 424, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
IDs shape: 128


## option 2: multi-task architecture

In [8]:
class BotDetectionHead(nn.Module):
    """Multi-task output head applied to a pooled (CLS) representation.

    Three independent linear projections are applied to the same input:

    * ``bot_logits``      -- bot/human classification logits
    * ``bot_type_logits`` -- bot-category classification logits
    * ``confidence``      -- a sigmoid-squashed scalar in (0, 1)

    Args:
        d_model: Size of the last dimension of the input representation.
        num_classes: Number of outputs of the bot/human classifier
            (default 2, the previous hard-coded value).
        num_bot_types: Number of bot categories (default 6, the previous
            hard-coded value).
    """

    def __init__(self, d_model, num_classes=2, num_bot_types=6):
        super().__init__()
        # Layers are created in the same order as before so that, for the
        # default sizes, weight initialisation under a fixed seed is unchanged.
        self.bot_classifier = nn.Linear(d_model, num_classes)  # bot/human
        self.bot_type_classifier = nn.Linear(d_model, num_bot_types)  # bot categories
        self.confidence_estimator = nn.Linear(d_model, 1)  # uncertainty

    def forward(self, cls_representation):
        """Run all three heads on ``cls_representation``.

        Args:
            cls_representation: Tensor whose last dimension is ``d_model``.

        Returns:
            dict with keys ``'bot_logits'``, ``'bot_type_logits'`` and
            ``'confidence'``; each value keeps the input's leading
            dimensions and has last dimension ``num_classes``,
            ``num_bot_types`` and ``1`` respectively.
        """
        bot_logits = self.bot_classifier(cls_representation)
        bot_type_logits = self.bot_type_classifier(cls_representation)
        # Sigmoid maps the raw score into (0, 1).
        confidence = torch.sigmoid(self.confidence_estimator(cls_representation))

        return {
            'bot_logits': bot_logits,
            'bot_type_logits': bot_type_logits,
            'confidence': confidence
        }

# Smoke-test the head on a random batch of two d_model-sized vectors
test_head = BotDetectionHead(config.d_model)
test_input = torch.randn(2, config.d_model)
outputs = test_head(test_input)

# One report line per quantity of interest, printed in order
for report_line in (
    f"Bot logits shape: {outputs['bot_logits'].shape}",
    f"Bot logits: {outputs['bot_logits']}",
    f"Bot type logits shape: {outputs['bot_type_logits'].shape}",
    f"Confidence shape: {outputs['confidence'].shape}",
    f"Confidence: {outputs['confidence']}",
):
    print(report_line)

Bot logits shape: torch.Size([2, 2])
Bot logits: tensor([[ 1.2711, -0.2371],
        [-0.6507, -0.8039]], grad_fn=<AddmmBackward0>)
Bot type logits shape: torch.Size([2, 6])
Confidence shape: torch.Size([2, 1])
Confidence: tensor([[0.2305],
        [0.5401]], grad_fn=<SigmoidBackward0>)


## Parameter Count Comparison

In [None]:
def calculate_transformer_params(d_model, num_layers, num_heads, vocab_size, max_seq_length,
                                 d_ff=None, num_classes=2):
    """Estimate the weight count of an encoder-style transformer.

    This is a simplified estimate: bias terms of the linear layers are not
    counted, and the classification head is a single ``d_model x num_classes``
    matrix.

    Args:
        d_model: Hidden / embedding width.
        num_layers: Number of transformer layers.
        num_heads: Accepted for signature symmetry with the model config; it
            does not affect the count, because the Q/K/V/O projections are
            counted as ``d_model x d_model`` each.
        vocab_size: Number of rows in the token embedding table.
        max_seq_length: Number of rows in the position embedding table.
        d_ff: Feed-forward inner width. Defaults to ``4 * d_model``.
        num_classes: Output size of the classification head. Defaults to 2.

    Returns:
        int: Estimated total number of parameters.
    """
    if d_ff is None:
        d_ff = d_model * 4

    # Embedding tables
    token_emb = vocab_size * d_model
    pos_emb = max_seq_length * d_model

    # Per transformer layer
    attention_params = 4 * d_model * d_model  # Q, K, V, O projections
    ffn_params = d_model * d_ff * 2  # up- and down-projection
    layer_norm_params = d_model * 2 * 2  # two layer norms, scale + shift each
    per_layer = attention_params + ffn_params + layer_norm_params

    total_layers = per_layer * num_layers

    # Classification head (simplified: weights only)
    classifier = d_model * num_classes

    return token_emb + pos_emb + total_layers + classifier

# Compare configurations
# Previous baseline: 768d, 6 layers, 8 heads, 30k vocabulary, 256-token context.
old_params = calculate_transformer_params(768, 6, 8, 30000, 256)
# Current configuration: read from `config` rather than repeated as literals,
# so this comparison cannot drift from the values the model is built with.
new_params = calculate_transformer_params(
    config.d_model,
    config.num_layers,
    config.num_heads,
    config.vocab_size,
    config.max_seq_length,
)

print(f"Old config (768d, 6L): {old_params:,} parameters")
print(f"New config ({config.d_model}d, {config.num_layers}L): {new_params:,} parameters")
print(f"Parameter ratio: {new_params/old_params:.2f}x")