# Experiment 2: Subword Tokenization Implementation
Focus on implementing robust Twitter-aware subword tokenization

In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Optimized Configuration

In [2]:
class OptimalBotConfig:
    d_model = 512           # Balanced performance/speed
    num_layers = 9          # Captures bot patterns effectively
    num_heads = 12          # Rich attention diversity
    d_ff = 2048            # 4x d_model ratio
    max_seq_length = 128   # Twitter-optimized
    dropout = 0.15         # Higher for overfitting prevention
    num_classes = 2
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config = OptimalBotConfig()
print(f"Device: {config.device}")
print(f"Model: {config.d_model}d, {config.num_layers}L, {config.num_heads}H")

Device: cpu
Model: 512d, 9L, 12H


## Twitter-Aware Subword Tokenization

In [3]:
class TwitterBotTokenizer:
    def __init__(self, model_name="cardiffnlp/twitter-roberta-base"):
        """Initialize with Twitter-aware RoBERTa tokenizer"""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.vocab_size = len(self.tokenizer)
        
        print(f"Loaded tokenizer: {model_name}")
        print(f"Vocabulary size: {self.vocab_size:,}")
        print(f"Special tokens: {self.tokenizer.special_tokens_map}")
    
    def analyze_text(self, text):
        """Analyze tokenization of a text sample"""
        # Raw tokenization
        tokens = self.tokenizer.tokenize(text)
        token_ids = self.tokenizer.encode(text, add_special_tokens=True)
        
        # With length constraints
        encoded = self.tokenizer(
            text,
            max_length=config.max_seq_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        
        return {
            'original_text': text,
            'tokens': tokens,
            'token_ids': token_ids,
            'encoded_ids': encoded['input_ids'].squeeze().tolist(),
            'attention_mask': encoded['attention_mask'].squeeze().tolist(),
            'actual_length': len(token_ids),
            'padded_length': len(encoded['input_ids'].squeeze())
        }
    
    def batch_encode(self, texts, max_length=None):
        """Encode a batch of texts"""
        max_length = max_length or config.max_seq_length
        
        encoded = self.tokenizer(
            texts,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask']
        }

# Initialize tokenizer
bot_tokenizer = TwitterBotTokenizer()

Loaded tokenizer: cardiffnlp/twitter-roberta-base
Vocabulary size: 50,265
Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}


## Tokenization Analysis on Bot vs Human Examples

In [4]:
# Test examples from different categories
test_examples = [
    # Genuine human tweets
    "Just had an amazing coffee at my local cafe ☕ Perfect start to Monday!",
    "Working from home today, feeling productive and grateful for flexibility",
    
    # Bot-like patterns  
    "Follow me for amazing deals! #sponsored #ad #promotion #sale #discount",
    "🚨 URGENT: Click here for FREE MONEY! Link in bio! #scam #fake #followers",
    
    # Edge cases
    "RT @user123: Buy now!!! Limited time offer!!! Don't miss out!!!",
    "@mention1 @mention2 @mention3 check this out http://bit.ly/suspicious"
]

print("=" * 80)
print("TOKENIZATION ANALYSIS")
print("=" * 80)

for i, text in enumerate(test_examples):
    analysis = bot_tokenizer.analyze_text(text)
    
    print(f"\nExample {i+1}:")
    print(f"Text: {analysis['original_text'][:60]}{'...' if len(analysis['original_text']) > 60 else ''}")
    print(f"Tokens ({len(analysis['tokens'])}): {analysis['tokens'][:8]}{'...' if len(analysis['tokens']) > 8 else ''}")
    print(f"Actual length: {analysis['actual_length']} tokens")
    print(f"Padded to: {analysis['padded_length']} tokens")
    print(f"Efficiency: {analysis['actual_length']/analysis['padded_length']*100:.1f}% (non-padding)")

TOKENIZATION ANALYSIS

Example 1:
Text: Just had an amazing coffee at my local cafe ☕ Perfect start ...
Tokens (16): ['Just', 'Ġhad', 'Ġan', 'Ġamazing', 'Ġcoffee', 'Ġat', 'Ġmy', 'Ġlocal']...
Actual length: 18 tokens
Padded to: 128 tokens
Efficiency: 14.1% (non-padding)

Example 2:
Text: Working from home today, feeling productive and grateful for...
Tokens (11): ['Working', 'Ġfrom', 'Ġhome', 'Ġtoday', ',', 'Ġfeeling', 'Ġproductive', 'Ġand']...
Actual length: 13 tokens
Padded to: 128 tokens
Efficiency: 10.2% (non-padding)

Example 3:
Text: Follow me for amazing deals! #sponsored #ad #promotion #sale...
Tokens (18): ['Follow', 'Ġme', 'Ġfor', 'Ġamazing', 'Ġdeals', '!', 'Ġ#', 'sponsored']...
Actual length: 20 tokens
Padded to: 128 tokens
Efficiency: 15.6% (non-padding)

Example 4:
Text: 🚨 URGENT: Click here for FREE MONEY! Link in bio! #scam #fak...
Tokens (26): ['ðŁ', 'ļ', '¨', 'ĠUR', 'G', 'ENT', ':', 'ĠClick']...
Actual length: 28 tokens
Padded to: 128 tokens
Efficiency: 21.9% (non-paddi

## Vocabulary and Token Distribution Analysis

In [5]:
def analyze_tokenization_efficiency(texts, tokenizer):
    """Analyze how efficiently our tokenizer handles the text corpus"""
    stats = {
        'lengths': [],
        'oov_counts': [],
        'special_token_usage': {'mentions': 0, 'urls': 0, 'hashtags': 0}
    }
    
    for text in texts[:100]:  # Sample for analysis
        tokens = tokenizer.tokenizer.tokenize(text)
        token_ids = tokenizer.tokenizer.encode(text, add_special_tokens=True)
        
        stats['lengths'].append(len(token_ids))
        
        # Count OOV (unknown) tokens
        oov_count = sum(1 for tid in token_ids if tid == tokenizer.tokenizer.unk_token_id)
        stats['oov_counts'].append(oov_count)
        
        # Check for Twitter-specific patterns
        text_lower = text.lower()
        if '@' in text: stats['special_token_usage']['mentions'] += 1
        if 'http' in text: stats['special_token_usage']['urls'] += 1
        if '#' in text: stats['special_token_usage']['hashtags'] += 1
    
    # Calculate statistics
    lengths = np.array(stats['lengths'])
    oov_counts = np.array(stats['oov_counts'])
    
    print(f"\nTOKENIZATION EFFICIENCY ANALYSIS")
    print(f"{'='*50}")
    print(f"Sample size: {len(texts[:100])} texts")
    print(f"\nLength Statistics:")
    print(f"  Mean length: {lengths.mean():.1f} tokens")
    print(f"  Median length: {np.median(lengths):.1f} tokens")
    print(f"  95th percentile: {np.percentile(lengths, 95):.1f} tokens")
    print(f"  Max length: {lengths.max()} tokens")
    print(f"  Texts > 128 tokens: {(lengths > 128).sum()}/{len(lengths)} ({(lengths > 128).mean()*100:.1f}%)")
    
    print(f"\nOOV (Unknown) Tokens:")
    print(f"  Mean OOV per text: {oov_counts.mean():.2f}")
    print(f"  Texts with OOV: {(oov_counts > 0).sum()}/{len(oov_counts)} ({(oov_counts > 0).mean()*100:.1f}%)")
    
    print(f"\nTwitter-specific Elements:")
    for element, count in stats['special_token_usage'].items():
        print(f"  Texts with {element}: {count}/{len(texts[:100])} ({count/len(texts[:100])*100:.1f}%)")
    
    return stats

# For demo, create some sample texts
sample_texts = [
    "Follow for follow! #F4F #follow #followback",
    "Just finished a great workout! Feeling energized 💪",
    "RT @sponsor: AMAZING DEAL! Click now! http://bit.ly/deal",
    "Coffee with friends this morning ☕ Perfect way to start the day",
    "🚨 URGENT: Free money! DM me now! #money #free #cash"
] * 20  # Repeat to get 100 samples

efficiency_stats = analyze_tokenization_efficiency(sample_texts, bot_tokenizer)


TOKENIZATION EFFICIENCY ANALYSIS
Sample size: 100 texts

Length Statistics:
  Mean length: 18.2 tokens
  Median length: 17.0 tokens
  95th percentile: 23.0 tokens
  Max length: 23 tokens
  Texts > 128 tokens: 0/100 (0.0%)

OOV (Unknown) Tokens:
  Mean OOV per text: 0.00
  Texts with OOV: 0/100 (0.0%)

Twitter-specific Elements:
  Texts with mentions: 20/100 (20.0%)
  Texts with urls: 20/100 (20.0%)
  Texts with hashtags: 40/100 (40.0%)


## Integration with PyTorch Dataset

In [6]:
from torch.utils.data import Dataset, DataLoader

class TwitterBotDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length or config.max_seq_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # Use the optimized tokenizer
        encoding = self.tokenizer.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Test dataset creation
test_texts = [
    "Genuine human tweet about daily life",
    "Follow me for deals! #sponsored #ad",
    "Another normal tweet about weather",
    "URGENT! Free money! Click link!"
]
test_labels = [0, 1, 0, 1]  # 0=human, 1=bot

dataset = TwitterBotDataset(test_texts, test_labels, bot_tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Test a batch
sample_batch = next(iter(dataloader))
print(f"\nSample batch shapes:")
print(f"  Input IDs: {sample_batch['input_ids'].shape}")
print(f"  Attention mask: {sample_batch['attention_mask'].shape}")
print(f"  Labels: {sample_batch['labels'].shape}")
print(f"\nFirst text tokens: {sample_batch['input_ids'][0][:10].tolist()}")
print(f"First attention mask: {sample_batch['attention_mask'][0][:10].tolist()}")


Sample batch shapes:
  Input IDs: torch.Size([2, 128])
  Attention mask: torch.Size([2, 128])
  Labels: torch.Size([2])

First text tokens: [0, 21518, 2340, 3545, 59, 1650, 2, 1, 1, 1]
First attention mask: [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
