## Test 1 - Masking Layer

In [1]:
import test_layer1

In [2]:
# Ch·∫°y to√†n b·ªô tests c·ªßa test_layer1.py b·∫±ng pytest
import pytest

result = pytest.main(["test_layer1.py", "-v"])
print("K·∫øt qu·∫£ pytest:", result)


platform win32 -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- c:\Python313\python.exe
cachedir: .pytest_cache
rootdir: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests
plugins: anyio-4.12.0
[1mcollecting ... [0mcollected 39 items

test_layer1.py::TestURLMasking::test_standard_url_with_https [32mPASSED[0m[32m      [  2%][0m
test_layer1.py::TestURLMasking::test_standard_url_with_www [32mPASSED[0m[32m        [  5%][0m
test_layer1.py::TestURLMasking::test_url_shortener_bitly [32mPASSED[0m[32m          [  7%][0m
test_layer1.py::TestURLMasking::test_aggressive_url_with_spaces [32mPASSED[0m[32m   [ 10%][0m
test_layer1.py::TestURLMasking::test_spam_tld_icu [32mPASSED[0m[32m                 [ 12%][0m
test_layer1.py::TestURLMasking::test_spam_tld_vip [32mPASSED[0m[32m                 [ 15%][0m
test_layer1.py::TestURLMasking::test_multiple_urls [32mPASSED[0m[32m                [ 17%][0m
test_layer1.py::TestZaloTelegramMasking::test_zalo_link [32mPASSED[

In [3]:
import pandas as pd
import sys
from pathlib import Path

# Setup paths
ROOT_DIR = Path.cwd().parent.parent.parent  # IE403_DoAnCuoiKy/
sys.path.insert(0, str(ROOT_DIR))

from Smishing.preprocessing.layer1_masking import AggressiveMasker
from Smishing.data_loader import load_dataset

# Load dataset
DATA_PATH = ROOT_DIR / "data" / "dataset.csv"
OUTPUT_PATH = Path.cwd() / "layer1_masking_results.csv"

print(f"üìÇ Loading dataset from: {DATA_PATH}")
df = load_dataset(DATA_PATH)
print(f"‚úÖ Loaded {len(df):,} rows")

# Initialize masker
masker = AggressiveMasker()

# Process all rows
print(f"\nüîÑ Processing {len(df):,} rows...")
results = []

for idx, row in df.iterrows():
    content = str(row.get("content", ""))
    label = row.get("label", "")
    
    try:
        masked_text, metadata = masker.mask(content)
        counts = masker.get_entity_counts(metadata)
    except Exception as e:
        masked_text = f"ERROR: {e}"
        metadata = {}
        counts = {}
    
    result = {
        "index": idx,
        "label": label,
        "original_content": content,
        "masked_content": masked_text,
        "url_count": counts.get("url", 0) + counts.get("zalo", 0) + counts.get("telegram", 0),
        "phone_count": counts.get("hotline", 0) + counts.get("landline", 0) + 
                      counts.get("mobile", 0) + counts.get("shortcode", 0),
        "money_count": counts.get("money", 0),
        "code_count": counts.get("code", 0),
        "email_count": counts.get("email", 0),
        "datetime_count": counts.get("datetime", 0),
        "raw_metadata": str(metadata),
    }
    results.append(result)
    
    if (idx + 1) % 500 == 0:
        print(f"   Processed {idx + 1:,} / {len(df):,} rows...")

# Save results
result_df = pd.DataFrame(results)
result_df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")

print(f"\n‚úÖ Results saved to: {OUTPUT_PATH}")
print(f"   Total rows: {len(result_df):,}")

üìÇ Loading dataset from: c:\IE403\IE403_DoAnCuoiKy\data\dataset.csv
‚úÖ Loaded 2,603 rows from dataset.csv (standard parser)
‚úÖ Loaded 2,603 rows

üîÑ Processing 2,603 rows...
   Processed 500 / 2,603 rows...
   Processed 1,000 / 2,603 rows...
   Processed 1,500 / 2,603 rows...
   Processed 2,000 / 2,603 rows...
   Processed 2,500 / 2,603 rows...

‚úÖ Results saved to: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests\layer1_masking_results.csv
   Total rows: 2,603


In [4]:
# Summary statistics
print("\nüìä SUMMARY STATISTICS:")
print("-" * 50)
print(f"   URLs detected:      {result_df['url_count'].sum():,}")
print(f"   Phones detected:    {result_df['phone_count'].sum():,}")
print(f"   Money detected:     {result_df['money_count'].sum():,}")
print(f"   Codes detected:     {result_df['code_count'].sum():,}")
print(f"   Emails detected:    {result_df['email_count'].sum():,}")
print(f"   DateTimes detected: {result_df['datetime_count'].sum():,}")

# Rows with at least one entity
has_entity = result_df[['url_count', 'phone_count', 'money_count', 'code_count']].sum(axis=1) > 0
print(f"\n   Rows with entities: {has_entity.sum():,} / {len(result_df):,} ({has_entity.sum()/len(result_df)*100:.1f}%)")


üìä SUMMARY STATISTICS:
--------------------------------------------------
   URLs detected:      1,192
   Phones detected:    2,157
   Money detected:     2,748
   Codes detected:     1,140
   Emails detected:    7
   DateTimes detected: 2,742

   Rows with entities: 2,285 / 2,603 (87.8%)


In [5]:
# Show sample results
print("\nüìã SAMPLE RESULTS (first 10 rows with changes):")
print("=" * 80)

# Filter rows that have changes
changed_rows = result_df[result_df['original_content'] != result_df['masked_content']]

for _, row in changed_rows.head(10).iterrows():
    print(f"\n[{row['index']}] Label: {row['label']}")
    print(f"   Original: {row['original_content'][:100]}...")
    print(f"   Masked:   {row['masked_content'][:100]}...")
    print(f"   Counts:   URL={row['url_count']}, Phone={row['phone_count']}, Money={row['money_count']}, Code={row['code_count']}")


üìã SAMPLE RESULTS (first 10 rows with changes):

[0] Label: 1
   Original: [TRUNG T√ÇM PH√íNG CH·ªêNG GIAN L·∫¨N NG√ÇN H√ÄNG] √îng/B√† Nguy·ªÖn VƒÉn Minh Tr∆∞·ªõc 17h ng√†y h√¥m nay kh√¥ng thanh...
   Masked:   [TRUNG T√ÇM PH√íNG CH·ªêNG GIAN L·∫¨N NG√ÇN H√ÄNG] √îng/B√† Nguy·ªÖn VƒÉn Minh Tr∆∞·ªõc <TIME> ng√†y h√¥m nay kh√¥ng th...
   Counts:   URL=0, Phone=0, Money=2, Code=0

[1] Label: 1
   Original: [TB] Tien ich Loi nhan thoai cua Viettel: Quy khach co loi nhan tu TB 0848836182 vao luc 08:09 27/03...
   Masked:   [TB] Tien ich Loi nhan thoai cua Viettel: Quy khach co loi nhan tu TB <PHONE> vao luc <TIME> <TIME>....
   Counts:   URL=0, Phone=2, Money=1, Code=0

[2] Label: 1
   Original: Western Union TB: Vietcombank: 0071000986547. Tr·∫ßn Th·ªã Lan. Ref +19.56 USD. Nh·∫≠n 500.000 VND. Ngay 0...
   Masked:   Western Union TB: Vietcombank: 0071000986547. Tr·∫ßn Th·ªã Lan. Ref +<MONEY>. Nh·∫≠n <MONEY>. Ngay <TIME>....
   Counts:   URL=1, Phone=0, Money=2, Code=0

[3] Label: 1
   Ori

## Test Layer 2

In [6]:
import test_layer2

‚úì Both dicts loaded: 78,258 words (full), 65,863 words (shadow) from c:\IE403\IE403_DoAnCuoiKy\Smishing\dicts\words.txt


In [7]:
# Ch·∫°y to√†n b·ªô tests c·ªßa test_layer1.py b·∫±ng pytest
import pytest

result = pytest.main(["test_layer2.py", "-v"])
print("K·∫øt qu·∫£ pytest:", result)

platform win32 -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- c:\Python313\python.exe
cachedir: .pytest_cache
rootdir: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests
plugins: anyio-4.12.0
[1mcollecting ... [0mcollected 46 items

test_layer2.py::TestLeetspeak::test_leet_digit_0_to_o [32mPASSED[0m[32m             [  2%][0m
test_layer2.py::TestLeetspeak::test_leet_digit_1_to_i [32mPASSED[0m[32m             [  4%][0m
test_layer2.py::TestLeetspeak::test_leet_digit_3_to_e [32mPASSED[0m[32m             [  6%][0m
test_layer2.py::TestLeetspeak::test_leet_digit_4_to_a [32mPASSED[0m[32m             [  8%][0m
test_layer2.py::TestLeetspeak::test_leet_symbol_exclamation_to_i [32mPASSED[0m[32m  [ 10%][0m
test_layer2.py::TestLeetspeak::test_leet_symbol_at_to_a [32mPASSED[0m[32m           [ 13%][0m
test_layer2.py::TestLeetspeak::test_leet_symbol_dollar_to_s [32mPASSED[0m[32m       [ 15%][0m
test_layer2.py::TestLeetspeak::test_leet_char_j_to_i [32mPASSED[0m

In [8]:
# ============================================================
# LAYER 2: APPLY NORMALIZATION TO DATASET
# ============================================================

import pandas as pd
import sys
import json
from pathlib import Path

# Setup paths
ROOT_DIR = Path.cwd().parent.parent.parent  # IE403_DoAnCuoiKy/
sys.path.insert(0, str(ROOT_DIR))

from Smishing.preprocessing.layer1_masking import AggressiveMasker
from Smishing.misspell_detection.layer2_normalization import TextNormalizer
from Smishing.data_loader import load_dataset

# Load dataset
DATA_PATH = ROOT_DIR / "data" / "dataset.csv"
OUTPUT_PATH = Path.cwd() / "layer2_normalization_results.csv"

print(f"üìÇ Loading dataset from: {DATA_PATH}")
df = load_dataset(DATA_PATH)
print(f"‚úÖ Loaded {len(df):,} rows")

# Initialize processors
masker = AggressiveMasker()
normalizer = TextNormalizer()

# Process all rows with FULL PIPELINE: Layer 1 ‚Üí Layer 2
print(f"\nüîÑ Processing {len(df):,} rows through Layer 1 + Layer 2...")
results = []

for idx, row in df.iterrows():
    content = str(row.get("content", ""))
    label = row.get("label", "")
    
    try:
        # Layer 1: Masking
        masked_text, mask_metadata = masker.mask(content)
        mask_counts = masker.get_entity_counts(mask_metadata)
        
        # Layer 2: Normalization (on masked text)
        norm_result = normalizer.normalize(masked_text)
        
    except Exception as e:
        masked_text = f"ERROR: {e}"
        mask_counts = {}
        norm_result = None
    
    result = {
        "index": idx,
        "label": label,
        "original_content": content,
        "layer1_masked": masked_text,
        "layer2_normalized": norm_result.normalized_text if norm_result else "",
        "layer2_tokens": str(norm_result.tokens) if norm_result else "[]",
        "token_count": len(norm_result.tokens) if norm_result else 0,
        "leet_count": norm_result.leet_count if norm_result else 0,
        "teencode_count": norm_result.teencode_count if norm_result else 0,
        "visual_leet_count": norm_result.visual_leet_count if norm_result else 0,
        "symbol_leet_count": norm_result.symbol_leet_count if norm_result else 0,
        "validated_leet_count": norm_result.validated_leet_count if norm_result else 0,
        "weighted_leet_score": norm_result.weighted_leet_score if norm_result else 0.0,
        "separator_count": norm_result.separator_count if norm_result else 0,
        # Layer 1 counts
        "url_count": mask_counts.get("url", 0) + mask_counts.get("zalo", 0) + mask_counts.get("telegram", 0),
        "phone_count": mask_counts.get("hotline", 0) + mask_counts.get("landline", 0) + 
                      mask_counts.get("mobile", 0) + mask_counts.get("shortcode", 0),
        "money_count": mask_counts.get("money", 0),
        "code_count": mask_counts.get("code", 0),
    }
    results.append(result)
    
    if (idx + 1) % 500 == 0:
        print(f"   Processed {idx + 1:,} / {len(df):,} rows...")

# Save results
result_df = pd.DataFrame(results)
result_df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")

print(f"\n‚úÖ Results saved to: {OUTPUT_PATH}")
print(f"   Total rows: {len(result_df):,}")

üìÇ Loading dataset from: c:\IE403\IE403_DoAnCuoiKy\data\dataset.csv
‚úÖ Loaded 2,603 rows from dataset.csv (standard parser)
‚úÖ Loaded 2,603 rows
‚úì Both dicts loaded: 78,258 words (full), 65,863 words (shadow) from c:\IE403\IE403_DoAnCuoiKy\Smishing\dicts\words.txt
‚úì Dictionary loaded: 78,258 words (full), 65,863 words (shadow)

üîÑ Processing 2,603 rows through Layer 1 + Layer 2...
   Processed 500 / 2,603 rows...
   Processed 1,000 / 2,603 rows...
   Processed 1,500 / 2,603 rows...
   Processed 2,000 / 2,603 rows...
   Processed 2,500 / 2,603 rows...

‚úÖ Results saved to: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests\layer2_normalization_results.csv
   Total rows: 2,603


In [9]:
# ============================================================
# LAYER 2: SUMMARY STATISTICS
# ============================================================

print("\nüìä LAYER 2 SUMMARY STATISTICS:")
print("=" * 60)

# Basic counts
print(f"\nüìù TOKEN STATISTICS:")
print(f"   Total tokens:      {result_df['token_count'].sum():,}")
print(f"   Avg tokens/row:    {result_df['token_count'].mean():.1f}")
print(f"   Max tokens/row:    {result_df['token_count'].max()}")

print(f"\nüî§ LEET DETECTION:")
print(f"   Total leet chars:  {result_df['leet_count'].sum():,}")
print(f"   Rows with leet:    {(result_df['leet_count'] > 0).sum():,} ({(result_df['leet_count'] > 0).sum()/len(result_df)*100:.1f}%)")
print(f"   Avg leet/row:      {result_df['leet_count'].mean():.2f}")
print(f"   Teencode count:    {result_df['teencode_count'].sum():,}")
print(f"   Visual leet:      {result_df['visual_leet_count'].sum():,}")
print(f"   Symbol leet:      {result_df['symbol_leet_count'].sum():,}")
print(f"   Validated leet:   {result_df['validated_leet_count'].sum():,}")
print(f"   Weighted score:   {result_df['weighted_leet_score'].sum():.2f}")
print(f"   Avg weighted score/row: {result_df['weighted_leet_score'].mean():.4f}")

print(f"\nüìå SEPARATOR DETECTION:")
print(f"   Total separators:  {result_df['separator_count'].sum():,}")
print(f"   Rows with sep:     {(result_df['separator_count'] > 0).sum():,} ({(result_df['separator_count'] > 0).sum()/len(result_df)*100:.1f}%)")

# By label comparison
print(f"\nüìà COMPARISON BY LABEL:")
print("-" * 60)
for label in result_df['label'].unique():
    subset = result_df[result_df['label'] == label]
    label_name = "SPAM" if label == 1 else "HAM"
    print(f"\n   {label_name} (label={label}): {len(subset):,} rows")
    print(f"      Avg tokens:    {subset['token_count'].mean():.1f}")
    print(f"      Avg leet:      {subset['leet_count'].mean():.2f}")
    print(f"      Avg teencode:  {subset['teencode_count'].mean():.2f}")
    print(f"      Avg visual:    {subset['visual_leet_count'].mean():.2f}")
    print(f"      Avg symbol:    {subset['symbol_leet_count'].mean():.2f}")
    print(f"      Avg validated: {subset['validated_leet_count'].mean():.2f}")
    print(f"      Avg weighted:  {subset['weighted_leet_score'].mean():.4f}")
    print(f"      Avg separator: {subset['separator_count'].mean():.2f}")


üìä LAYER 2 SUMMARY STATISTICS:

üìù TOKEN STATISTICS:
   Total tokens:      115,391
   Avg tokens/row:    44.3
   Max tokens/row:    197

üî§ LEET DETECTION:
   Total leet chars:  13,418
   Rows with leet:    2,025 (77.8%)
   Avg leet/row:      5.15
   Teencode count:    8
   Visual leet:      172
   Symbol leet:      32
   Validated leet:   1,507
   Weighted score:   215.80
   Avg weighted score/row: 0.0829

üìå SEPARATOR DETECTION:
   Total separators:  25,241
   Rows with sep:     2,542 (97.7%)

üìà COMPARISON BY LABEL:
------------------------------------------------------------

   SPAM (label=1): 278 rows
      Avg tokens:    40.1
      Avg leet:      3.52
      Avg teencode:  0.02
      Avg visual:    0.33
      Avg symbol:    0.10
      Avg validated: 0.92
      Avg weighted:  0.4673
      Avg separator: 7.45

   HAM (label=0): 2,325 rows
      Avg tokens:    44.8
      Avg leet:      5.35
      Avg teencode:  0.00
      Avg visual:    0.03
      Avg symbol:    0.00
    

In [10]:
# ============================================================
# SAMPLE RESULTS
# ============================================================

print("\nüìã SAMPLE RESULTS (Layer 1 ‚Üí Layer 2):")
print("=" * 80)

# Show samples with leet detected
leet_samples = result_df[result_df['leet_count'] > 0].head(5)

for _, row in leet_samples.iterrows():
    print(f"\n[{row['index']}] Label: {'SPAM' if row['label']==1 else 'HAM'}")
    print(f"   Original:   {row['original_content'][:80]}...")
    print(f"   L1 Masked:  {row['layer1_masked'][:80]}...")
    print(f"   L2 Normalized: {row['layer2_normalized'][:80]}...")
    print(f"   Leet Stats: Total={row['leet_count']}, Teencode={row['teencode_count']}, Visual={row['visual_leet_count']}, Symbol={row['symbol_leet_count']}, Validated={row['validated_leet_count']}, Weighted={row['weighted_leet_score']:.2f}")
    print(f"   Sep: {row['separator_count']}, Tokens: {row['token_count']}")
    print(f"   Validated leet: {row['validated_leet_count']}")
    print("-" * 40)


üìã SAMPLE RESULTS (Layer 1 ‚Üí Layer 2):

[2] Label: SPAM
   Original:   Western Union TB: Vietcombank: 0071000986547. Tr·∫ßn Th·ªã Lan. Ref +19.56 USD. Nh·∫≠n...
   L1 Masked:  Western Union TB: Vietcombank: 0071000986547. Tr·∫ßn Th·ªã Lan. Ref +<MONEY>. Nh·∫≠n <...
   L2 Normalized: western union tb vietcombank 0071000986547 tr·∫ßn th·ªã lan ref t <MONEY> nh·∫≠n <MONE...
   Leet Stats: Total=21, Teencode=0, Visual=0, Symbol=0, Validated=2, Weighted=0.00
   Sep: 12, Tokens: 29
   Validated leet: 2
----------------------------------------

[3] Label: SPAM
   Original:   B·∫Øc, t√†i kho·∫£n t√†i ch√≠nh c·ªßa b·∫°n ƒë√£ ƒë∆∞·ª£c th√™m v√†o. T√†i kho·∫£n: Nay128 M·∫≠t kh·∫©u: y...
   L1 Masked:  B·∫Øc, t√†i kho·∫£n t√†i ch√≠nh c·ªßa b·∫°n ƒë√£ ƒë∆∞·ª£c th√™m v√†o. T√†i kho·∫£n: Nay128 M·∫≠t kh·∫©u: y...
   L2 Normalized: b·∫Øc t√†i kho·∫£n t√†i ch√≠nh c·ªßa b·∫°n ƒë√£ ƒë∆∞·ª£c th√™m v√†o t√†i kho·∫£n nay128 m·∫≠t kh·∫©u yk669...
   Leet Stats: Total=8, Teencode=0, Visual=0, Symbol=0, V

## Test Layer 3

In [11]:
import test_layer3

# Ch·∫°y to√†n b·ªô tests c·ªßa test_layer3.py b·∫±ng pytest
import pytest

result = pytest.main(["test_layer3.py", "-v"])
print("K·∫øt qu·∫£ pytest:", result)

platform win32 -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- c:\Python313\python.exe
cachedir: .pytest_cache
rootdir: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests
plugins: anyio-4.12.0
[1mcollecting ... [0mcollected 44 items

test_layer3.py::TestBrandFiltering::test_bank_brand_vcb [32mPASSED[0m[32m           [  2%][0m
test_layer3.py::TestBrandFiltering::test_bank_brand_bidv [32mPASSED[0m[32m          [  4%][0m
test_layer3.py::TestBrandFiltering::test_bank_brand_vietinbank [32mPASSED[0m[32m    [  6%][0m
test_layer3.py::TestBrandFiltering::test_ewallet_brand_momo [32mPASSED[0m[32m       [  9%][0m
test_layer3.py::TestBrandFiltering::test_telco_brand_viettel [32mPASSED[0m[32m      [ 11%][0m
test_layer3.py::TestBrandFiltering::test_app_brand_tiktok [32mPASSED[0m[32m         [ 13%][0m
test_layer3.py::TestJargonFiltering::test_jargon_otp [32mPASSED[0m[32m              [ 15%][0m
test_layer3.py::TestJargonFiltering::test_jargon_sim [32mPASSED[0m

In [12]:
# ============================================================
# FULL PIPELINE: LAYER 1 ‚Üí LAYER 2 ‚Üí LAYER 3
# ============================================================

import pandas as pd
import sys
import json
from pathlib import Path

# Setup paths
ROOT_DIR = Path.cwd().parent.parent.parent  # IE403_DoAnCuoiKy/
sys.path.insert(0, str(ROOT_DIR))

from Smishing.preprocessing.layer1_masking import AggressiveMasker
from Smishing.misspell_detection.layer2_normalization import TextNormalizer
from Smishing.misspell_detection.layer3_whitelist import WhitelistFilter
from Smishing.data_loader import load_dataset

# Load dataset
DATA_PATH = ROOT_DIR / "data" / "dataset.csv"
OUTPUT_PATH = Path.cwd() / "layer3_whitelist_results.csv"

print(f"üìÇ Loading dataset from: {DATA_PATH}")
df = load_dataset(DATA_PATH)
print(f"‚úÖ Loaded {len(df):,} rows")

# Initialize all processors
masker = AggressiveMasker()
normalizer = TextNormalizer()
whitelist_filter = WhitelistFilter()

print(f"\n‚úì Layer 1: AggressiveMasker initialized")
print(f"‚úì Layer 2: TextNormalizer initialized")
print(f"‚úì Layer 3: WhitelistFilter initialized ({len(whitelist_filter.whitelist)} whitelist items)")

# Process all rows with FULL PIPELINE
print(f"\nüîÑ Processing {len(df):,} rows through Layer 1 ‚Üí Layer 2 ‚Üí Layer 3...")
results = []

for idx, row in df.iterrows():
    content = str(row.get("content", ""))
    label = row.get("label", "")
    
    try:
        # ===== LAYER 1: MASKING =====
        masked_text, mask_metadata = masker.mask(content)
        mask_counts = masker.get_entity_counts(mask_metadata)
        
        # ===== LAYER 2: NORMALIZATION =====
        norm_result = normalizer.normalize(masked_text)
        tokens = norm_result.tokens
        
        # ===== LAYER 3: WHITELIST FILTERING =====
        whitelist_result = whitelist_filter.filter(tokens)
        
    except Exception as e:
        print(f"Error at row {idx}: {e}")
        masked_text = content
        norm_result = None
        whitelist_result = None
        tokens = []
        mask_counts = {}
    
    # Build result row
    result = {
        "index": idx,
        "label": label,
        "original_content": content,
        # Layer 1
        "layer1_masked": masked_text,
        "url_count": mask_counts.get("url", 0) + mask_counts.get("zalo", 0) + mask_counts.get("telegram", 0),
        "phone_count": mask_counts.get("hotline", 0) + mask_counts.get("landline", 0) + 
                      mask_counts.get("mobile", 0) + mask_counts.get("shortcode", 0),
        "money_count": mask_counts.get("money", 0),
        "code_count": mask_counts.get("code", 0),
        "bank_acc_count": mask_counts.get("bank_acc", 0),
        # Layer 2
        "layer2_normalized": norm_result.normalized_text if norm_result else "",
        "layer2_tokens": str(tokens),
        "token_count": len(tokens),
        "leet_count": norm_result.leet_count if norm_result else 0,
        "teencode_count": norm_result.teencode_count if norm_result else 0,
        "visual_leet_count": norm_result.visual_leet_count if norm_result else 0,
        "symbol_leet_count": norm_result.symbol_leet_count if norm_result else 0,
        "validated_leet_count": norm_result.validated_leet_count if norm_result else 0,
        "weighted_leet_score": norm_result.weighted_leet_score if norm_result else 0.0,
        "separator_count": norm_result.separator_count if norm_result else 0,
        # Layer 3
        "tokens_to_check": str(whitelist_result.tokens_to_check) if whitelist_result else "[]",
        "whitelisted_tokens": str(whitelist_result.whitelisted_tokens) if whitelist_result else "[]",
        "whitelist_count": whitelist_result.whitelist_count if whitelist_result else 0,
        "tokens_to_check_count": len(whitelist_result.tokens_to_check) if whitelist_result else 0,
    }
    results.append(result)
    
    if (idx + 1) % 500 == 0:
        print(f"   Processed {idx + 1:,} / {len(df):,} rows...")

# Save results
result_df = pd.DataFrame(results)
result_df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")

print(f"\n‚úÖ Results saved to: {OUTPUT_PATH}")
print(f"   Total rows: {len(result_df):,}")

üìÇ Loading dataset from: c:\IE403\IE403_DoAnCuoiKy\data\dataset.csv
‚úÖ Loaded 2,603 rows from dataset.csv (standard parser)
‚úÖ Loaded 2,603 rows
‚úì Both dicts loaded: 78,258 words (full), 65,863 words (shadow) from c:\IE403\IE403_DoAnCuoiKy\Smishing\dicts\words.txt
‚úì Dictionary loaded: 78,258 words (full), 65,863 words (shadow)

‚úì Layer 1: AggressiveMasker initialized
‚úì Layer 2: TextNormalizer initialized
‚úì Layer 3: WhitelistFilter initialized (173 whitelist items)

üîÑ Processing 2,603 rows through Layer 1 ‚Üí Layer 2 ‚Üí Layer 3...
   Processed 500 / 2,603 rows...
   Processed 1,000 / 2,603 rows...
   Processed 1,500 / 2,603 rows...
   Processed 2,000 / 2,603 rows...
   Processed 2,500 / 2,603 rows...

‚úÖ Results saved to: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests\layer3_whitelist_results.csv
   Total rows: 2,603


In [13]:
# ============================================================
# LAYER 3 SUMMARY STATISTICS
# ============================================================

print("\nüìä FULL PIPELINE SUMMARY STATISTICS:")
print("=" * 60)

# Layer 1 Stats
print("\nüîí LAYER 1 - ENTITY MASKING:")
print(f"   URLs detected:      {result_df['url_count'].sum():,}")
print(f"   Phones detected:    {result_df['phone_count'].sum():,}")
print(f"   Money detected:     {result_df['money_count'].sum():,}")
print(f"   Codes detected:     {result_df['code_count'].sum():,}")
print(f"   Bank accounts detected: {result_df['bank_acc_count'].sum():,}")

# Layer 2 Stats
print(f"\nüî§ LAYER 2 - NORMALIZATION:")
print(f"   Total tokens:       {result_df['token_count'].sum():,}")
print(f"   Avg tokens/msg:     {result_df['token_count'].mean():.1f}")
print(f"   Total leet chars:   {result_df['leet_count'].sum():,}")
print(f"   Teencode count:     {result_df['teencode_count'].sum():,}")
print(f"   Visual leet:       {result_df['visual_leet_count'].sum():,}")
print(f"   Symbol leet:       {result_df['symbol_leet_count'].sum():,}")
print(f"   Validated leet:    {result_df['validated_leet_count'].sum():,}")
print(f"   Weighted score:    {result_df['weighted_leet_score'].sum():.2f}")
print(f"   Total separators:   {result_df['separator_count'].sum():,}")

# Layer 3 Stats
print(f"\nüìã LAYER 3 - WHITELIST FILTERING:")
print(f"   Total whitelist:    {result_df['whitelist_count'].sum():,}")
print(f"   Total to check:     {result_df['tokens_to_check_count'].sum():,}")
print(f"   Avg whitelist/msg:  {result_df['whitelist_count'].mean():.2f}")
print(f"   Avg to check/msg:   {result_df['tokens_to_check_count'].mean():.2f}")

# Filtering ratio
total_tokens = result_df['token_count'].sum()
tokens_filtered = result_df['whitelist_count'].sum()
tokens_remaining = result_df['tokens_to_check_count'].sum()

print(f"\nüìà FILTERING EFFICIENCY:")
print(f"   Total tokens input:       {total_tokens:,}")
print(f"   Tokens filtered out:      {tokens_filtered:,} ({tokens_filtered/total_tokens*100:.1f}%)")
print(f"   Tokens for spell check:   {tokens_remaining:,} ({tokens_remaining/total_tokens*100:.1f}%)")


üìä FULL PIPELINE SUMMARY STATISTICS:

üîí LAYER 1 - ENTITY MASKING:
   URLs detected:      1,192
   Phones detected:    2,157
   Money detected:     2,748
   Codes detected:     1,140
   Bank accounts detected: 0

üî§ LAYER 2 - NORMALIZATION:
   Total tokens:       115,391
   Avg tokens/msg:     44.3
   Total leet chars:   13,418
   Teencode count:     8
   Visual leet:       172
   Symbol leet:       32
   Validated leet:    1,507
   Weighted score:    215.80
   Total separators:   25,241

üìã LAYER 3 - WHITELIST FILTERING:
   Total whitelist:    23,431
   Total to check:     91,960
   Avg whitelist/msg:  9.00
   Avg to check/msg:   35.33

üìà FILTERING EFFICIENCY:
   Total tokens input:       115,391
   Tokens filtered out:      23,431 (20.3%)
   Tokens for spell check:   91,960 (79.7%)


In [14]:
# ============================================================
# COMPARISON BY LABEL
# ============================================================

print("\nüìä COMPARISON BY LABEL:")
print("=" * 60)

for label in [1, 0]:
    subset = result_df[result_df['label'] == label]
    label_name = "SPAM" if label == 1 else "HAM"
    
    print(f"\n{'üö®' if label == 1 else '‚úÖ'} {label_name} (label={label}): {len(subset):,} messages")
    print("-" * 40)
    print(f"   Avg tokens:           {subset['token_count'].mean():.1f}")
    print(f"   Avg leet chars:       {subset['leet_count'].mean():.2f}")
    print(f"   Avg teencode:         {subset['teencode_count'].mean():.2f}")
    print(f"   Avg visual leet:      {subset['visual_leet_count'].mean():.2f}")
    print(f"   Avg symbol leet:      {subset['symbol_leet_count'].mean():.2f}")
    print(f"   Avg validated leet:  {subset['validated_leet_count'].mean():.2f}")
    print(f"   Avg weighted score:   {subset['weighted_leet_score'].mean():.4f}")
    print(f"   Avg separators:       {subset['separator_count'].mean():.2f}")
    print(f"   Avg whitelist count:  {subset['whitelist_count'].mean():.2f}")
    print(f"   Avg tokens to check:  {subset['tokens_to_check_count'].mean():.2f}")
    
    # Whitelist ratio
    total = subset['token_count'].sum()
    filtered = subset['whitelist_count'].sum()
    print(f"   Whitelist ratio:      {filtered/total*100:.1f}%")


üìä COMPARISON BY LABEL:

üö® SPAM (label=1): 278 messages
----------------------------------------
   Avg tokens:           40.1
   Avg leet chars:       3.52
   Avg teencode:         0.02
   Avg visual leet:      0.33
   Avg symbol leet:      0.10
   Avg validated leet:  0.92
   Avg weighted score:   0.4673
   Avg separators:       7.45
   Avg whitelist count:  5.53
   Avg tokens to check:  34.60
   Whitelist ratio:      13.8%

‚úÖ HAM (label=0): 2,325 messages
----------------------------------------
   Avg tokens:           44.8
   Avg leet chars:       5.35
   Avg teencode:         0.00
   Avg visual leet:      0.03
   Avg symbol leet:      0.00
   Avg validated leet:  0.54
   Avg weighted score:   0.0369
   Avg separators:       9.97
   Avg whitelist count:  9.42
   Avg tokens to check:  35.42
   Whitelist ratio:      21.0%


In [15]:
# ============================================================
# SAMPLE RESULTS - FULL PIPELINE
# ============================================================

print("\nüìã SAMPLE RESULTS (Layer 1 ‚Üí Layer 2 ‚Üí Layer 3):")
print("=" * 80)

# Show samples with interesting whitelist filtering
samples = result_df[result_df['whitelist_count'] > 0].head(5)

for _, row in samples.iterrows():
    print(f"\n[{row['index']}] Label: {'SPAM' if row['label']==1 else 'HAM'}")
    print(f"   Original:    {row['original_content'][:70]}...")
    print(f"   L1 Masked:   {row['layer1_masked'][:70]}...")
    print(f"   L2 Tokens:   {row['layer2_tokens'][:70]}...")
    print(f"   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")
    print(f"   üîç Whitelisted ({row['whitelist_count']}): {row['whitelisted_tokens'][:60]}...")
    print(f"   ‚úèÔ∏è  To check ({row['tokens_to_check_count']}):   {row['tokens_to_check'][:60]}...")
    
    # Show leet information if available
    if row['leet_count'] > 0:
        print(f"   üî§ Leet Info: Total={row['leet_count']}, Teencode={row['teencode_count']}, Visual={row['visual_leet_count']}, Symbol={row['symbol_leet_count']}, Validated={row['validated_leet_count']}, Weighted={row['weighted_leet_score']:.2f}")
    print("-" * 80)


üìã SAMPLE RESULTS (Layer 1 ‚Üí Layer 2 ‚Üí Layer 3):

[0] Label: SPAM
   Original:    [TRUNG T√ÇM PH√íNG CH·ªêNG GIAN L·∫¨N NG√ÇN H√ÄNG] √îng/B√† Nguy·ªÖn VƒÉn Minh Tr∆∞·ªõ...
   L1 Masked:   [TRUNG T√ÇM PH√íNG CH·ªêNG GIAN L·∫¨N NG√ÇN H√ÄNG] √îng/B√† Nguy·ªÖn VƒÉn Minh Tr∆∞·ªõ...
   L2 Tokens:   ['trung', 't√¢m', 'ph√≤ng', 'ch·ªëng', 'gian', 'l·∫≠n', 'ng√¢n', 'h√†ng', '√¥ng...
   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
   üîç Whitelisted (5): ['<TIME>', '<MONEY>', 'tp', 'hcm', '<MONEY>']...
   ‚úèÔ∏è  To check (83):   ['trung', 't√¢m', 'ph√≤ng', 'ch·ªëng', 'gian', 'l·∫≠n', 'ng√¢n', 'h...
--------------------------------------------------------------------------------

[1] Label: SPAM
   Original:    [TB] Tien ich Loi nhan thoai cua Viettel: Quy khach co loi nhan tu TB ...
   L1 Masked:   [TB] Tien ich Loi nhan thoai cua Viettel: Quy khach co loi nhan tu TB ...
   L2 Tokens:   ['tb', 'tien', 'ich', 

## Test Layer 4

In [16]:
import test_layer4

# Ch·∫°y to√†n b·ªô tests c·ªßa test_layer4.py b·∫±ng pytest
import pytest

result = pytest.main(["test_layer4.py", "-v"])
print("K·∫øt qu·∫£ pytest:", result)

platform win32 -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- c:\Python313\python.exe
cachedir: .pytest_cache
rootdir: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests
plugins: anyio-4.12.0
[1mcollecting ... [0mcollected 12 items

test_layer4.py::TestOOVDetection::test_valid_words_full_dict [32mPASSED[0m[32m      [  8%][0m
test_layer4.py::TestOOVDetection::test_valid_words_shadow_dict [32mPASSED[0m[32m    [ 16%][0m
test_layer4.py::TestOOVDetection::test_oov_word [32mPASSED[0m[32m                   [ 25%][0m
test_layer4.py::TestOOVDetection::test_case_insensitive [32mPASSED[0m[32m           [ 33%][0m
test_layer4.py::TestOOVDetection::test_ignore_digits_and_short [32mPASSED[0m[32m    [ 41%][0m
test_layer4.py::TestAdvancedFeatures::test_broken_telex [32mPASSED[0m[32m           [ 50%][0m
test_layer4.py::TestAdvancedFeatures::test_gibberish [32mPASSED[0m[32m              [ 58%][0m
test_layer4.py::TestAdvancedFeatures::test_repeated_chars [32mPASSED

In [17]:
# ============================================================
# FULL PIPELINE: LAYER 1 ‚Üí LAYER 2 ‚Üí LAYER 3 ‚Üí LAYER 4
# ============================================================

import pandas as pd
import sys
import json
from pathlib import Path

# Setup paths
ROOT_DIR = Path.cwd().parent.parent.parent  # IE403_DoAnCuoiKy/
sys.path.insert(0, str(ROOT_DIR))

from Smishing.preprocessing.layer1_masking import AggressiveMasker
from Smishing.misspell_detection.layer2_normalization import TextNormalizer
from Smishing.misspell_detection.layer3_whitelist import WhitelistFilter
from Smishing.misspell_detection.layer4_misspell import MisspellExtractor
from Smishing.data_loader import load_dataset

# Load dataset
DATA_PATH = ROOT_DIR / "data" / "dataset.csv"
OUTPUT_PATH = Path.cwd() / "layer4_misspell_results.csv"

print(f"üìÇ Loading dataset from: {DATA_PATH}")
df = load_dataset(DATA_PATH)
print(f"‚úÖ Loaded {len(df):,} rows")

# Initialize all processors
masker = AggressiveMasker()
normalizer = TextNormalizer()
whitelist_filter = WhitelistFilter()
misspell_extractor = MisspellExtractor(full_dict=normalizer.full_dict, 
                                       shadow_dict=normalizer.shadow_dict)

print(f"\n‚úì Layer 1: AggressiveMasker initialized")
print(f"‚úì Layer 2: TextNormalizer initialized")
print(f"‚úì Layer 3: WhitelistFilter initialized ({len(whitelist_filter.whitelist)} whitelist items)")
print(f"‚úì Layer 4: MisspellExtractor initialized")

# Process all rows with FULL PIPELINE

print(f"\nüîÑ Processing {len(df):,} rows through Layer 1 ‚Üí Layer 2 ‚Üí Layer 3 ‚Üí Layer 4...")
results = []

for idx, row in df.iterrows():
    content = str(row.get("content", ""))
    label = row.get("label", "")

    try:
        # ===== LAYER 1: MASKING =====
        masked_text, mask_metadata = masker.mask(content)
        mask_counts = masker.get_entity_counts(mask_metadata)
        
        # ===== LAYER 2: NORMALIZATION =====
        norm_result = normalizer.normalize(masked_text)
        tokens = norm_result.tokens
        
        # ===== LAYER 3: WHITELIST FILTERING =====
        whitelist_result = whitelist_filter.filter(tokens)
        tokens_to_check = whitelist_result.tokens_to_check
        
        # ===== LAYER 4: MISPELL EXTRACTION =====
        misspell_result = misspell_extractor.extract(tokens_to_check)

    except Exception as e:
        print(f"Error at row {idx}: {e}")
        masked_text = content
        norm_result = None
        whitelist_result = None
        misspell_result = None
        tokens = []
        tokens_to_check = []
        mask_counts = {}
        
    # Build result row
    result = {
        "index": idx,
        "label": label,
        "original_content": content,
        # Layer 1
        "layer1_masked": masked_text,
        "url_count": mask_counts.get("url", 0) + mask_counts.get("zalo", 0) + mask_counts.get("telegram", 0),
        "phone_count": mask_counts.get("hotline", 0) + mask_counts.get("landline", 0) + 
                      mask_counts.get("mobile", 0) + mask_counts.get("shortcode", 0),
        "money_count": mask_counts.get("money", 0),
        "code_count": mask_counts.get("code", 0),
        # Layer 2
        "layer2_normalized": norm_result.normalized_text if norm_result else "",
        "layer2_tokens": str(tokens),
        "token_count": len(tokens),
        "leet_count": norm_result.leet_count if norm_result else 0,
        "teencode_count": norm_result.teencode_count if norm_result else 0,
        "visual_leet_count": norm_result.visual_leet_count if norm_result else 0,
        "symbol_leet_count": norm_result.symbol_leet_count if norm_result else 0,
        "validated_leet_count": norm_result.validated_leet_count if norm_result else 0,
        "weighted_leet_score": norm_result.weighted_leet_score if norm_result else 0.0,
        "separator_count": norm_result.separator_count if norm_result else 0,
        # Layer 3
        "tokens_to_check": str(whitelist_result.tokens_to_check) if whitelist_result else "[]",
        "whitelisted_tokens": str(whitelist_result.whitelisted_tokens) if whitelist_result else "[]",
        "whitelist_count": whitelist_result.whitelist_count if whitelist_result else 0,
        "tokens_to_check_count": len(whitelist_result.tokens_to_check) if whitelist_result else 0,
        # Layer 4
        "oov_count": misspell_result.oov_count if misspell_result else 0,
        "oov_density": misspell_result.oov_density if misspell_result else 0.0,
        "broken_telex_count": misspell_result.broken_telex_count if misspell_result else 0,
        "longest_oov_length": misspell_result.longest_oov_length if misspell_result else 0,
        "oov_tokens": str(misspell_result.oov_tokens) if misspell_result else "[]",
    }
    results.append(result)

    if (idx + 1) % 500 == 0:
        print(f"   Processed {idx + 1:,} / {len(df):,} rows...")

# Save results
result_df = pd.DataFrame(results)
result_df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")

print(f"\n‚úÖ Results saved to: {OUTPUT_PATH}")
print(f"   Total rows: {len(result_df):,}")

üìÇ Loading dataset from: c:\IE403\IE403_DoAnCuoiKy\data\dataset.csv
‚úÖ Loaded 2,603 rows from dataset.csv (standard parser)
‚úÖ Loaded 2,603 rows
‚úì Both dicts loaded: 78,258 words (full), 65,863 words (shadow) from c:\IE403\IE403_DoAnCuoiKy\Smishing\dicts\words.txt
‚úì Dictionary loaded: 78,258 words (full), 65,863 words (shadow)

‚úì Layer 1: AggressiveMasker initialized
‚úì Layer 2: TextNormalizer initialized
‚úì Layer 3: WhitelistFilter initialized (173 whitelist items)
‚úì Layer 4: MisspellExtractor initialized

üîÑ Processing 2,603 rows through Layer 1 ‚Üí Layer 2 ‚Üí Layer 3 ‚Üí Layer 4...
   Processed 500 / 2,603 rows...
   Processed 1,000 / 2,603 rows...
   Processed 1,500 / 2,603 rows...
   Processed 2,000 / 2,603 rows...
   Processed 2,500 / 2,603 rows...

‚úÖ Results saved to: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests\layer4_misspell_results.csv
   Total rows: 2,603


In [18]:
# ============================================================
# LAYER 4 SUMMARY STATISTICS
# ============================================================

print("\nüìä FULL PIPELINE SUMMARY STATISTICS:")
print("=" * 60)

# Layer 1 Stats
print("\nüîí LAYER 1 - ENTITY MASKING:")
print(f"   URLs detected:      {result_df['url_count'].sum():,}")
print(f"   Phones detected:    {result_df['phone_count'].sum():,}")
print(f"   Money detected:     {result_df['money_count'].sum():,}")
print(f"   Codes detected:     {result_df['code_count'].sum():,}")

# Layer 2 Stats
print(f"\nüî§ LAYER 2 - NORMALIZATION:")
print(f"   Total tokens:       {result_df['token_count'].sum():,}")
print(f"   Avg tokens/msg:     {result_df['token_count'].mean():.1f}")
print(f"   Total leet chars:   {result_df['leet_count'].sum():,}")
print(f"   Teencode count:     {result_df['teencode_count'].sum():,}")
print(f"   Visual leet:       {result_df['visual_leet_count'].sum():,}")
print(f"   Symbol leet:       {result_df['symbol_leet_count'].sum():,}")
print(f"   Validated leet:    {result_df['validated_leet_count'].sum():,}")
print(f"   Weighted score:    {result_df['weighted_leet_score'].sum():.2f}")
print(f"   Total separators:   {result_df['separator_count'].sum():,}")

# Layer 3 Stats
print(f"\nüìã LAYER 3 - WHITELIST FILTERING:")
print(f"   Total whitelist:    {result_df['whitelist_count'].sum():,}")
print(f"   Total to check:     {result_df['tokens_to_check_count'].sum():,}")
print(f"   Avg whitelist/msg:  {result_df['whitelist_count'].mean():.2f}")
print(f" Filltering_ratio:    {result_df['whitelist_count'].sum() / result_df['token_count'].sum() * 100:.1f}%")

# Layer 4 Stats
print(f"\nüìã LAYER 4 - MISPELL EXTRACTION:")
print(f"   Total OOV tokens:          {result_df['oov_count'].sum():,}")
print(f"   Avg OOV/msg:              {result_df['oov_count'].mean():.2f}")
print(f"   Avg OOV density:          {result_df['oov_density'].mean():.2f}")
print(f"   Total broken telex:       {result_df['broken_telex_count'].sum():,}")
print(f"   Max OOV length:           {result_df['longest_oov_length'].max():,}")
print(f"   Messages with OOV:       {(result_df['oov_count'] > 0).sum():,} ({(result_df['oov_count'] > 0).sum()/len(result_df)*100:.1f}")


# Validation efficiency
total_checked = result_df['tokens_to_check_count'].sum()
total_oov = result_df['oov_count'].sum()

print(f"\nüìà VALIDATION EFFICIENCY:")
print(f"   Total tokens checked:     {total_checked:,}")
print(f"   OOV detected:             {total_oov:,} ({total_oov/total_checked*100:.2f}%)")
print(f"   Valid tokens:             {total_checked - total_oov:,} ({100 - total_oov/total_checked*100:.2f}%)")



üìä FULL PIPELINE SUMMARY STATISTICS:

üîí LAYER 1 - ENTITY MASKING:
   URLs detected:      1,192
   Phones detected:    2,157
   Money detected:     2,748
   Codes detected:     1,140

üî§ LAYER 2 - NORMALIZATION:
   Total tokens:       115,391
   Avg tokens/msg:     44.3
   Total leet chars:   13,418
   Teencode count:     8
   Visual leet:       172
   Symbol leet:       32
   Validated leet:    1,507
   Weighted score:    215.80
   Total separators:   25,241

üìã LAYER 3 - WHITELIST FILTERING:
   Total whitelist:    23,431
   Total to check:     91,960
   Avg whitelist/msg:  9.00
 Filltering_ratio:    20.3%

üìã LAYER 4 - MISPELL EXTRACTION:
   Total OOV tokens:          3,580
   Avg OOV/msg:              1.38
   Avg OOV density:          0.08
   Total broken telex:       220
   Max OOV length:           16
   Messages with OOV:       1,421 (54.6

üìà VALIDATION EFFICIENCY:
   Total tokens checked:     91,960
   OOV detected:             3,580 (3.89%)
   Valid tokens:       

In [19]:
# ============================================================
# COMPARISON BY LABEL
# ============================================================

print("\nüìä COMPARISON BY LABEL (FULL PIPELINE):")
print("=" * 60)

for label in [1, 0]:
    subset = result_df[result_df['label'] == label]
    label_name = "SMISHING" if label == 1 else "LEGIT"
    
    print(f"\n{'üö®' if label == 1 else '‚úÖ'} {label_name} (label={label}): {len(subset):,} messages")
    print("-" * 40)

    # Layer 2 features
    print(f"\nüî§ LAYER 2 - NORMALIZATION:")
    print(f"   Avg tokens/msg:     {subset['token_count'].mean():.1f}")
    print(f"   Avg validated leet: {subset['validated_leet_count'].mean():.2f}")
    print(f"   Avg weighted score: {subset['weighted_leet_score'].mean():.4f}")

    # Layer 3 features
    print(f"\nüìã LAYER 3 - WHITELIST FILTERING:")
    print(f"   Avg whitelist/msg:  {subset['whitelist_count'].mean():.2f}")
    print(f"   Avg tokens to check/msg:   {subset['tokens_to_check_count'].mean():.2f}")
    print(f"   Filltering_ratio:   {subset['whitelist_count'].sum() / subset['token_count'].sum() * 100:.1f}%")

    # Layer 4 features
    print(f"\nüìã LAYER 4 - MISPELL EXTRACTION:")
    print(f"   Avg OOV/msg:        {subset['oov_count'].mean():.2f}")
    print(f"   Avg OOV density:    {subset['oov_density'].mean():.2f}")
    print(f"   Avg broken telex:   {subset['broken_telex_count'].mean():.2f}")
    print(f"   Max OOV length:     {subset['longest_oov_length'].max():,}")
    print(f"   Messages with OOV: {(subset['oov_count'] > 0).sum():,} ({(subset['oov_count'] > 0).sum()/len(subset)*100:.1f}%)")

    # Combined features
    print(f"\nüîç COMBINED FEATURES:")
    print(f"   Avg leet + OOV:      {subset['leet_count'].mean() + subset['oov_count'].mean():.2f}")
    print(f"  Avg weight + density: {subset['weighted_leet_score'].mean() + subset['oov_density'].mean():.4f}")


üìä COMPARISON BY LABEL (FULL PIPELINE):

üö® SMISHING (label=1): 278 messages
----------------------------------------

üî§ LAYER 2 - NORMALIZATION:
   Avg tokens/msg:     40.1
   Avg validated leet: 0.92
   Avg weighted score: 0.4673

üìã LAYER 3 - WHITELIST FILTERING:
   Avg whitelist/msg:  5.53
   Avg tokens to check/msg:   34.60
   Filltering_ratio:   13.8%

üìã LAYER 4 - MISPELL EXTRACTION:
   Avg OOV/msg:        2.29
   Avg OOV density:    0.11
   Avg broken telex:   0.16
   Max OOV length:     16
   Messages with OOV: 194 (69.8%)

üîç COMBINED FEATURES:
   Avg leet + OOV:      5.81
  Avg weight + density: 0.5760

‚úÖ LEGIT (label=0): 2,325 messages
----------------------------------------

üî§ LAYER 2 - NORMALIZATION:
   Avg tokens/msg:     44.8
   Avg validated leet: 0.54
   Avg weighted score: 0.0369

üìã LAYER 3 - WHITELIST FILTERING:
   Avg whitelist/msg:  9.42
   Avg tokens to check/msg:   35.42
   Filltering_ratio:   21.0%

üìã LAYER 4 - MISPELL EXTRACTION:
   A