## Test 1 - Masking Layer

In [1]:
import test_layer1

In [2]:
# Ch·∫°y to√†n b·ªô tests c·ªßa test_layer1.py b·∫±ng pytest
import pytest

result = pytest.main(["test_layer1.py", "-v"])
print("K·∫øt qu·∫£ pytest:", result)


platform win32 -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- c:\Python313\python.exe
cachedir: .pytest_cache
rootdir: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests
plugins: anyio-4.12.0
[1mcollecting ... [0mcollected 39 items

test_layer1.py::TestURLMasking::test_standard_url_with_https [32mPASSED[0m[32m      [  2%][0m
test_layer1.py::TestURLMasking::test_standard_url_with_www [32mPASSED[0m[32m        [  5%][0m
test_layer1.py::TestURLMasking::test_url_shortener_bitly [32mPASSED[0m[32m          [  7%][0m
test_layer1.py::TestURLMasking::test_aggressive_url_with_spaces [32mPASSED[0m[32m   [ 10%][0m
test_layer1.py::TestURLMasking::test_spam_tld_icu [32mPASSED[0m[32m                 [ 12%][0m
test_layer1.py::TestURLMasking::test_spam_tld_vip [32mPASSED[0m[32m                 [ 15%][0m
test_layer1.py::TestURLMasking::test_multiple_urls [32mPASSED[0m[32m                [ 17%][0m
test_layer1.py::TestZaloTelegramMasking::test_zalo_link [32mPASSED[

In [3]:
import pandas as pd
import sys
from pathlib import Path

# Setup paths
ROOT_DIR = Path.cwd().parent.parent.parent  # IE403_DoAnCuoiKy/
sys.path.insert(0, str(ROOT_DIR))

from Smishing.preprocessing.layer1_masking import AggressiveMasker
from Smishing.data_loader import load_dataset

# Load dataset
DATA_PATH = ROOT_DIR / "data" / "dataset.csv"
OUTPUT_PATH = Path.cwd() / "layer1_masking_results.csv"

print(f"üìÇ Loading dataset from: {DATA_PATH}")
df = load_dataset(DATA_PATH)
print(f"‚úÖ Loaded {len(df):,} rows")

# Initialize masker
masker = AggressiveMasker()

# Process all rows
print(f"\nüîÑ Processing {len(df):,} rows...")
results = []

for idx, row in df.iterrows():
    content = str(row.get("content", ""))
    label = row.get("label", "")
    
    try:
        masked_text, metadata = masker.mask(content)
        counts = masker.get_entity_counts(metadata)
    except Exception as e:
        masked_text = f"ERROR: {e}"
        metadata = {}
        counts = {}
    
    result = {
        "index": idx,
        "label": label,
        "original_content": content,
        "masked_content": masked_text,
        "url_count": counts.get("url", 0) + counts.get("zalo", 0) + counts.get("telegram", 0),
        "phone_count": counts.get("hotline", 0) + counts.get("landline", 0) + 
                      counts.get("mobile", 0) + counts.get("shortcode", 0),
        "money_count": counts.get("money", 0),
        "code_count": counts.get("code", 0),
        "email_count": counts.get("email", 0),
        "datetime_count": counts.get("datetime", 0),
        "raw_metadata": str(metadata),
    }
    results.append(result)
    
    if (idx + 1) % 500 == 0:
        print(f"   Processed {idx + 1:,} / {len(df):,} rows...")

# Save results
result_df = pd.DataFrame(results)
result_df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")

print(f"\n‚úÖ Results saved to: {OUTPUT_PATH}")
print(f"   Total rows: {len(result_df):,}")

üìÇ Loading dataset from: c:\IE403\IE403_DoAnCuoiKy\data\dataset.csv
‚úÖ Loaded 2,603 rows from dataset.csv (standard parser)
‚úÖ Loaded 2,603 rows

üîÑ Processing 2,603 rows...
   Processed 500 / 2,603 rows...
   Processed 1,000 / 2,603 rows...
   Processed 1,500 / 2,603 rows...
   Processed 2,000 / 2,603 rows...
   Processed 2,500 / 2,603 rows...

‚úÖ Results saved to: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests\layer1_masking_results.csv
   Total rows: 2,603


In [4]:
# Summary statistics
print("\nüìä SUMMARY STATISTICS:")
print("-" * 50)
print(f"   URLs detected:      {result_df['url_count'].sum():,}")
print(f"   Phones detected:    {result_df['phone_count'].sum():,}")
print(f"   Money detected:     {result_df['money_count'].sum():,}")
print(f"   Codes detected:     {result_df['code_count'].sum():,}")
print(f"   Emails detected:    {result_df['email_count'].sum():,}")
print(f"   DateTimes detected: {result_df['datetime_count'].sum():,}")

# Rows with at least one entity
has_entity = result_df[['url_count', 'phone_count', 'money_count', 'code_count']].sum(axis=1) > 0
print(f"\n   Rows with entities: {has_entity.sum():,} / {len(result_df):,} ({has_entity.sum()/len(result_df)*100:.1f}%)")


üìä SUMMARY STATISTICS:
--------------------------------------------------
   URLs detected:      1,395
   Phones detected:    2,151
   Money detected:     2,952
   Codes detected:     1,087
   Emails detected:    7
   DateTimes detected: 2,742

   Rows with entities: 2,310 / 2,603 (88.7%)


In [5]:
# Show sample results
print("\nüìã SAMPLE RESULTS (first 10 rows with changes):")
print("=" * 80)

# Filter rows that have changes
changed_rows = result_df[result_df['original_content'] != result_df['masked_content']]

for _, row in changed_rows.head(10).iterrows():
    print(f"\n[{row['index']}] Label: {row['label']}")
    print(f"   Original: {row['original_content'][:100]}...")
    print(f"   Masked:   {row['masked_content'][:100]}...")
    print(f"   Counts:   URL={row['url_count']}, Phone={row['phone_count']}, Money={row['money_count']}, Code={row['code_count']}")


üìã SAMPLE RESULTS (first 10 rows with changes):

[0] Label: 1
   Original: [TRUNG T√ÇM PH√íNG CH·ªêNG GIAN L·∫¨N NG√ÇN H√ÄNG] √îng/B√† Nguy·ªÖn VƒÉn Minh Tr∆∞·ªõc 17h ng√†y h√¥m nay kh√¥ng thanh...
   Masked:   [TRUNG T√ÇM PH√íNG CH·ªêNG GIAN L·∫¨N NG√ÇN H√ÄNG] √îng/B√† Nguy·ªÖn VƒÉn Minh Tr∆∞·ªõc <TIME> ng√†y h√¥m nay kh√¥ng th...
   Counts:   URL=0, Phone=0, Money=2, Code=0

[1] Label: 1
   Original: [TB] Tien ich Loi nhan thoai cua Viettel: Quy khach co loi nhan tu TB 0848836182 vao luc 08:09 27/03...
   Masked:   [TB] Tien ich Loi nhan thoai cua Viettel: Quy khach co loi nhan tu TB <PHONE> vao luc <TIME> <TIME>....
   Counts:   URL=0, Phone=2, Money=1, Code=0

[2] Label: 1
   Original: Western Union TB: Vietcombank: 0071000986547. Tr·∫ßn Th·ªã Lan. Ref +19.56 USD. Nh·∫≠n 500.000 VND. Ngay 0...
   Masked:   Western Union TB: Vietcombank: 0071000986547. Tr·∫ßn Th·ªã Lan. Ref +<MONEY>. Nh·∫≠n <MONEY>. Ngay <TIME>....
   Counts:   URL=1, Phone=0, Money=2, Code=0

[3] Label: 1
   Ori

## Test Layer 2

In [6]:
import test_layer2

In [7]:
# Ch·∫°y to√†n b·ªô tests c·ªßa test_layer1.py b·∫±ng pytest
import pytest

result = pytest.main(["test_layer2.py", "-v"])
print("K·∫øt qu·∫£ pytest:", result)

platform win32 -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- c:\Python313\python.exe
cachedir: .pytest_cache
rootdir: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests
plugins: anyio-4.12.0
[1mcollecting ... [0mcollected 46 items

test_layer2.py::TestLeetspeak::test_leet_digit_0_to_o [31mFAILED[0m[31m             [  2%][0m
test_layer2.py::TestLeetspeak::test_leet_digit_1_to_i [32mPASSED[0m[31m             [  4%][0m
test_layer2.py::TestLeetspeak::test_leet_digit_3_to_e [32mPASSED[0m[31m             [  6%][0m
test_layer2.py::TestLeetspeak::test_leet_digit_4_to_a [32mPASSED[0m[31m             [  8%][0m
test_layer2.py::TestLeetspeak::test_leet_symbol_exclamation_to_i [32mPASSED[0m[31m  [ 10%][0m
test_layer2.py::TestLeetspeak::test_leet_symbol_at_to_a [32mPASSED[0m[31m           [ 13%][0m
test_layer2.py::TestLeetspeak::test_leet_symbol_dollar_to_s [32mPASSED[0m[31m       [ 15%][0m
test_layer2.py::TestLeetspeak::test_leet_char_j_to_i [32mPASSED[0m

In [8]:
# ============================================================
# LAYER 2: APPLY NORMALIZATION TO DATASET
# ============================================================

import pandas as pd
import sys
import json
from pathlib import Path

# Setup paths
ROOT_DIR = Path.cwd().parent.parent.parent  # IE403_DoAnCuoiKy/
sys.path.insert(0, str(ROOT_DIR))

from Smishing.preprocessing.layer1_masking import AggressiveMasker
from Smishing.misspell_detection.layer2_normalization import TextNormalizer
from Smishing.data_loader import load_dataset

# Load dataset
DATA_PATH = ROOT_DIR / "data" / "dataset.csv"
OUTPUT_PATH = Path.cwd() / "layer2_normalization_results.csv"

print(f"üìÇ Loading dataset from: {DATA_PATH}")
df = load_dataset(DATA_PATH)
print(f"‚úÖ Loaded {len(df):,} rows")

# Initialize processors
masker = AggressiveMasker()
normalizer = TextNormalizer()

# Process all rows with FULL PIPELINE: Layer 1 ‚Üí Layer 2
print(f"\nüîÑ Processing {len(df):,} rows through Layer 1 + Layer 2...")
results = []

for idx, row in df.iterrows():
    content = str(row.get("content", ""))
    label = row.get("label", "")
    
    try:
        # Layer 1: Masking
        masked_text, mask_metadata = masker.mask(content)
        mask_counts = masker.get_entity_counts(mask_metadata)
        
        # Layer 2: Normalization (on masked text)
        norm_result = normalizer.normalize(masked_text)
        
    except Exception as e:
        masked_text = f"ERROR: {e}"
        mask_counts = {}
        norm_result = None
    
    result = {
        "index": idx,
        "label": label,
        "original_content": content,
        "layer1_masked": masked_text,
        "layer2_normalized": norm_result.normalized_text if norm_result else "",
        "layer2_tokens": str(norm_result.tokens) if norm_result else "[]",
        "token_count": len(norm_result.tokens) if norm_result else 0,
        "leet_count": norm_result.leet_count if norm_result else 0,
        "leet_word_count": norm_result.leet_word_count if norm_result else 0,
        "leet_density": norm_result.leet_density if norm_result else 0.0,
        "leet_words": json.dumps(norm_result.leet_words, ensure_ascii=False) if norm_result and norm_result.leet_words else "[]",
        "leet_patterns_used": json.dumps(norm_result.leet_patterns_used, ensure_ascii=False) if norm_result and norm_result.leet_patterns_used else "{}",
        "separator_count": norm_result.separator_count if norm_result else 0,
        # Layer 1 counts
        "url_count": mask_counts.get("url", 0) + mask_counts.get("zalo", 0) + mask_counts.get("telegram", 0),
        "phone_count": mask_counts.get("hotline", 0) + mask_counts.get("landline", 0) + 
                      mask_counts.get("mobile", 0) + mask_counts.get("shortcode", 0),
        "money_count": mask_counts.get("money", 0),
        "code_count": mask_counts.get("code", 0),
    }
    results.append(result)
    
    if (idx + 1) % 500 == 0:
        print(f"   Processed {idx + 1:,} / {len(df):,} rows...")

# Save results
result_df = pd.DataFrame(results)
result_df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")

print(f"\n‚úÖ Results saved to: {OUTPUT_PATH}")
print(f"   Total rows: {len(result_df):,}")

üìÇ Loading dataset from: c:\IE403\IE403_DoAnCuoiKy\data\dataset.csv
‚úÖ Loaded 2,603 rows from dataset.csv (standard parser)
‚úÖ Loaded 2,603 rows

üîÑ Processing 2,603 rows through Layer 1 + Layer 2...
   Processed 500 / 2,603 rows...
   Processed 1,000 / 2,603 rows...
   Processed 1,500 / 2,603 rows...
   Processed 2,000 / 2,603 rows...
   Processed 2,500 / 2,603 rows...

‚úÖ Results saved to: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests\layer2_normalization_results.csv
   Total rows: 2,603


In [9]:
# ============================================================
# LAYER 2: SUMMARY STATISTICS
# ============================================================

print("\nüìä LAYER 2 SUMMARY STATISTICS:")
print("=" * 60)

# Basic counts
print(f"\nüìù TOKEN STATISTICS:")
print(f"   Total tokens:      {result_df['token_count'].sum():,}")
print(f"   Avg tokens/row:    {result_df['token_count'].mean():.1f}")
print(f"   Max tokens/row:    {result_df['token_count'].max()}")

print(f"\nüî§ LEET DETECTION:")
print(f"   Total leet chars:  {result_df['leet_count'].sum():,}")
print(f"   Rows with leet:    {(result_df['leet_count'] > 0).sum():,} ({(result_df['leet_count'] > 0).sum()/len(result_df)*100:.1f}%)")
print(f"   Avg leet/row:      {result_df['leet_count'].mean():.2f}")
print(f"   Total leet words:  {result_df['leet_word_count'].sum():,}")
print(f"   Avg leet words/row: {result_df['leet_word_count'].mean():.2f}")
print(f"   Avg leet density:  {result_df['leet_density'].mean():.4f}")

print(f"\nüìå SEPARATOR DETECTION:")
print(f"   Total separators:  {result_df['separator_count'].sum():,}")
print(f"   Rows with sep:     {(result_df['separator_count'] > 0).sum():,} ({(result_df['separator_count'] > 0).sum()/len(result_df)*100:.1f}%)")

# By label comparison
print(f"\nüìà COMPARISON BY LABEL:")
print("-" * 60)
for label in result_df['label'].unique():
    subset = result_df[result_df['label'] == label]
    label_name = "SPAM" if label == 1 else "HAM"
    print(f"\n   {label_name} (label={label}): {len(subset):,} rows")
    print(f"      Avg tokens:    {subset['token_count'].mean():.1f}")
    print(f"      Avg leet:      {subset['leet_count'].mean():.2f}")
    print(f"      Avg leet words: {subset['leet_word_count'].mean():.2f}")
    print(f"      Avg leet density: {subset['leet_density'].mean():.4f}")
    print(f"      Avg separator: {subset['separator_count'].mean():.2f}")


üìä LAYER 2 SUMMARY STATISTICS:

üìù TOKEN STATISTICS:
   Total tokens:      114,904
   Avg tokens/row:    44.1
   Max tokens/row:    197

üî§ LEET DETECTION:
   Total leet chars:  19
   Rows with leet:    18 (0.7%)
   Avg leet/row:      0.01
   Total leet words:  19
   Avg leet words/row: 0.01
   Avg leet density:  0.0001

üìå SEPARATOR DETECTION:
   Total separators:  24,360
   Rows with sep:     2,541 (97.6%)

üìà COMPARISON BY LABEL:
------------------------------------------------------------

   SPAM (label=1): 278 rows
      Avg tokens:    40.0
      Avg leet:      0.06
      Avg leet words: 0.06
      Avg leet density: 0.0005
      Avg separator: 7.21

   HAM (label=0): 2,325 rows
      Avg tokens:    44.6
      Avg leet:      0.00
      Avg leet words: 0.00
      Avg leet density: 0.0000
      Avg separator: 9.62


In [10]:
# ============================================================
# SAMPLE RESULTS
# ============================================================

import json

print("\nüìã SAMPLE RESULTS (Layer 1 ‚Üí Layer 2):")
print("=" * 80)

# Show samples with leet detected
leet_samples = result_df[result_df['leet_count'] > 0].head(5)

for _, row in leet_samples.iterrows():
    print(f"\n[{row['index']}] Label: {'SPAM' if row['label']==1 else 'HAM'}")
    print(f"   Original:   {row['original_content'][:80]}...")
    print(f"   L1 Masked:  {row['layer1_masked'][:80]}...")
    print(f"   L2 Normalized: {row['layer2_normalized'][:80]}...")
    print(f"   Leet: {row['leet_count']}, Leet Words: {row['leet_word_count']}, Density: {row['leet_density']:.4f}, Sep: {row['separator_count']}, Tokens: {row['token_count']}")
    
    # Show leet words if available
    if row['leet_words'] and row['leet_words'] != "[]":
        try:
            leet_words = json.loads(row['leet_words'])
            if leet_words:
                print(f"   Leet Words Found: {len(leet_words)}")
                for i, word_info in enumerate(leet_words[:3]):  # Show first 3
                    print(f"      - '{word_info.get('original', '')}' -> '{word_info.get('decoded', '')}' ({word_info.get('leet_chars', 0)} leet chars)")
        except:
            pass
    print("-" * 40)


üìã SAMPLE RESULTS (Layer 1 ‚Üí Layer 2):

[16] Label: SPAM
   Original:   Th0ng ba0:BIDV nang cap he thong. Vui l0ng dang nhap https://b0dv.xyz va nang ca...
   L1 Masked:  Th0ng ba0:BIDV nang cap he thong. Vui l0ng dang nhap <URL> va nang cap. Neu kh0n...
   L2 Normalized: thong bao bidv nang cap he thong vui long dang nhap <URL> va nang cap neu khong ...
   Leet: 1, Leet Words: 1, Density: 0.0093, Sep: 4, Tokens: 23
   Leet Words Found: 1
      - 'ba0' -> 'bao' (1 leet chars)
----------------------------------------

[29] Label: SPAM
   Original:   LENH TRUY NA: Can cu tai lieu thu thap duoc, ngay 09/03/2022 Co quan canh sat di...
   L1 Masked:  LENH TRUY NA: Can cu tai lieu thu thap duoc, ngay <TIME> Co quan canh sat dieu t...
   L2 Normalized: lenh truy na can cu tai lieu thu thap duoc ngay <TIME> co quan canh sat dieu tra...
   Leet: 1, Leet Words: 1, Density: 0.0034, Sep: 6, Tokens: 64
   Leet Words Found: 1
      - 'N0' -> 'No' (1 leet chars)
---------------------------------

## Test Layer 3

In [11]:
import test_layer3

# Ch·∫°y to√†n b·ªô tests c·ªßa test_layer3.py b·∫±ng pytest
import pytest

result = pytest.main(["test_layer3.py", "-v"])
print("K·∫øt qu·∫£ pytest:", result)

platform win32 -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- c:\Python313\python.exe
cachedir: .pytest_cache
rootdir: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests
plugins: anyio-4.12.0
[1mcollecting ... [0mcollected 44 items

test_layer3.py::TestBrandFiltering::test_bank_brand_vcb [32mPASSED[0m[32m           [  2%][0m
test_layer3.py::TestBrandFiltering::test_bank_brand_bidv [32mPASSED[0m[32m          [  4%][0m
test_layer3.py::TestBrandFiltering::test_bank_brand_vietinbank [32mPASSED[0m[32m    [  6%][0m
test_layer3.py::TestBrandFiltering::test_ewallet_brand_momo [32mPASSED[0m[32m       [  9%][0m
test_layer3.py::TestBrandFiltering::test_telco_brand_viettel [32mPASSED[0m[32m      [ 11%][0m
test_layer3.py::TestBrandFiltering::test_app_brand_tiktok [32mPASSED[0m[32m         [ 13%][0m
test_layer3.py::TestJargonFiltering::test_jargon_otp [32mPASSED[0m[32m              [ 15%][0m
test_layer3.py::TestJargonFiltering::test_jargon_sim [32mPASSED[0m

In [12]:
# ============================================================
# FULL PIPELINE: LAYER 1 ‚Üí LAYER 2 ‚Üí LAYER 3
# ============================================================

import pandas as pd
import sys
import json
from pathlib import Path

# Setup paths
ROOT_DIR = Path.cwd().parent.parent.parent  # IE403_DoAnCuoiKy/
sys.path.insert(0, str(ROOT_DIR))

from Smishing.preprocessing.layer1_masking import AggressiveMasker
from Smishing.misspell_detection.layer2_normalization import TextNormalizer
from Smishing.misspell_detection.layer3_whitelist import WhitelistFilter
from Smishing.data_loader import load_dataset

# Load dataset
DATA_PATH = ROOT_DIR / "data" / "dataset.csv"
OUTPUT_PATH = Path.cwd() / "layer3_whitelist_results.csv"

print(f"üìÇ Loading dataset from: {DATA_PATH}")
df = load_dataset(DATA_PATH)
print(f"‚úÖ Loaded {len(df):,} rows")

# Initialize all processors
masker = AggressiveMasker()
normalizer = TextNormalizer()
whitelist_filter = WhitelistFilter()

print(f"\n‚úì Layer 1: AggressiveMasker initialized")
print(f"‚úì Layer 2: TextNormalizer initialized")
print(f"‚úì Layer 3: WhitelistFilter initialized ({len(whitelist_filter.whitelist)} whitelist items)")

# Process all rows with FULL PIPELINE
print(f"\nüîÑ Processing {len(df):,} rows through Layer 1 ‚Üí Layer 2 ‚Üí Layer 3...")
results = []

for idx, row in df.iterrows():
    content = str(row.get("content", ""))
    label = row.get("label", "")
    
    try:
        # ===== LAYER 1: MASKING =====
        masked_text, mask_metadata = masker.mask(content)
        mask_counts = masker.get_entity_counts(mask_metadata)
        
        # ===== LAYER 2: NORMALIZATION =====
        norm_result = normalizer.normalize(masked_text)
        tokens = norm_result.tokens
        
        # ===== LAYER 3: WHITELIST FILTERING =====
        whitelist_result = whitelist_filter.filter(tokens)
        
    except Exception as e:
        print(f"Error at row {idx}: {e}")
        masked_text = content
        norm_result = None
        whitelist_result = None
        tokens = []
        mask_counts = {}
    
    # Build result row
    result = {
        "index": idx,
        "label": label,
        "original_content": content,
        # Layer 1
        "layer1_masked": masked_text,
        "url_count": mask_counts.get("url", 0) + mask_counts.get("zalo", 0) + mask_counts.get("telegram", 0),
        "phone_count": mask_counts.get("hotline", 0) + mask_counts.get("landline", 0) + 
                      mask_counts.get("mobile", 0) + mask_counts.get("shortcode", 0),
        "money_count": mask_counts.get("money", 0),
        "code_count": mask_counts.get("code", 0),
        # Layer 2
        "layer2_normalized": norm_result.normalized_text if norm_result else "",
        "layer2_tokens": str(tokens),
        "token_count": len(tokens),
        "leet_count": norm_result.leet_count if norm_result else 0,
        "leet_word_count": norm_result.leet_word_count if norm_result else 0,
        "leet_density": norm_result.leet_density if norm_result else 0.0,
        "leet_words": json.dumps(norm_result.leet_words, ensure_ascii=False) if norm_result and norm_result.leet_words else "[]",
        "leet_patterns_used": json.dumps(norm_result.leet_patterns_used, ensure_ascii=False) if norm_result and norm_result.leet_patterns_used else "{}",
        "separator_count": norm_result.separator_count if norm_result else 0,
        # Layer 3
        "tokens_to_check": str(whitelist_result.tokens_to_check) if whitelist_result else "[]",
        "whitelisted_tokens": str(whitelist_result.whitelisted_tokens) if whitelist_result else "[]",
        "whitelist_count": whitelist_result.whitelist_count if whitelist_result else 0,
        "tokens_to_check_count": len(whitelist_result.tokens_to_check) if whitelist_result else 0,
    }
    results.append(result)
    
    if (idx + 1) % 500 == 0:
        print(f"   Processed {idx + 1:,} / {len(df):,} rows...")

# Save results
result_df = pd.DataFrame(results)
result_df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")

print(f"\n‚úÖ Results saved to: {OUTPUT_PATH}")
print(f"   Total rows: {len(result_df):,}")

üìÇ Loading dataset from: c:\IE403\IE403_DoAnCuoiKy\data\dataset.csv
‚úÖ Loaded 2,603 rows from dataset.csv (standard parser)
‚úÖ Loaded 2,603 rows

‚úì Layer 1: AggressiveMasker initialized
‚úì Layer 2: TextNormalizer initialized
‚úì Layer 3: WhitelistFilter initialized (124 whitelist items)

üîÑ Processing 2,603 rows through Layer 1 ‚Üí Layer 2 ‚Üí Layer 3...
   Processed 500 / 2,603 rows...
   Processed 1,000 / 2,603 rows...
   Processed 1,500 / 2,603 rows...
   Processed 2,000 / 2,603 rows...
   Processed 2,500 / 2,603 rows...

‚úÖ Results saved to: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests\layer3_whitelist_results.csv
   Total rows: 2,603


In [13]:
# ============================================================
# LAYER 3 SUMMARY STATISTICS
# ============================================================

print("\nüìä FULL PIPELINE SUMMARY STATISTICS:")
print("=" * 60)

# Layer 1 Stats
print("\nüîí LAYER 1 - ENTITY MASKING:")
print(f"   URLs detected:      {result_df['url_count'].sum():,}")
print(f"   Phones detected:    {result_df['phone_count'].sum():,}")
print(f"   Money detected:     {result_df['money_count'].sum():,}")
print(f"   Codes detected:     {result_df['code_count'].sum():,}")

# Layer 2 Stats
print(f"\nüî§ LAYER 2 - NORMALIZATION:")
print(f"   Total tokens:       {result_df['token_count'].sum():,}")
print(f"   Avg tokens/msg:     {result_df['token_count'].mean():.1f}")
print(f"   Total leet chars:   {result_df['leet_count'].sum():,}")
print(f"   Total leet words:   {result_df['leet_word_count'].sum():,}")
print(f"   Avg leet words/msg: {result_df['leet_word_count'].mean():.2f}")
print(f"   Avg leet density:   {result_df['leet_density'].mean():.4f}")
print(f"   Total separators:   {result_df['separator_count'].sum():,}")

# Layer 3 Stats
print(f"\nüìã LAYER 3 - WHITELIST FILTERING:")
print(f"   Total whitelist:    {result_df['whitelist_count'].sum():,}")
print(f"   Total to check:     {result_df['tokens_to_check_count'].sum():,}")
print(f"   Avg whitelist/msg:  {result_df['whitelist_count'].mean():.2f}")
print(f"   Avg to check/msg:   {result_df['tokens_to_check_count'].mean():.2f}")

# Filtering ratio
total_tokens = result_df['token_count'].sum()
tokens_filtered = result_df['whitelist_count'].sum()
tokens_remaining = result_df['tokens_to_check_count'].sum()

print(f"\nüìà FILTERING EFFICIENCY:")
print(f"   Total tokens input:       {total_tokens:,}")
print(f"   Tokens filtered out:      {tokens_filtered:,} ({tokens_filtered/total_tokens*100:.1f}%)")
print(f"   Tokens for spell check:   {tokens_remaining:,} ({tokens_remaining/total_tokens*100:.1f}%)")


üìä FULL PIPELINE SUMMARY STATISTICS:

üîí LAYER 1 - ENTITY MASKING:
   URLs detected:      1,395
   Phones detected:    2,151
   Money detected:     2,952
   Codes detected:     1,087

üî§ LAYER 2 - NORMALIZATION:
   Total tokens:       114,904
   Avg tokens/msg:     44.1
   Total leet chars:   19
   Total leet words:   19
   Avg leet words/msg: 0.01
   Avg leet density:   0.0001
   Total separators:   24,360

üìã LAYER 3 - WHITELIST FILTERING:
   Total whitelist:    20,789
   Total to check:     94,115
   Avg whitelist/msg:  7.99
   Avg to check/msg:   36.16

üìà FILTERING EFFICIENCY:
   Total tokens input:       114,904
   Tokens filtered out:      20,789 (18.1%)
   Tokens for spell check:   94,115 (81.9%)


In [14]:
# ============================================================
# COMPARISON BY LABEL
# ============================================================

print("\nüìä COMPARISON BY LABEL:")
print("=" * 60)

for label in [1, 0]:
    subset = result_df[result_df['label'] == label]
    label_name = "SPAM" if label == 1 else "HAM"
    
    print(f"\n{'üö®' if label == 1 else '‚úÖ'} {label_name} (label={label}): {len(subset):,} messages")
    print("-" * 40)
    print(f"   Avg tokens:           {subset['token_count'].mean():.1f}")
    print(f"   Avg leet chars:       {subset['leet_count'].mean():.2f}")
    print(f"   Avg leet words:       {subset['leet_word_count'].mean():.2f}")
    print(f"   Avg leet density:     {subset['leet_density'].mean():.4f}")
    print(f"   Avg separators:       {subset['separator_count'].mean():.2f}")
    print(f"   Avg whitelist count:  {subset['whitelist_count'].mean():.2f}")
    print(f"   Avg tokens to check:  {subset['tokens_to_check_count'].mean():.2f}")
    
    # Whitelist ratio
    total = subset['token_count'].sum()
    filtered = subset['whitelist_count'].sum()
    print(f"   Whitelist ratio:      {filtered/total*100:.1f}%")


üìä COMPARISON BY LABEL:

üö® SPAM (label=1): 278 messages
----------------------------------------
   Avg tokens:           40.0
   Avg leet chars:       0.06
   Avg leet words:       0.06
   Avg leet density:     0.0005
   Avg separators:       7.21
   Avg whitelist count:  5.27
   Avg tokens to check:  34.75
   Whitelist ratio:      13.2%

‚úÖ HAM (label=0): 2,325 messages
----------------------------------------
   Avg tokens:           44.6
   Avg leet chars:       0.00
   Avg leet words:       0.00
   Avg leet density:     0.0000
   Avg separators:       9.62
   Avg whitelist count:  8.31
   Avg tokens to check:  36.32
   Whitelist ratio:      18.6%


In [15]:
# ============================================================
# SAMPLE RESULTS - FULL PIPELINE
# ============================================================

import json

print("\nüìã SAMPLE RESULTS (Layer 1 ‚Üí Layer 2 ‚Üí Layer 3):")
print("=" * 80)

# Show samples with interesting whitelist filtering
samples = result_df[result_df['whitelist_count'] > 0].head(5)

for _, row in samples.iterrows():
    print(f"\n[{row['index']}] Label: {'SPAM' if row['label']==1 else 'HAM'}")
    print(f"   Original:    {row['original_content'][:70]}...")
    print(f"   L1 Masked:   {row['layer1_masked'][:70]}...")
    print(f"   L2 Tokens:   {row['layer2_tokens'][:70]}...")
    print(f"   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")
    print(f"   üîç Whitelisted ({row['whitelist_count']}): {row['whitelisted_tokens'][:60]}...")
    print(f"   ‚úèÔ∏è  To check ({row['tokens_to_check_count']}):   {row['tokens_to_check'][:60]}...")
    
    # Show leet information if available
    if row['leet_count'] > 0:
        print(f"   üî§ Leet Info: {row['leet_count']} chars, {row['leet_word_count']} words, density: {row['leet_density']:.4f}")
        if row['leet_words'] and row['leet_words'] != "[]":
            try:
                leet_words = json.loads(row['leet_words'])
                if leet_words:
                    print(f"      Leet words: {', '.join([f"\'{w.get('original', '')}\'‚Üí\'{w.get('decoded', '')}\'" for w in leet_words[:3]])}")
            except:
                pass
    print("-" * 80)


üìã SAMPLE RESULTS (Layer 1 ‚Üí Layer 2 ‚Üí Layer 3):

[0] Label: SPAM
   Original:    [TRUNG T√ÇM PH√íNG CH·ªêNG GIAN L·∫¨N NG√ÇN H√ÄNG] √îng/B√† Nguy·ªÖn VƒÉn Minh Tr∆∞·ªõ...
   L1 Masked:   [TRUNG T√ÇM PH√íNG CH·ªêNG GIAN L·∫¨N NG√ÇN H√ÄNG] √îng/B√† Nguy·ªÖn VƒÉn Minh Tr∆∞·ªõ...
   L2 Tokens:   ['trung', 't√¢m', 'ph√≤ng', 'ch·ªëng', 'gian', 'l·∫≠n', 'ng√¢n', 'h√†ng', '√¥ng...
   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
   üîç Whitelisted (5): ['<TIME>', '<MONEY>', 'tp', 'hcm', '<MONEY>']...
   ‚úèÔ∏è  To check (83):   ['trung', 't√¢m', 'ph√≤ng', 'ch·ªëng', 'gian', 'l·∫≠n', 'ng√¢n', 'h...
--------------------------------------------------------------------------------

[1] Label: SPAM
   Original:    [TB] Tien ich Loi nhan thoai cua Viettel: Quy khach co loi nhan tu TB ...
   L1 Masked:   [TB] Tien ich Loi nhan thoai cua Viettel: Quy khach co loi nhan tu TB ...
   L2 Tokens:   ['tb', 'tien', 'ich', 

In [16]:
# ============================================================
# VISUAL: SINGLE MESSAGE FLOW
# ============================================================

def show_pipeline_flow(text, masker, normalizer, whitelist_filter):
    """Hi·ªÉn th·ªã chi ti·∫øt flow x·ª≠ l√Ω 1 message"""
    print("=" * 70)
    print("üì® ORIGINAL INPUT:")
    print(f"   {text}")
    
    # Layer 1
    masked, meta = masker.mask(text)
    print("\nüîí LAYER 1 - MASKING:")
    print(f"   {masked}")
    print(f"   Entities: {meta}")
    
    # Layer 2
    norm = normalizer.normalize(masked)
    print("\nüî§ LAYER 2 - NORMALIZATION:")
    print(f"   Text: {norm.normalized_text}")
    print(f"   Tokens: {norm.tokens}")
    print(f"   Leet: {norm.leet_count}, Sep: {norm.separator_count}")
    
    # Layer 3
    result = whitelist_filter.filter(norm.tokens)
    print("\nüìã LAYER 3 - WHITELIST FILTERING:")
    print(f"   ‚úÖ Whitelisted ({result.whitelist_count}): {result.whitelisted_tokens}")
    print(f"   ‚úèÔ∏è  To check ({len(result.tokens_to_check)}):    {result.tokens_to_check}")
    print("=" * 70)

# Test v·ªõi m·ªôt s·ªë samples
test_messages = [
    "VCB: T√†i kho·∫£n c·ªßa b·∫°n ƒë√£ ƒë∆∞·ª£c k√≠ch ho·∫°t OTP. Truy c·∫≠p https://vcb-fake.com",
    "Th0ng ba0: BIDV nang cap he thong. Vui l0ng dang nhap ngay!",
    "Ch√†o b·∫°n, d·∫°o n√†y kh·ªèe kh√¥ng? L√¢u r·ªìi kh√¥ng g·∫∑p.",
]

for msg in test_messages:
    show_pipeline_flow(msg, masker, normalizer, whitelist_filter)
    print("\n")

üì® ORIGINAL INPUT:
   VCB: T√†i kho·∫£n c·ªßa b·∫°n ƒë√£ ƒë∆∞·ª£c k√≠ch ho·∫°t OTP. Truy c·∫≠p https://vcb-fake.com

üîí LAYER 1 - MASKING:
   VCB: T√†i kho·∫£n c·ªßa b·∫°n ƒë√£ ƒë∆∞·ª£c k√≠ch ho·∫°t OTP. Truy c·∫≠p <URL>
   Entities: {'url': ['https://vcb-fake.com']}

üî§ LAYER 2 - NORMALIZATION:
   Text: vcb t√†i kho·∫£n c·ªßa b·∫°n ƒë√£ ƒë∆∞·ª£c k√≠ch ho·∫°t otp truy c·∫≠p <URL>
   Tokens: ['vcb', 't√†i', 'kho·∫£n', 'c·ªßa', 'b·∫°n', 'ƒë√£', 'ƒë∆∞·ª£c', 'k√≠ch', 'ho·∫°t', 'otp', 'truy', 'c·∫≠p', '<URL>']
   Leet: 0, Sep: 2

üìã LAYER 3 - WHITELIST FILTERING:
   ‚úÖ Whitelisted (3): ['vcb', 'otp', '<URL>']
   ‚úèÔ∏è  To check (10):    ['t√†i', 'kho·∫£n', 'c·ªßa', 'b·∫°n', 'ƒë√£', 'ƒë∆∞·ª£c', 'k√≠ch', 'ho·∫°t', 'truy', 'c·∫≠p']


üì® ORIGINAL INPUT:
   Th0ng ba0: BIDV nang cap he thong. Vui l0ng dang nhap ngay!

üîí LAYER 1 - MASKING:
   Th0ng ba0: BIDV nang cap he thong. Vui l0ng dang nhap ngay!
   Entities: {}

üî§ LAYER 2 - NORMALIZATION:
   Text: thong bao bidv nang cap he