## Test 1 - Masking Layer

In [13]:
import test_layer1

In [14]:
# Ch·∫°y to√†n b·ªô tests c·ªßa test_layer1.py b·∫±ng pytest
import pytest

result = pytest.main(["test_layer1.py", "-v"])
print("K·∫øt qu·∫£ pytest:", result)


platform win32 -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- c:\Python313\python.exe
cachedir: .pytest_cache
rootdir: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests
plugins: anyio-4.12.0
[1mcollecting ... [0mcollected 39 items

test_layer1.py::TestURLMasking::test_standard_url_with_https [32mPASSED[0m[32m      [  2%][0m
test_layer1.py::TestURLMasking::test_standard_url_with_www [32mPASSED[0m[32m        [  5%][0m
test_layer1.py::TestURLMasking::test_url_shortener_bitly [32mPASSED[0m[32m          [  7%][0m
test_layer1.py::TestURLMasking::test_aggressive_url_with_spaces [32mPASSED[0m[32m   [ 10%][0m
test_layer1.py::TestURLMasking::test_spam_tld_icu [32mPASSED[0m[32m                 [ 12%][0m
test_layer1.py::TestURLMasking::test_spam_tld_vip [32mPASSED[0m[32m                 [ 15%][0m
test_layer1.py::TestURLMasking::test_multiple_urls [32mPASSED[0m[32m                [ 17%][0m
test_layer1.py::TestZaloTelegramMasking::test_zalo_link [32mPASSED[

In [15]:
import pandas as pd
import sys
from pathlib import Path

# Setup paths
ROOT_DIR = Path.cwd().parent.parent.parent  # IE403_DoAnCuoiKy/
sys.path.insert(0, str(ROOT_DIR))

from Smishing.misspell_detection.layer1_masking import AggressiveMasker
from Smishing.data_loader import load_dataset

# Load dataset
DATA_PATH = ROOT_DIR / "data" / "dataset.csv"
OUTPUT_PATH = Path.cwd() / "layer1_masking_results.csv"

print(f"üìÇ Loading dataset from: {DATA_PATH}")
df = load_dataset(DATA_PATH)
print(f"‚úÖ Loaded {len(df):,} rows")

# Initialize masker
masker = AggressiveMasker()

# Process all rows
print(f"\nüîÑ Processing {len(df):,} rows...")
results = []

for idx, row in df.iterrows():
    content = str(row.get("content", ""))
    label = row.get("label", "")
    
    try:
        masked_text, metadata = masker.mask(content)
        counts = masker.get_entity_counts(metadata)
    except Exception as e:
        masked_text = f"ERROR: {e}"
        metadata = {}
        counts = {}
    
    result = {
        "index": idx,
        "label": label,
        "original_content": content,
        "masked_content": masked_text,
        "url_count": counts.get("url", 0) + counts.get("zalo", 0) + counts.get("telegram", 0),
        "phone_count": counts.get("hotline", 0) + counts.get("landline", 0) + 
                      counts.get("mobile", 0) + counts.get("shortcode", 0),
        "money_count": counts.get("money", 0),
        "code_count": counts.get("code", 0),
        "email_count": counts.get("email", 0),
        "datetime_count": counts.get("datetime", 0),
        "raw_metadata": str(metadata),
    }
    results.append(result)
    
    if (idx + 1) % 500 == 0:
        print(f"   Processed {idx + 1:,} / {len(df):,} rows...")

# Save results
result_df = pd.DataFrame(results)
result_df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")

print(f"\n‚úÖ Results saved to: {OUTPUT_PATH}")
print(f"   Total rows: {len(result_df):,}")

üìÇ Loading dataset from: c:\IE403\IE403_DoAnCuoiKy\data\dataset.csv
‚úÖ Loaded 2,603 rows from dataset.csv (standard parser)
‚úÖ Loaded 2,603 rows

üîÑ Processing 2,603 rows...
   Processed 500 / 2,603 rows...
   Processed 1,000 / 2,603 rows...
   Processed 1,500 / 2,603 rows...
   Processed 2,000 / 2,603 rows...
   Processed 2,500 / 2,603 rows...

‚úÖ Results saved to: c:\IE403\IE403_DoAnCuoiKy\Smishing\misspell_detection\tests\layer1_masking_results.csv
   Total rows: 2,603


In [16]:
# Summary statistics
print("\nüìä SUMMARY STATISTICS:")
print("-" * 50)
print(f"   URLs detected:      {result_df['url_count'].sum():,}")
print(f"   Phones detected:    {result_df['phone_count'].sum():,}")
print(f"   Money detected:     {result_df['money_count'].sum():,}")
print(f"   Codes detected:     {result_df['code_count'].sum():,}")
print(f"   Emails detected:    {result_df['email_count'].sum():,}")
print(f"   DateTimes detected: {result_df['datetime_count'].sum():,}")

# Rows with at least one entity
has_entity = result_df[['url_count', 'phone_count', 'money_count', 'code_count']].sum(axis=1) > 0
print(f"\n   Rows with entities: {has_entity.sum():,} / {len(result_df):,} ({has_entity.sum()/len(result_df)*100:.1f}%)")


üìä SUMMARY STATISTICS:
--------------------------------------------------
   URLs detected:      1,395
   Phones detected:    2,151
   Money detected:     2,706
   Codes detected:     1,925
   Emails detected:    7
   DateTimes detected: 2,742

   Rows with entities: 2,311 / 2,603 (88.8%)


In [17]:
# Show sample results
print("\nüìã SAMPLE RESULTS (first 10 rows with changes):")
print("=" * 80)

# Filter rows that have changes
changed_rows = result_df[result_df['original_content'] != result_df['masked_content']]

for _, row in changed_rows.head(10).iterrows():
    print(f"\n[{row['index']}] Label: {row['label']}")
    print(f"   Original: {row['original_content'][:100]}...")
    print(f"   Masked:   {row['masked_content'][:100]}...")
    print(f"   Counts:   URL={row['url_count']}, Phone={row['phone_count']}, Money={row['money_count']}, Code={row['code_count']}")


üìã SAMPLE RESULTS (first 10 rows with changes):

[0] Label: 1
   Original: [TRUNG T√ÇM PH√íNG CH·ªêNG GIAN L·∫¨N NG√ÇN H√ÄNG] √îng/B√† Nguy·ªÖn VƒÉn Minh Tr∆∞·ªõc 17h ng√†y h√¥m nay kh√¥ng thanh...
   Masked:   [TRUNG T√ÇM PH√íNG CH·ªêNG GIAN L·∫¨N NG√ÇN H√ÄNG] √îng/B√† Nguy·ªÖn VƒÉn Minh Tr∆∞·ªõc <TIME> ng√†y h√¥m nay kh√¥ng th...
   Counts:   URL=0, Phone=0, Money=2, Code=0

[1] Label: 1
   Original: [TB] Tien ich Loi nhan thoai cua Viettel: Quy khach co loi nhan tu TB 0848836182 vao luc 08:09 27/03...
   Masked:   [TB] Tien ich Loi nhan thoai cua Viettel: Quy khach co loi nhan tu TB <PHONE> vao luc <TIME> <TIME>....
   Counts:   URL=0, Phone=2, Money=1, Code=0

[2] Label: 1
   Original: Western Union TB: Vietcombank: 0071000986547. Tr·∫ßn Th·ªã Lan. Ref +19.56 USD. Nh·∫≠n 500.000 VND. Ngay 0...
   Masked:   Western Union TB: Vietcombank: 0071000986547. Tr·∫ßn Th·ªã Lan. Ref +<MONEY>. Nh·∫≠n <MONEY>. Ngay <TIME>....
   Counts:   URL=1, Phone=0, Money=2, Code=0

[3] Label: 1
   Ori

In [18]:
import re

# Test cleanup pattern
def test_cleanup():
    test_cases = [
        "www.<URL>",
        "m.<URL>",
        "cdn.static.<URL>",
        "vao www.<URL> de",
        "bam m.<URL> lay",
    ]
    
    cleanup_pattern = r'(?i)[a-z0-9-]+[\s\u00A0]*\.[\s\u00A0]*(<URL>)'
    
    for text in test_cases:
        original = text
        prev = None
        while prev != text:
            prev = text
            text = re.sub(cleanup_pattern, r'\1', text)
        print(f"'{original}' -> '{text}'")

test_cleanup()

'www.<URL>' -> '<URL>'
'm.<URL>' -> '<URL>'
'cdn.static.<URL>' -> '<URL>'
'vao www.<URL> de' -> 'vao <URL> de'
'bam m.<URL> lay' -> 'bam <URL> lay'
