### **Understanding the Columns**

In [None]:
# Pull a handful of URLs so the extractable features can be eyeballed by hand
section_rule = "=" * 60
print(f"\n{section_rule}")
print("SAMPLE URL INSPECTION")
print(section_rule)

# First three rows of each class (label 0 is treated as phishing, 1 as legitimate)
is_phish = df["label"].eq(0)
is_legit = df["label"].eq(1)
phish_samples = df.loc[is_phish].head(3)
legit_samples = df.loc[is_legit].head(3)


def inspect_url(row):
    """Print a readable report of one dataset row.

    The report lists the URL, its class label, the URL-derived columns
    (with the stored ``URLLength`` compared against ``len(URL)``), and a few
    page-content columns.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by column name.

    Returns:
        None. The report is written to stdout.
    """
    address = row["URL"]
    print(f"\nURL: {address}")
    verdict = "PHISH" if row["label"] == 0 else "LEGIT"
    print(f"Label: {verdict}")
    print("-" * 60)

    # Stored length versus the length measured from the string itself
    recorded_len = row["URLLength"]
    print(f"  URLLength (dataset): {recorded_len}")
    measured_len = len(address)
    print(f"  url_len (computed):  {measured_len}")
    print(f"  Match: {recorded_len == measured_len}")

    # (caption, column) pairs; a leading newline in the caption opens a new group
    url_fields = (
        ("\n  DomainLength (dataset)", "DomainLength"),
        ("  Domain (dataset)", "Domain"),
        ("\n  NoOfSubDomain (dataset)", "NoOfSubDomain"),
        ("  TLD", "TLD"),
        ("  TLDLength", "TLDLength"),
        ("\n  IsHTTPS", "IsHTTPS"),
        ("  IsDomainIP", "IsDomainIP"),
    )
    for caption, column in url_fields:
        print(f"{caption}: {row[column]}")

    # (column, format spec) pairs; only the ratio is rounded
    print("\n  Character features:")
    character_fields = (
        ("NoOfLettersInURL", ""),
        ("NoOfDegitsInURL", ""),
        ("DegitRatioInURL", ".3f"),
        ("NoOfOtherSpecialCharsInURL", ""),
    )
    for column, spec in character_fields:
        print(f"    {column}: {format(row[column], spec)}")

    # Columns that describe the fetched page rather than the URL string
    print("\n  Page-content features (require fetch):")
    for column in ("LineOfCode", "HasTitle", "NoOfImage"):
        print(f"    {column}: {row[column]}")


# Each entry: the heading lines to print, then the sample frame to walk through
sample_groups = (
    (("\n🎣 PHISHING SAMPLES:",), phish_samples),
    (("\n" + "=" * 60, "\n LEGITIMATE SAMPLES:"), legit_samples),
)

for headings, samples in sample_groups:
    for heading in headings:
        print(heading)
    for _, sample in samples.iterrows():
        inspect_url(sample)


- `URLLength` doesn't consistently match the actual character count of the URL (`len(URL)`).

In [None]:
# Investigate why the dataset's URLLength column disagrees with len(URL).
print("\n" + "=" * 60)
print("URL LENGTH DISCREPANCY INVESTIGATION")
print("=" * 60)

# Spot-check the leading rows. head() simply returns fewer rows on a short
# frame, whereas indexing df.iloc[i] over a fixed range(20) raises IndexError.
preview = df.head(20)
mismatches = []

for idx, (url, dataset_len) in enumerate(zip(preview["URL"], preview["URLLength"])):
    computed_len = len(url)
    diff = computed_len - dataset_len  # positive: the real URL is longer than recorded

    if diff != 0:
        mismatches.append(
            {
                "idx": idx,  # positional row number within the preview
                "url": url,
                "dataset_len": dataset_len,
                "computed_len": computed_len,
                "diff": diff,
            }
        )

print(f"\nFound {len(mismatches)} mismatches in first {len(preview)} rows\n")

for m in mismatches[:5]:  # Show first 5
    print(f"[{m['idx']}] Diff: {m['diff']}")
    print(f"  URL: {m['url']}")
    print(f"  Dataset: {m['dataset_len']} | Computed: {m['computed_len']}")
    print()

# Full dataset check. These helper columns are added to df because later
# cells read "computed_url_len" and "length_diff".
df["computed_url_len"] = df["URL"].str.len()  # vectorized; missing URLs become NaN
df["length_diff"] = df["computed_url_len"] - df["URLLength"]

print("=" * 60)
print("FULL DATASET SUMMARY")
print("=" * 60)
print("\nDifference distribution:")
# Sorted by the difference value, so this shows the 10 smallest differences.
print(df["length_diff"].value_counts().sort_index().head(10))

print(f"\nPercentage exact match: {(df['length_diff'] == 0).mean() * 100:.2f}%")
print(f"Percentage off by +1: {(df['length_diff'] == 1).mean() * 100:.2f}%")
print(f"Percentage off by -1: {(df['length_diff'] == -1).mean() * 100:.2f}%")
print(f"Percentage off by 2+: {(df['length_diff'].abs() >= 2).mean() * 100:.2f}%")

# Check if it's consistent per protocol
print("\n" + "=" * 60)
print("BY PROTOCOL")
print("=" * 60)
# expand=False yields a Series (not a one-column DataFrame) for the single group.
df["protocol"] = df["URL"].str.extract(r"^(https?://)", expand=False)
# URLs that do not start with a lowercase http(s):// prefix have a NaN protocol
# and would be dropped by groupby; give them an explicit bucket for the report.
protocol_key = df["protocol"].fillna("(other)")
print(df.groupby(protocol_key)["length_diff"].value_counts())


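- Analyze which URLs match `URLLength` exactly and which are off by one, and look for a pattern that explains the difference.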

In [None]:
# Compare URLs whose stored length is exact with those that are off by one.
# The share in the header is computed from the data instead of being hard-coded.
exact_match_pct = (df["length_diff"] == 0).mean() * 100

print("\n" + "=" * 60)
print(f"ANALYZING THE {exact_match_pct:.0f}% THAT MATCH EXACTLY")
print("=" * 60)

# Get samples of exact matches
exact_matches = df[df["length_diff"] == 0].head(10)
off_by_one = df[df["length_diff"] == 1].head(10)

print("\n✅ EXACT MATCHES (Dataset URLLength = Computed):")
for url in exact_matches["URL"]:
    print(f"  {url} (len={len(url)})")

print("\n❌ OFF BY ONE (Dataset URLLength = Computed - 1):")
for url in off_by_one["URL"]:
    print(f"  {url} (len={len(url)})")

# Look for patterns
print("\n" + "=" * 60)
print("PATTERN ANALYSIS")
print("=" * 60)

# Check if trailing slashes matter
df["has_trailing_slash"] = df["URL"].str.endswith("/")
print("\nTrailing slash distribution:")
print(df.groupby("has_trailing_slash")["length_diff"].value_counts())

# Check if query strings matter (literal "?" search, no regex needed)
df["has_query"] = df["URL"].str.contains("?", regex=False)
print("\nQuery string distribution:")
print(df.groupby("has_query")["length_diff"].value_counts())

# Check URL length ranges. The last bin is open-ended so that very long URLs
# are bucketed instead of silently becoming NaN and dropping out of the report.
print("\nLength difference by URL length bucket:")
df["url_len_bucket"] = pd.cut(
    df["computed_url_len"], bins=[0, 30, 50, 100, 200, 500, float("inf")]
)
# observed=True: report only buckets that occur, and state the categorical
# grouping behaviour explicitly rather than relying on the pandas default.
print(
    df.groupby("url_len_bucket", observed=True)["length_diff"]
    .value_counts()
    .head(20)
)


In [None]:
print("\n" + "=" * 60)
print("VERIFY OTHER ENGINEERED FEATURES")
print("=" * 60)


def _digit_ratio(url):
    """Return the share of characters in ``url`` for which ``str.isdigit`` is true.

    Empty or non-string input (e.g. a missing value) yields 0, mirroring the
    guard used by ``compute_subdomains`` in this cell.
    """
    if not isinstance(url, str) or not url:
        return 0
    return sum(c.isdigit() for c in url) / len(url)


# Check if url_digit_ratio matches DegitRatioInURL
# First, compute it fresh
df["computed_digit_ratio"] = df["URL"].apply(_digit_ratio)

# Compare
df["digit_ratio_diff"] = (df["computed_digit_ratio"] - df["DegitRatioInURL"]).abs()

# "Exact" is judged with a small tolerance because both sides are floats.
ratio_exact = df["digit_ratio_diff"] < 0.001
ratio_close = df["digit_ratio_diff"] < 0.01

print("\nDigit Ratio Comparison:")
print(f"  Exact matches: {ratio_exact.sum():,} ({ratio_exact.mean() * 100:.2f}%)")
print(f"  Close (< 0.01): {ratio_close.sum():,} ({ratio_close.mean() * 100:.2f}%)")
print(f"  Max difference: {df['digit_ratio_diff'].max():.6f}")

# Sample of matches. head(5) copes with frames shorter than five rows,
# unlike positional df.iloc[i] over a fixed range(5).
print("\nSample digit ratio comparison:")
for _, row in df.head(5).iterrows():
    print(f"  URL: {row['URL'][:50]}...")
    print(
        f"    Dataset: {row['DegitRatioInURL']:.4f} | Computed: {row['computed_digit_ratio']:.4f}"
    )


# Check subdomain count
# Extract from URL
def compute_subdomains(url):
    """Estimate the number of subdomain labels in the host part of ``url``.

    The host is isolated by dropping an optional ``scheme://`` prefix, cutting
    at the first ``/``, ``?`` or ``#``, discarding any ``userinfo@`` part and a
    trailing root dot. The estimate is ``dots_in_host - 1`` floored at zero,
    i.e. it assumes a single-label TLD (``www.example.co.uk`` yields 2).

    Args:
        url: URL string, with or without a scheme.

    Returns:
        int: Estimated subdomain count; 0 for empty or non-string input.
    """
    if not isinstance(url, str) or not url:
        return 0

    prefix, sep, remainder = url.partition("://")
    # Treat "://" as the scheme separator only when no path/query/fragment
    # delimiter precedes it; otherwise it is part of e.g. "a.com/?u=http://b.c".
    if sep and not any(ch in prefix for ch in "/?#"):
        authority = remainder
    else:
        authority = url

    # The authority ends at the first path, query or fragment delimiter, so dots
    # in "?v=1.2.3" or "#sec.1" are not counted when the URL has no path.
    for delimiter in "/?#":
        authority = authority.split(delimiter, 1)[0]

    host = authority.rsplit("@", 1)[-1]  # drop "user:password@" credentials
    host = host.rstrip(".")  # a trailing root dot is not an extra label
    return max(0, host.count(".") - 1)


# Apply the URL-based estimate to every row and compare with the dataset column.
df["computed_subdomains"] = df["URL"].apply(compute_subdomains)
df["subdomain_diff"] = (df["computed_subdomains"] - df["NoOfSubDomain"]).abs()

# Build each mask once and reuse it for both the count and the percentage.
subdomain_exact = df["subdomain_diff"] == 0
subdomain_off_by_one = df["subdomain_diff"] == 1

print("\n" + "=" * 60)
print("Subdomain Count Comparison:")
print(
    f"  Exact matches: {subdomain_exact.sum():,} ({subdomain_exact.mean() * 100:.2f}%)"
)
print(
    f"  Off by 1: {subdomain_off_by_one.sum():,} ({subdomain_off_by_one.mean() * 100:.2f}%)"
)
print(f"  Max difference: {df['subdomain_diff'].max()}")

# Sample. head(5) copes with frames shorter than five rows, unlike positional
# df.iloc[i] over a fixed range(5), which raises IndexError.
print("\nSample subdomain comparison:")
for _, row in df.head(5).iterrows():
    print(f"  URL: {row['URL'][:50]}...")
    print(
        f"    Dataset: {row['NoOfSubDomain']} | Computed: {row['computed_subdomains']}"
    )


### **Remove Duplicates**

In [None]:
dedup_rule = "=" * 60
print(f"\n{dedup_rule}")
print("REMOVING DUPLICATE URLs")
print(dedup_rule)

# Snapshot of the frame before any rows are dropped
rows_before = len(df)
print("\nBefore deduplication:")
print(f"  Total rows: {rows_before:,}")
print(f"  Unique URLs: {df['URL'].nunique():,}")

# For every repeated URL only the first row is retained; the index is then renumbered
df_deduped = df.drop_duplicates(subset=["URL"], keep="first")
df_deduped = df_deduped.reset_index(drop=True)

rows_after = len(df_deduped)
print("\nAfter deduplication:")
print(f"  Total rows: {rows_after:,}")
print(f"  Unique URLs: {df_deduped['URL'].nunique():,}")
print(f"  Removed: {rows_before - rows_after:,} duplicate rows")

# Show the class balance on both sides so any shift can be judged by eye
print("\nLabel distribution before:")
print(df["label"].value_counts())
print("\nLabel distribution after:")
print(df_deduped["label"].value_counts())

# From here on `df` refers to an independent copy of the deduplicated frame
df = df_deduped.copy()

print("\n✓ Working with deduplicated dataset from now on")
