In [2]:
# %%
# Step 1: Import libraries
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from dotenv import load_dotenv

# %%
# Step 2: Load environment variables
load_dotenv()

pg_user = os.environ['PG_USER']
pg_password = os.environ['PG_PASSWORD']
pg_host = os.environ['PG_HOST']
pg_port = os.environ['PG_PORT']
pg_db = os.environ['PG_DB']

# %%

In [3]:
# Step 3: Create engine
from sqlalchemy import create_engine

engine = create_engine(
    f'postgresql+psycopg2://{pg_user}:{pg_password}@{pg_host}:{pg_port}/{pg_db}'
)


## Diagnostic = focuses on pattern across length groups → informs model training logic.

### Business Question
What is the most frequently observed URL length category—_Short_, _Medium_, or _Long_—among **verified phishing URLs**, and how might this pattern inform anti-phishing detection strategies?

In [4]:
query1 = '''
WITH length_buckets AS (
    SELECT 
        CASE 
            WHEN u.url_length < 30 THEN 'Short'
            WHEN u.url_length BETWEEN 30 AND 60 THEN 'Medium'
            ELSE 'Long'
        END AS length_category,
        f.phishing_status,
        COUNT(*) AS url_count
    FROM raw.fact_phishing_urls f
    JOIN raw.dim_url_features u
        ON f.feature_id = u.feature_id
    GROUP BY length_category, f.phishing_status
),
ranked_lengths AS (
    SELECT *,
           RANK() OVER (PARTITION BY phishing_status ORDER BY url_count DESC) AS rank_within_status
    FROM length_buckets
)
SELECT *
FROM ranked_lengths
WHERE phishing_status = 'Phishing (Verified)'
ORDER BY url_count DESC;

'''

df_q1 = pd.read_sql(query1, con=engine)
df_q1


Unnamed: 0,length_category,phishing_status,url_count,rank_within_status
0,Medium,Phishing (Verified),47,1
1,Short,Phishing (Verified),10,2


### Insight
Phishers disproportionately use medium-length URLs (30–60 characters), while avoiding short and long formats.

### Recommendation
Incorporate URL length category as a feature in phishing detection models or machine learning classifiers.

### Prediction
Enhancing models with categorical length data will improve prediction accuracy and reduce overlooked phishing attempts.


## Descriptive = focuses on specific length (32) → informs heuristics.

### Business Question
What are the most common URL lengths among verified phishing websites?

In [5]:
query2 = '''
SELECT
    u.url_length,
    COUNT(*) AS url_count
FROM raw.fact_phishing_urls f
JOIN raw.dim_url_features u 
    ON f.feature_id = u.feature_id
WHERE f.phishing_status = 'Phishing (Verified)'
GROUP BY u.url_length
ORDER BY url_count DESC
LIMIT 7;
'''

df_q2 = pd.read_sql(query2, con=engine)
df_q2


Unnamed: 0,url_length,url_count
0,32,7
1,33,5
2,31,5
3,34,5
4,30,5
5,39,4
6,37,3


### Insight
Most verified phishing URLs are 32 characters long, making it the most common individual URL length among observed phishing data.

### Recommendation
Prioritize monitoring URLs around 32 characters in length when designing detection heuristics.

### Prediction
If length-specific filters are used, phishing link recognition will improve for common formats used by attackers.