In [1]:
# %%
# Step 1: Import libraries
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from dotenv import load_dotenv

# %%
# Step 2: Load environment variables
load_dotenv()

pg_user = os.environ['PG_USER']
pg_password = os.environ['PG_PASSWORD']
pg_host = os.environ['PG_HOST']
pg_port = os.environ['PG_PORT']
pg_db = os.environ['PG_DB']

# %%

In [2]:
# Step 3: Create engine
from sqlalchemy import create_engine

engine = create_engine(
    f'postgresql+psycopg2://{pg_user}:{pg_password}@{pg_host}:{pg_port}/{pg_db}'
)


In [3]:
# Step 4: Test a simple read_sql
import pandas as pd

query = '''
SELECT * FROM raw.fact_phishing_urls
LIMIT 5
'''

df_test = pd.read_sql(query, con=engine)
df_test


Unnamed: 0,url,in_database,phishing_status,timestamp,domain_id,feature_id
0,https://dubaipolice.govau.live/,True,Phishing (Verified),2025-04-22 20:07:24,189,
1,https://dubaipolice.govar.live/,True,Phishing (Verified),2025-04-22 20:07:25,47,
2,https://ezpass.com-ezvj.vip/,True,Phishing (Verified),2025-04-22 19:41:23,50,
3,https://fastrak.org-etcmq.vip/,True,Phishing (Verified),2025-04-22 19:45:28,89,
4,https://rb.gy/cf5k2w,True,Not Phishing,2025-04-22 11:15:39,148,


In [5]:
query1 = '''
SELECT 
    d.domain_id,
    COUNT(f.url) AS phishing_url_count
FROM raw.fact_phishing_urls f
JOIN raw.dim_domains d
    ON f.domain_id = d.domain_id
WHERE f.phishing_status = 'Phishing (Verified)'
GROUP BY d.domain_id
ORDER BY phishing_url_count DESC
LIMIT 3
'''

df_q1 = pd.read_sql(query1, con=engine)
df_q1


Unnamed: 0,domain_id,phishing_url_count
0,125,4
1,45,2
2,221,2


In [6]:
query1 = '''
SELECT 
    d.domain_name,
    COUNT(*) AS phishing_url_count
FROM raw.fact_phishing_urls f
JOIN raw.dim_domains d
    ON f.domain_id = d.domain_id
WHERE f.phishing_status = 'Phishing (Verified)'
GROUP BY d.domain_name
ORDER BY phishing_url_count DESC
LIMIT 10

'''

df_q1 = pd.read_sql(query1, con=engine)
df_q1


Unnamed: 0,domain_name,phishing_url_count
0,pracharpath.com,4
1,cxcd.de,2
2,assinatura-mensal.com,2
3,dubaipolice.gov-tollbilln.life,1
4,dubaipolice.gover.live,1
5,e-zpassny.com-tiznqmlq.world,1
6,ezpassnh.com-tollbillnhyd.world,1
7,myrogersscommcable.weebly.com,1
8,ezpass.com-bhv.win,1
9,northwestelwebmaill.weebly.com,1


In [7]:
query1 = '''

WITH feature_summary AS (
    SELECT 
        f.phishing_status,
        u.uses_https,
        u.has_ip_address,
        COUNT(*) AS url_count
    FROM raw.fact_phishing_urls f
    JOIN raw.dim_url_features u
        ON f.feature_id = u.feature_id
    GROUP BY f.phishing_status, u.uses_https, u.has_ip_address
),
ranked_summary AS (
    SELECT *,
        RANK() OVER (
            PARTITION BY phishing_status 
            ORDER BY url_count DESC
        ) AS rank_within_status
    FROM feature_summary
)
SELECT * 
FROM ranked_summary
WHERE rank_within_status <= 3

'''

df_q1 = pd.read_sql(query1, con=engine)
df_q1


Unnamed: 0,phishing_status,uses_https,has_ip_address,url_count,rank_within_status
0,Phishing (Verified),True,False,57,1


### Insight
The most common feature combination among verified phishing URLs (57 instances) includes the use of HTTPS and the absence of an IP address. This indicates that phishing websites often adopt HTTPS to appear more legitimate and trustworthy. Also, the most common verified phishing URLs (57 instances) do **not** contain an IP address. This suggests that attackers are favoring domain-based URLs over raw IP addresses, likely to make the links appear more trustworthy and avoid detection.

### Recommendation
Security awareness training should emphasize that HTTPS does not guarantee a site's legitimacy. Detection tools and user training should not rely solely on spotting IP addresses in URLs as phishing indicators.

### Prediction
As phishing techniques evolve, we can expect an increasing number of phishing sites to use HTTPS by default, reducing the reliability of HTTPS as a standalone trust signal. The trend of using domain-based phishing URLs will continue to rise, as attackers mimic legitimate domain structures to evade user suspicion and automated filters.
