In [1]:
import shutil

import kagglehub
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_path = "data/phishing_site_urls_cleaned.csv"

if not os.path.exists(dataset_path):
    downloaded_dataset_path = kagglehub.dataset_download("taruntiwarihp/phishing-site-urls")

    shutil.copytree(downloaded_dataset_path, "data", dirs_exist_ok=True)

    print("Dataset downloaded successfully")

In [3]:
df = pd.read_csv(dataset_path)

# rename every column to lowercase
df.columns = df.columns.str.lower()

df.rename(columns={"label": "is_phishing"}, inplace=True)
df.head()

Unnamed: 0,url,is_phishing,has_ip,url_length,has_at,dot_count,num_digits,domain_length,num_dots,num_hyphens,num_underscores,num_slashes,num_spaces
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1,False,225,False,6,58,15.0,6,4,4,10,0
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,1,False,81,False,5,1,9.0,5,2,1,4,0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1,False,177,False,7,47,4.0,7,1,0,11,0
3,mail.printakid.com/www.online.americanexpress....,1,False,60,False,6,0,10.0,6,0,0,2,0
4,thewhiskeydregs.com/wp-content/themes/widescre...,1,False,116,False,1,21,6.0,1,1,0,10,0


In [4]:
suspicious_words = [
    # Authentication & Account
    'login', 'signin', 'signup', 'register', 'password', 'credential', 'verification', 'authenticate',
    'account', 'profile', 'settings', 'recover', 'reset', 'confirm', 'validation',

    # Banking & Finance
    'bank', 'secure', 'wallet', 'payment', 'pay', 'billing', 'invoice', 'paypal', 'stripe', 'visa',
    'mastercard', 'crypto', 'bitcoin', 'finance', 'transfer', 'withdraw', 'deposit',

    # Urgency & Action
    'update', 'required', 'action', 'urgent', 'alert', 'notification', 'limited', 'access',
    'suspended', 'locked', 'security', 'verify', 'now', 'free', 'win', 'prize',

    # Common Targets
    'google', 'icloud', 'apple', 'microsoft', 'facebook', 'instagram', 'whatsapp', 'netflix',
    'twitter', 'linkedin', 'youtube', 'tiktok', 'pinterest', 'reddit', 'github', 'gitlab', 'bitbucket',
    'webmaster', 'admin', 'root', 'support', 'service', 'help', 'contact', 'about', 'terms', 'privacy',

    # Other Suspicious Keywords
    'click', 'click here', 'click here to', 'click here to view', 'click here to download', 'click here to sign',
    'click here to get', 'click here to start', 'click here to continue', 'click here to access', 'click here to view',
    'click here to join', 'click here to register', 'click here to login', 'click here to reset', 'click here to verify',
    'click here to claim', 'click here to redeem', 'click here to activate', 'click here to update', 'click here to upgrade',

    # Suspicious Domains
    'bit.ly', 'tinyurl', 'goo.gl', 'bitly.com', 'tinyurl.com', 'bit.do', 'tiny.cc', 'bit.ly', 'tinyurl', 'goo.gl', 'bitly.com', 'tinyurl.com', 'bit.do', 'tiny.cc',

    # Suspicious Subdomains
    'app', 'mail', 'secure', 'login', 'signin', 'signup', 'register', 'password', 'credential', 'verification', 'authenticate',
    'account', 'profile', 'settings', 'recover', 'reset', 'confirm', 'validation',

    # Brazilian Suspicious words
    'aposta', 'dinheiro', 'ganhar', 'milionario', 'sorte', 'lucro', 'riqueza', 'sucesso', 'vida', 'amor', 'saude', 'familia', 'trabalho', 'carreira', 'dinheiro', 'ganhar', 'milionario', 'sorte', 'lucro', 'riqueza', 'sucesso', 'vida', 'amor', 'saude', 'familia', 'trabalho', 'carreira',
    'loteria', 'lotofacil', 'lotomania', 'megasena', 'quina', 'timemania', 'dupla-sena', 'loteca', 'loterica', 'loterij', 'loteria', 'lotofacil', 'lotomania', 'megasena', 'quina', 'timemania', 'dupla-sena', 'loteca', 'loterica', 'loterij',
]

features_required = [
    'url_length', 'domain_length',
    'dot_count', 'hyphen_count', 'underscore_count', 'slash_count',
    'question_count', 'equal_count', 'at_count',
    'digits_count', 'letters_count',
    'has_ip', 'has_https',
    'digit_letter_ratio', 'suspicious_words', 'is_shortened'
]

df["is_phishing"] = df["is_phishing"].replace({"bad": 1, "good": 0})
df['has_ip'] = df['url'].str.contains(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
df['url_length'] = df['url'].str.len()
df['has_at'] = df['url'].str.contains(r'@')
df['dot_count'] = df['url'].apply(lambda x: str(x).count('.'))
df['num_digits'] = df['url'].apply(lambda x: sum(c.isdigit() for c in x))
df['domain_length'] = df['url'].str.split('/').str[2].str.len()
df['hyphen_count'] = df['url'].apply(lambda x: str(x).count('-'))
df['underscore_count'] = df['url'].apply(lambda x: str(x).count('_'))
df['slash_count'] = df['url'].apply(lambda x: str(x).count('/'))
df['question_count'] = df['url'].apply(lambda x: str(x).count('?'))
df['equal_count'] = df['url'].apply(lambda x: str(x).count('='))
df['at_count'] = df['url'].apply(lambda x: str(x).count('@'))
df['digits_count'] = df['url'].apply(lambda x: sum(c.isdigit() for c in x))
df['letters_count'] = df['url'].apply(lambda x: sum(c.isalpha() for c in x))
df['digit_letter_ratio'] = df['digits_count'] / df['letters_count']
df['suspicious_words'] = df['url'].apply(lambda x: sum(word in x for word in suspicious_words))
df['is_shortened'] = df['url'].apply(lambda x: 1 if 'bit.ly' in x or 'tinyurl' in x else 0)

df.head()

Unnamed: 0,url,is_phishing,has_ip,url_length,has_at,dot_count,num_digits,domain_length,num_dots,num_hyphens,...,underscore_count,slash_count,question_count,equal_count,at_count,digits_count,letters_count,digit_letter_ratio,suspicious_words,is_shortened
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1,False,225,False,6,58,15.0,6,4,...,4,10,1,4,0,58,135,0.42963,7,0
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,1,False,81,False,5,1,9.0,5,2,...,1,4,0,2,0,1,65,0.015385,2,0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1,False,177,False,7,47,4.0,7,1,...,0,11,0,0,0,47,111,0.423423,5,0
3,mail.printakid.com/www.online.americanexpress....,1,False,60,False,6,0,10.0,6,0,...,0,2,0,0,0,0,52,0.0,1,0
4,thewhiskeydregs.com/wp-content/themes/widescre...,1,False,116,False,1,21,6.0,1,1,...,0,10,1,0,0,21,82,0.256098,0,0


In [5]:
# Clean the dataframe
df = df.drop_duplicates(subset=['url'])

# Remove rows with null values
df = df.dropna(subset=['url'])

In [6]:
# write the dataframe to a csv file
df.to_csv("data/phishing_site_urls_cleaned.csv", index=False)