In [4]:
import pandas as pd
import numpy as np
import ipaddress
import re
import socket
import ssl
from urllib.parse import urlparse, urljoin
import whois
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import time
import os

In [5]:
def haveAtSign(url):
    """Flag URLs that contain an '@' character.

    Returns 1 when '@' occurs anywhere in ``url``, otherwise 0.
    """
    return int("@" in url)

def getLength(url):
    """Flag long URLs: 1 when ``url`` has 54 or more characters, else 0."""
    if len(url) < 54:
        return 0
    return 1

def getDepth(url):
    """Count the non-empty '/'-separated segments of the URL path.

    Returns 0 if anything goes wrong while parsing ``url``.
    """
    try:
        segments = urlparse(url).path.split('/')
        return len([segment for segment in segments if segment])
    except:
        return 0

def redirection(url):
    """Flag a '//' that appears after the scheme separator.

    Returns 1 when the last '//' in ``url`` starts at an index greater than 6,
    otherwise 0. Also returns 0 if ``url`` does not support ``rfind``.
    """
    try:
        last_double_slash = url.rfind('//')
    except:
        return 0
    return int(last_double_slash > 6)

def httpDomain(url):
    """Flag URLs whose network-location part contains the text 'https'.

    Returns 1 when 'https' is a substring of the parsed ``netloc``, otherwise
    0. Any error while parsing or checking yields 0.
    """
    try:
        if 'https' in urlparse(url).netloc:
            return 1
        return 0
    except:
        return 0

# Hostnames of known URL-shortening services. The pattern is anchored so that
# it only matches a complete host (or a parent domain of the host); a plain
# substring search would, for example, find `t\.co` inside "microsoft.com".
_SHORTENER_HOST_RE = re.compile(
    r"(?:^|\.)(?:"
    r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|"
    r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|"
    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|"
    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
    r"tr\.im|link\.zip\.net"
    r")$",
    re.IGNORECASE,
)


def tinyURL(url):
    """Flag URLs hosted on a known URL-shortening service.

    Only the hostname is inspected, so a shortener name that merely appears
    in the path or query string, or as a fragment of a longer domain name,
    is not counted.

    Args:
        url: URL string; a scheme-less form such as "bit.ly/abc" is accepted.

    Returns:
        1 if the host is a listed shortener (or a subdomain of one), else 0.
        Input that cannot be parsed yields 0.
    """
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # Without a scheme urlparse puts the host into `path`; a leading
            # '//' makes it treat the first component as the network location.
            parsed = urlparse('//' + url)
        # `hostname` is lower-cased and has userinfo and port stripped.
        host = parsed.hostname or ''
        return 1 if _SHORTENER_HOST_RE.search(host) else 0
    except (ValueError, TypeError, AttributeError):
        return 0

def prefixSuffix(url):
    """Flag URLs whose network-location part contains a hyphen.

    Returns 1 when '-' occurs in the parsed ``netloc``, otherwise 0. Any
    error while parsing or checking yields 0.
    """
    try:
        netloc = urlparse(url).netloc
        return int('-' in netloc)
    except:
        return 0

def havingIP(url):
    """Flag URLs whose host is a literal IPv4 or IPv6 address.

    The check uses the parsed hostname rather than the raw network location,
    so an explicit port ("1.2.3.4:8080"), userinfo ("user@1.2.3.4") or IPv6
    brackets ("[::1]") do not hide the address.

    Args:
        url: URL string to inspect.

    Returns:
        1 if the host parses as an IP address, otherwise 0 (including when
        the URL has no host or cannot be parsed).
    """
    try:
        host = urlparse(url).hostname
        if not host:
            return 0
        ipaddress.ip_address(host)
        return 1
    except (ValueError, TypeError, AttributeError):
        return 0

def get_whois_info(domain):
    """Look up WHOIS data for ``domain``.

    Returns whatever ``whois.whois`` returns, or None if the lookup raises.
    """
    try:
        return whois.whois(domain)
    except Exception:
        return None

def dnsRecord(domain_name):
    """Return 1 when ``domain_name`` is falsy (no record), otherwise 0."""
    if domain_name:
        return 0
    return 1

def domainAge(domain_name):
    """Flag domains registered less than about six months ago.

    Args:
        domain_name: WHOIS result exposing a ``creation_date`` attribute (a
            datetime, or a list whose first element is used), or a falsy
            value when no WHOIS data is available.

    Returns:
        1 if the creation date is missing, unusable, or less than six
        30-day months in the past; otherwise 0.
    """
    try:
        if not domain_name or not domain_name.creation_date:
            return 1
        creation_date = domain_name.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        # Build "now" with the same tzinfo as the WHOIS date. Subtracting an
        # aware datetime from a naive one raises TypeError, which previously
        # made every timezone-aware record look like a brand-new domain.
        now = datetime.now(creation_date.tzinfo)
        age_in_days = (now - creation_date).days
        return 1 if (age_in_days / 30) < 6 else 0
    except Exception:
        return 1

def domainEnd(domain_name):
    """Encode how far away the domain's expiration date is.

    Args:
        domain_name: WHOIS result exposing an ``expiration_date`` attribute
            (a datetime, or a list whose first element is used), or a falsy
            value when no WHOIS data is available.

    Returns:
        0 if the domain expires in less than six 30-day months (or has
        already expired); 1 if it expires later than that, or if the
        expiration date is missing or unusable.
    """
    try:
        if not domain_name or not domain_name.expiration_date:
            return 1
        expiration_date = domain_name.expiration_date
        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]
        # Build "now" with the same tzinfo as the WHOIS date. Mixing aware and
        # naive datetimes raises TypeError, which previously forced the
        # result to 1 for every timezone-aware record.
        now = datetime.now(expiration_date.tzinfo)
        end_in_days = (expiration_date - now).days
        return 0 if (end_in_days / 30) < 6 else 1
    except Exception:
        return 1

def get_url_response(url):
    """Fetch ``url`` with an HTTP GET.

    Redirects are followed and the request times out after 10 seconds.
    Returns the ``requests`` response, or None if the request raises.
    """
    try:
        return requests.get(url, timeout=10, allow_redirects=True)
    except Exception:
        return None

def iframe(response_text):
    """Return 0 when the page text contains '<iframe' or '<frameBorder>', else 1.

    Empty or missing text yields 1.
    """
    if not response_text:
        return 1
    if re.search(r"<iframe|<frameBorder>", response_text):
        return 0
    return 1

def mouseOver(response_text):
    """Return 1 when the page text contains 'onmouseover', else 0.

    Empty or missing text yields 0.
    """
    if response_text and re.search(r"onmouseover", response_text):
        return 1
    return 0

def rightClick(response_text):
    """Return 0 when the page text matches an ``event.button == 2`` check, else 1.

    Empty or missing text yields 1.
    """
    has_button_check = bool(response_text) and bool(
        re.search(r"event.button ?== ?2", response_text)
    )
    return 0 if has_button_check else 1

def forwarding(response):
    """Return 1 when the response's ``history`` holds more than two entries, else 0.

    A falsy ``response`` yields 0.
    """
    if not response:
        return 0
    return int(len(response.history) > 2)

def has_login_form(response_text):
    """Return 1 when any <form> has an <input> of type 'password' or 'email'.

    Returns 0 when no such form exists or when parsing fails.
    """
    credential_types = ['password', 'email']
    try:
        document = BeautifulSoup(response_text, 'html.parser')
        for form in document.find_all('form'):
            for field in form.find_all('input'):
                if field.get('type') in credential_types:
                    return 1
    except:
        return 0
    return 0

def get_form_action_url(response_text, base_url):
    """Return 1 when some <form> submits to a different host than the page.

    Each form's ``action`` is resolved against ``base_url``; the result is 1
    as soon as a resolved action's network location differs from that of
    ``base_url``. Forms without an action are ignored. Returns 0 when no such
    form exists or when anything raises.
    """
    try:
        document = BeautifulSoup(response_text, 'html.parser')
        page_host = urlparse(base_url).netloc
        for form in document.find_all('form'):
            target = form.get('action')
            if not target:
                continue
            if urlparse(urljoin(base_url, target)).netloc != page_host:
                return 1
        return 0
    except:
        return 0

def get_suspicious_keywords(response_text):
    """Return 1 when the page text contains any listed keyword, else 0.

    Matching is case-insensitive. Input that cannot be searched yields 0.
    """
    keyword_pattern = r'login|account|verify|update|urgent|password|security|billing'
    try:
        found = re.search(keyword_pattern, response_text, re.IGNORECASE)
    except:
        return 0
    return 1 if found else 0

def webTraffic(url):
    """Placeholder traffic feature: 1 for a truthy ``url``, else 0."""
    # The Alexa API is no longer functional, so a placeholder value is used.
    return int(bool(url))

def extract_all_features(url, label):
    """Build the feature row for one URL.

    URL-string features are computed directly. WHOIS-based and page-based
    features start at fallback values and are overwritten only when the
    corresponding lookup succeeds.

    Args:
        url: URL string to analyse.
        label: Class label stored unchanged under the "Label" key.

    Returns:
        dict mapping feature names to values, with keys in a fixed order.
    """
    host = urlparse(url).netloc if url else None

    features = {
        "Domain": host,
        # Features derived from the URL string alone.
        "Have_IP": havingIP(url),
        "Have_At": haveAtSign(url),
        "URL_Length": getLength(url),
        "URL_Depth": getDepth(url),
        "Redirection": redirection(url),
        "https_Domain": int(url.startswith('https')),
        "TinyURL": tinyURL(url),
        "Prefix/Suffix": prefixSuffix(url),
        # Fallbacks used when the WHOIS lookup or the page fetch is skipped.
        "DNS_Record": 1,
        "Web_Traffic": 0,
        "Domain_Age": 1,
        "Domain_End": 1,
        "iFrame": 1,
        "Mouse_Over": 0,
        "Right_Click": 1,
        "Web_Forwards": 0,
        "Has_LoginForm": 0,
        "Form_Action_Suspect": 0,
        "Suspicious_Keywords": 0,
        "Label": label
    }

    if host:
        registration = get_whois_info(host)
        features.update(
            DNS_Record=dnsRecord(registration),
            Domain_Age=domainAge(registration),
            Domain_End=domainEnd(registration),
        )

    page = get_url_response(url)
    # NOTE(review): a requests Response is presumably falsy for 4xx/5xx
    # statuses as well as when the fetch returned None, so error pages keep
    # the fallback values above - confirm this is intended.
    if page:
        html = page.text
        features.update(
            Web_Traffic=webTraffic(url),
            iFrame=iframe(html),
            Mouse_Over=mouseOver(html),
            Right_Click=rightClick(html),
            Web_Forwards=forwarding(page),
            Has_LoginForm=has_login_form(html),
            Form_Action_Suspect=get_form_action_url(html, url),
            Suspicious_Keywords=get_suspicious_keywords(html),
        )

    return features

In [6]:
print("Starting feature extraction...")

# Input locations and sampling parameters for this run.
PHISH_CSV = "Datasets/2.online-valid.csv"
BENIGN_CSV = "Datasets/1.Benign_list_big_final.csv"
N_SAMPLES = 5000
SAMPLE_SEED = 12

try:
    phishurl_all = pd.read_csv(PHISH_CSV)
except FileNotFoundError:
    print("Error: 'online-valid.csv' not found. Downloading...")
    # Save the download at the exact path that is read back below; writing it
    # to the working directory would make the retry fail the same way.
    os.makedirs(os.path.dirname(PHISH_CSV), exist_ok=True)
    status = os.system(f"wget -O {PHISH_CSV} http://data.phishtank.com/data/online-valid.csv")
    if status != 0:
        # `wget -O` creates the output file even when the transfer fails;
        # remove it so the next run retries instead of reading a stub.
        if os.path.exists(PHISH_CSV):
            os.remove(PHISH_CSV)
        raise RuntimeError(f"Download of {PHISH_CSV} failed (wget exit status {status}).")
    phishurl_all = pd.read_csv(PHISH_CSV)

phishurl = phishurl_all.sample(n=N_SAMPLES, random_state=SAMPLE_SEED).copy().reset_index(drop=True)

try:
    legiurl_all = pd.read_csv(BENIGN_CSV, header=None)
    legiurl_all.columns = ['URLs']
except FileNotFoundError:
    print("Error: 'Benign_list_big_final.csv' not found.")
    # Re-raise instead of calling exit(): this stops the run at this cell
    # without shutting the notebook kernel down.
    raise

legiurl = legiurl_all.sample(n=N_SAMPLES, random_state=SAMPLE_SEED).copy().reset_index(drop=True)

# Each call performs a WHOIS lookup and an HTTP request, so these two loops
# dominate the notebook's running time.
legi_features = [extract_all_features(url, 0) for url in legiurl['URLs']]
legitimate = pd.DataFrame(legi_features)

phish_features = [extract_all_features(url, 1) for url in phishurl['url']]
phishing = pd.DataFrame(phish_features)


Starting feature extraction...


In [7]:
# Stack the legitimate rows above the phishing rows under one fresh 0..n-1 index.
urldata = pd.concat([legitimate, phishing], ignore_index=True)
print("\nFinal Dataset Head:\n", urldata.head())
print("\nFinal Dataset Shape:", urldata.shape)

# Write the combined feature table to disk without the index column.
urldata.to_csv('Datasets/6.full_urldata_features.csv', index=False)
print("\nDataset '6.full_urldata_features.csv' salvo com sucesso!")

print("\n--- Fim do Notebook ---")


Final Dataset Head:
              Domain  Have_IP  Have_At  URL_Length  URL_Depth  Redirection  \
0  graphicriver.net        0        0           1          1            0   
1    foursquare.com        0        0           1          3            0   
2       shop-pro.jp        0        0           1          6            0   
3     motthegioi.vn        0        0           1          2            0   
4        tobogo.net        0        0           1          2            0   

   https_Domain  TinyURL  Prefix/Suffix  DNS_Record  ...  Domain_Age  \
0             0        0              0           1  ...           1   
1             1        0              0           1  ...           1   
2             0        0              1           1  ...           1   
3             0        0              0           1  ...           1   
4             0        0              0           1  ...           1   

   Domain_End  iFrame  Mouse_Over  Right_Click  Web_Forwards  Has_LoginForm  \
0  