In [1]:
%pip install matplotlib pandas numpy seaborn scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\orsin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
import random

# Define the paths to the CSV files
legit_urls_path = 'resources/legit_urls.csv'
phishing_urls_path = 'resources/phishing_urls.csv'

# Read the CSV files
legit_urls_df = pd.read_csv(legit_urls_path, header=None, names=['url']) #no header given in this csv file. so custom naming
phishing_urls_df = pd.read_csv(phishing_urls_path)

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\orsin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
legit_urls_df.head()

Unnamed: 0,url
0,http://1337x.to/torrent/1048648/American-Snipe...
1,http://1337x.to/torrent/1110018/Blackhat-2015-...
2,http://1337x.to/torrent/1122940/Blackhat-2015-...
3,http://1337x.to/torrent/1124395/Fast-and-Furio...
4,http://1337x.to/torrent/1145504/Avengers-Age-o...


In [3]:
phishing_urls_df.head()

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,6557033,http://u1047531.cp.regruhosting.ru/acces-inges...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T22:01:43+00:00,yes,2020-05-09T22:03:07+00:00,yes,Other
1,6557032,http://hoysalacreations.com/wp-content/plugins...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T22:01:37+00:00,yes,2020-05-09T22:03:07+00:00,yes,Other
2,6557011,http://www.accsystemprblemhelp.site/checkpoint...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T21:54:31+00:00,yes,2020-05-09T21:55:38+00:00,yes,Facebook
3,6557010,http://www.accsystemprblemhelp.site/login_atte...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T21:53:48+00:00,yes,2020-05-09T21:54:34+00:00,yes,Facebook
4,6557009,https://firebasestorage.googleapis.com/v0/b/so...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T21:49:27+00:00,yes,2020-05-09T21:51:24+00:00,yes,Microsoft


In [4]:
# Take random samples of 5000 URLs from each dataframe
random.seed(42)  # For reproducibility - every time random is run, u get the same result
legit_sample = legit_urls_df.sample(n=5000, random_state=42)
phishing_sample = phishing_urls_df.sample(n=5000, random_state=42)

# Create new dataframes with only the 'url' column (all uppercase)
legit_df = pd.DataFrame(data={'URL': legit_sample['url']})   # 'URL' is the name of column and column is the url column
phishing_df = pd.DataFrame(data={'URL': phishing_sample.iloc[:, 1]})  # index 1 (2nd column) and name='URL'

In [5]:
# Reset index for legit_sample dataframe
legit_df.reset_index(drop=True, inplace=True)
# drop=true means i dont't want the old index as a separate column and inplace=true returns none and modifies in place
# Reset index for phishing_sample dataframe
phishing_df.reset_index(drop=True, inplace=True)

In [6]:
legit_df.head()

Unnamed: 0,URL
0,http://codecanyon.net/search?date=this-month&l...
1,http://caixa.gov.br/empresa/credito-financiame...
2,http://olx.ua/uk/list/q-%D0%B4%D0%B6%D0%B8%D0%...
3,http://elitedaily.com/wp-content/themes/strang...
4,http://metro.co.uk/2014/10/10/is-big-brother-f...


In [7]:
phishing_df.head()

Unnamed: 0,URL
0,https://iptf.ir/.well-known/acme-challenge/cha...
1,https://lynshirt.com/wp-admin/PayPal/customer_...
2,https://hotdealsaz.com/Secure/inline.php
3,http://lz5.1ee.myftpupload.com/mvc/b105e5a192f...
4,http://claassistencia.com.br/wp-admin/includes...


 # Feature Extraction (1 for Phishing | 0 for legit )

## 1. Address Bar Based Features

In [8]:
from urllib.parse import urlparse,urlencode
import ipaddress
import re

### 1.1 Get Domain (gets the domain after removing www.) - Not used for training

In [9]:
def getDomain(url):  
  domain = urlparse(url).netloc   #get the network location (domain)
  if re.match(r"^www.",domain):   # ^ ensures www is at start and . ensures some char is after www
       domain = domain.replace("www.","")   #remove www. if found
  return domain

### 1.2 IP address - if IP address instead of domain, then phishing

In [10]:
def havingIP(url):
  try:
    ipaddress.ip_address(url)
    ip = 1
  except:
    ip = 0
  return ip

### 1.3 if @ in URL , then phishing (browser ignores everything before it and goes to fake website after it)

In [11]:
def haveAtSign(url):
  if "@" in url:
    at = 1    
  else:
    at = 0    
  return at

### 1.4 if length>=54 , phishing as real address hidden inside it

In [12]:
def getLength(url):
  if len(url) < 54:
    length = 0            
  else:
    length = 1            
  return length

### 1.5 Gets the depth of url (number of subpages : number of non empty parts after splitting with / )

In [13]:
def getDepth(url):
  s = urlparse(url).path.split('/')
  depth = 0
  for j in range(len(s)):
    if len(s[j]) != 0:
      depth = depth+1
  return depth

### 1.6 If position of // (for redirection) is anything other than 6 (http) or 7 (https) - then phishing

In [14]:
def redirection(url):
  pos = url.rfind('//')
  if pos > 6:
    if pos > 7:
      return 1
    else:
      return 0
  else:
    return 0

### 1.7 If HTTP/HTTPS found in domain, then phishing

In [15]:
def httpDomain(url):
  domain = urlparse(url).netloc
  if 'https' in domain:
    return 1
  else:
    return 0

### 1.8 If URL uses shortener (short url leads to required webpage using HTTP redirect) , then phishing

In [16]:
#listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"    # \. means . (escape character) and | means or

In [17]:
def tinyURL(url):
    match=re.search(shortening_services,url)
    if match:
        return 1
    else:
        return 0

### 1.9 If - found in domain of URL , then phishing

In [18]:
def prefixSuffix(url):
    if '-' in urlparse(url).netloc:
        return 1            
    else:
        return 0            

# MAKING THE DATAFRAME

In [36]:
def featureExtraction(url,label):
  features = []

  #Address bar based features 
  features.append(getDomain(url))
  features.append(havingIP(url))
  features.append(haveAtSign(url))
  features.append(getLength(url))
  features.append(getDepth(url))
  features.append(redirection(url))
  features.append(httpDomain(url))
  features.append(tinyURL(url))
  features.append(prefixSuffix(url))

  features.append(label)
  
  return features

## LEGIT

In [37]:
from tqdm.notebook import tqdm

In [38]:
legi_features = []
label = 0  # for legit, label will be 0

for i in tqdm(range(0, 5000)):
  url = legit_df['URL'][i]
  legi_features.append(featureExtraction(url,label))

  0%|          | 0/5000 [00:00<?, ?it/s]

In [39]:
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'Label']

legitimate = pd.DataFrame(legi_features, columns= feature_names)
legitimate.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,Label
0,codecanyon.net,0,0,1,1,0,0,0,0,0
1,caixa.gov.br,0,0,1,4,0,0,0,0,0
2,olx.ua,0,0,1,4,0,0,0,0,0
3,elitedaily.com,0,0,1,5,0,0,0,0,0
4,metro.co.uk,0,0,1,4,0,0,0,0,0


## PHISHING

In [43]:
phish_features = []
label = 1
for i in tqdm(range(0, 5000)):
  url = phishing_df['URL'][i]
  phish_features.append(featureExtraction(url,label))

  0%|          | 0/5000 [00:00<?, ?it/s]

In [44]:
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'Label']

phishing = pd.DataFrame(phish_features, columns= feature_names)
phishing.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,Label
0,iptf.ir,0,0,1,4,0,0,0,0,1
1,lynshirt.com,0,0,1,6,0,0,1,0,1
2,hotdealsaz.com,0,0,0,2,0,0,0,0,1
3,lz5.1ee.myftpupload.com,0,0,1,2,0,0,0,0,1
4,claassistencia.com.br,0,0,1,3,0,0,0,0,1


## COMBINING

In [45]:
urldata = pd.concat([legitimate, phishing]).reset_index(drop=True)
urldata.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,Label
0,codecanyon.net,0,0,1,1,0,0,0,0,0
1,caixa.gov.br,0,0,1,4,0,0,0,0,0
2,olx.ua,0,0,1,4,0,0,0,0,0
3,elitedaily.com,0,0,1,5,0,0,0,0,0
4,metro.co.uk,0,0,1,4,0,0,0,0,0


In [46]:
urldata.tail()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,Label
9995,team.cmgcharleston.com,0,0,1,8,0,0,0,0,1
9996,clubeamigosdopedrosegundo.com.br,0,0,0,1,0,0,0,0,1
9997,ebaymessage.gvnawpknwarz94ejtnbuvbwv.xyz,0,0,1,2,0,0,0,0,1
9998,dongsuh.net,0,0,1,2,0,0,0,0,1
9999,c1800258.ferozo.com,0,0,0,3,0,0,0,0,1


In [47]:
data = urldata.drop(['Domain'], axis = 1).copy()   #drops the domain along axis=1 (column) and assigns its deep copy to data

In [48]:
data = data.sample(frac=1).reset_index(drop=True)  #data.sample shuffles rows, frac=1 means shuffle all rows and reset the index
data.head()

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,Label
0,0,0,0,1,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0
2,0,0,1,5,0,0,0,0,0
3,0,0,1,3,0,0,0,0,1
4,0,0,1,4,0,0,0,0,0


# MAKING THE MODEL

In [49]:
y = data['Label']
X = data.drop('Label',axis=1)
X.shape, y.shape

((10000, 8), (10000,))

In [50]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.2, random_state = 12)
X_train.shape, X_test.shape

((8000, 8), (2000, 8))

In [51]:
from sklearn.metrics import accuracy_score

In [54]:
#XGBoost Classification model
from xgboost import XGBClassifier

# instantiate the model
xgb = XGBClassifier(learning_rate=0.4,max_depth=7)   
#sets the learning rate/gradient - more is faster but overfitting
#sets the max depth of individual trees as 7. more allows complex patterns but may overfit
#fit the model
xgb.fit(X_train, y_train)

In [55]:
y_test_xgb = xgb.predict(X_test)
y_train_xgb = xgb.predict(X_train)

In [56]:
acc_train_xgb = accuracy_score(y_train,y_train_xgb)
acc_test_xgb = accuracy_score(y_test,y_test_xgb)

print("XGBoost: Accuracy on training Data: {:.3f}".format(acc_train_xgb))
print("XGBoost : Accuracy on test Data: {:.3f}".format(acc_test_xgb))

XGBoost: Accuracy on training Data: 0.819
XGBoost : Accuracy on test Data: 0.804


In [70]:
def testing(url):
  features = []

  #Address bar based features 
  features.append(havingIP(url))
  features.append(haveAtSign(url))
  features.append(getLength(url))
  features.append(getDepth(url))
  features.append(redirection(url))
  features.append(httpDomain(url))
  features.append(tinyURL(url))
  features.append(prefixSuffix(url))
    
  return [features]     #dont add label here and give back a 2d array 

In [99]:
arr = testing('http://grantland.com/features/mlb-the-30-red-sox-yankees-blue-jays-orioles-al-east/')
val = xgb.predict(arr)
if val==0:
    print("SAFE")
else:
    print("PHISHING")

SAFE


# SAVING THE MODEL

In [100]:
import pickle
pickle.dump(xgb, open("XGBoostClassifier.pickle.dat", "wb"))

In [101]:
loaded_model = pickle.load(open("XGBoostClassifier.pickle.dat", "rb"))
loaded_model

In [None]:
# Importazione dei classificatori
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Inizializzazione dei modelli
rf_model = RandomForestClassifier(random_state=12)
svm_model = SVC(random_state=12)
knn_model = KNeighborsClassifier()
dt_model = DecisionTreeClassifier(random_state=12)
lr_model = LogisticRegression(random_state=12)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Lista dei modelli
models = [
    ('Random Forest', rf_model),
    ('SVM', svm_model),
    ('KNN', knn_model),
    ('Decision Tree', dt_model),
    ('Logistic Regression', lr_model)
]

# Valutazione dei modelli
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'\n{name}:')
    print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
    print('\nClassification Report:')
    print(classification_report(y_test, y_pred))

# TODO : TUNING HYPERPARAMETRI

# Metriche di Valutazione per Modelli di Classificazione

In [None]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, confusion_matrix, roc_curve, auc,
                           precision_recall_curve, average_precision_score)
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_model(model, X_test, y_test):
    # Predizioni
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # Metriche base
    print("Basic Metrics:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
    
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()
    
    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, 
             label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()
    
    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    avg_precision = average_precision_score(y_test, y_pred_proba)
    
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='blue', lw=2,
             label=f'PR curve (AP = {avg_precision:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.show()