In [1]:
# Importing necessary packages
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import os

In [2]:
# Importing the dataset
data = pd.read_csv(os.path.join('data', 'malicious_phish.csv'))

In [3]:
data.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


# Preprocess Pipeline

In [4]:
# Scikit packages for creating pipelines
from sklearn.base import BaseEstimator, TransformerMixin

## Combining the classes

In [5]:
class PreprocessLabels(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        
        # Transforming target labels to binary
        X['type'] = X['type'].apply(lambda x: 1 if x in ['phishing', 'defacement', 'malware'] else 0)
        
        return X['type']

## Bundle One

In [6]:
import re
from urllib.parse import urlparse, parse_qs
import ipaddress
from collections import Counter

In [7]:
# URL LENGTH

def get_url_len(url):
    return len(url)

# DOMAIN LENGTH

def extract_domain_length(url):
    try:
        # Add 'http://' if no scheme is present
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        
        # Return the length of the domain
        return len(domain) if domain else 0
    
    except Exception as e:
        return 0 

# HTTP

def check_http(url):
    if url.startswith('http://'):
        return 1
    else:
        return 0

# SPECIAL CHARACTER COUNT

def count_special_chars(url):
    non_alpha_num = re.findall(r'\W',url)
    return len(non_alpha_num)

# DIGIT COUNT

def count_digits(url):
    digits = re.findall(r'\d',url)
    return len(digits)

# CHECK IP ADDRESS 

def has_ip_address(url):
    try:
        parsed_url = urlparse(url)
        if parsed_url.hostname:
            ip = ipaddress.ip_address(parsed_url.hostname)
            return isinstance(ip, (ipaddress.IPv4Address, ipaddress.IPv6Address))
    except ValueError:
        pass  
    return 0

# URL PARAMETERS

def count_url_parameters(url):
    # Parse the URL using urlparse
    parsed_url = urlparse(url)
    
    # Extract the query part of the URL
    query = parsed_url.query
    
    # Parse the query parameters using parse_qs
    parameters = parse_qs(query)
    
    # Return the number of parameters
    return len(parameters)

In [8]:
class BundleOne(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Adding new features 
        X['url_length'] = X['url'].apply(get_url_len)
        X['domain_length'] = X['url'].apply(extract_domain_length)
        X['http_check'] = X['url'].apply(check_http)
        X['special_char_count'] = X['url'].apply(count_special_chars)
        X['digit_count'] = X['url'].apply(count_digits)
        X['has_ip_address'] = X['url'].apply(has_ip_address)
        X['url_parameter_count'] = X['url'].apply(count_url_parameters)

        return X

## Bundle Two

In [9]:
# PHP

def check_php_in_url(url):
    # Check if the term 'php' is present in the URL (case-insensitive)
    if 'php' in url.lower():
        return 1
    else:
        return 0

# HTML

def check_html_in_url(url):
    # Check if the term 'php' is present in the URL (case-insensitive)
    if 'html' in url.lower():
        return 1
    else:
        return 0

# TLD

tld_list = [
    '.tk', '.buzz', '.xyz', '.top', '.ga', '.ml', '.info', '.cf', '.gq', '.icu', '.wang', '.live', '.host', '.shop' , '.top', '.icu', '.vip', '.id', '.cc', '.br', '.ci', '.zw', '.sx', '.mw'
]

def check_mal_tld(url):
    parsed_url = urlparse(url)
    netloc = parsed_url.netloc.lower()

    if any(netloc.endswith(tld) for tld in tld_list):
        return 1
    return 0

# SHORTENED URL

def is_shortened_url(url):
    shortened_services = [
        "bit.ly", "tinyurl.com", "goo.gl", "t.co", "ow.ly", "buff.ly", 
        "is.gd", "adf.ly", "bit.do", "cutt.ly", "v.gd", "shorte.st", 
        "bl.ink", "x.co", "s.id", "trib.al"
    ]
    
    parsed_url = urlparse(url)
    netloc = parsed_url.netloc.lower()

    if any(service in netloc for service in shortened_services):
        return 1
    return 0


In [10]:
class BundleTwo(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Adding new features 
        X['has_php'] = X['url'].apply(check_php_in_url)
        X['has_html'] = X['url'].apply(check_html_in_url)
        X['mal_tld'] = X['url'].apply(check_mal_tld)
        X['shortened'] = X['url'].apply(is_shortened_url)

        return X

# Bundle Three

In [11]:
import math

def calculate_entropy(url):
    # Create a frequency distribution of characters
    freqs = {}
    for char in url:
        freqs[char] = freqs.get(char, 0) + 1
    
    # Calculate the entropy
    entropy = 0
    length = len(url)
    for count in freqs.values():
        prob = count / length
        entropy -= prob * math.log(prob, 2)
    
    return entropy


def count_keywords(url):
    keywords = ['login', 'verify', 'account', 'secure', 'update', 'reset', 'payment', 'admin']
    count = sum(url.lower().count(keyword) for keyword in keywords)
    return count

def digit_to_letter_ratio(url):
    digits = len(re.findall(r'\d', url))
    letters = len(re.findall(r'[a-zA-Z]', url))
    if letters == 0:
        return 0
    return digits / letters

In [12]:
class BundleThree(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Adding new features 
        X['entropy'] = X['url'].apply(calculate_entropy)
        X['keyword_count'] = X['url'].apply(count_keywords)
        X['digit_to_letter_ratio'] = X['url'].apply(digit_to_letter_ratio)
        
        return X

## Drop Data

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [14]:
class DropData(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None):
        return self

    def transform(self, X):
        
        X = X.drop(columns=['type', 'url'], errors='ignore')
        return X

## Pipeline all this 📦

In [15]:
from sklearn.pipeline import Pipeline

In [16]:
pipe = Pipeline([
    ('bundleone', BundleOne()),
    ('bundletwo', BundleTwo()),
    ('bundlethree', BundleThree()),
    ('dropdata', DropData()),
])

X = pipe.fit_transform(data)

In [17]:
X

Unnamed: 0,url_length,domain_length,http_check,special_char_count,digit_count,has_ip_address,url_parameter_count,has_php,has_html,mal_tld,shortened,entropy,keyword_count,digit_to_letter_ratio
0,16,16,0,3,0,0,0,0,0,0,0,3.375000,0,0.000000
1,35,11,0,4,1,0,0,0,1,0,0,4.079143,0,0.034483
2,31,14,0,5,1,0,0,0,0,0,0,3.708093,0,0.040000
3,88,21,1,16,7,0,4,1,0,0,0,4.660343,0,0.111111
4,235,23,1,13,22,0,3,1,0,0,0,5.491293,0,0.110553
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651186,39,15,0,6,12,0,0,0,1,0,0,4.355539,0,0.571429
651187,44,18,0,8,7,0,0,0,0,0,0,4.243300,0,0.241379
651188,42,16,0,6,3,0,0,0,0,0,0,4.147921,0,0.090909
651189,45,16,0,6,0,0,0,0,0,0,0,4.102313,0,0.000000


## Train Test Split

In [18]:
label_pipe = Pipeline([
    ('preprocesslabels', PreprocessLabels()),
])

y = label_pipe.fit_transform(data)

In [19]:
y

0         1
1         0
2         0
3         1
4         1
         ..
651186    1
651187    1
651188    1
651189    1
651190    1
Name: type, Length: 651191, dtype: int64

In [20]:
X = X.astype(np.float32)
y = y.astype(np.float32)

In [21]:
y

0         1.0
1         0.0
2         0.0
3         1.0
4         1.0
         ... 
651186    1.0
651187    1.0
651188    1.0
651189    1.0
651190    1.0
Name: type, Length: 651191, dtype: float32

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Validation dataset

In [24]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Random Forest Classifier

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [26]:
X_train

Unnamed: 0,url_length,domain_length,http_check,special_char_count,digit_count,has_ip_address,url_parameter_count,has_php,has_html,mal_tld,shortened,entropy,keyword_count,digit_to_letter_ratio
311683,238.0,12.0,1.0,14.0,36.0,0.0,3.0,1.0,0.0,0.0,0.0,5.573352,0.0,0.192513
609373,25.0,13.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.623465,0.0,0.000000
110835,14.0,14.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.521641,0.0,0.000000
486865,76.0,17.0,0.0,8.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,4.439692,0.0,0.046875
228527,103.0,8.0,1.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.197085,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
648599,48.0,14.0,0.0,9.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,4.462581,0.0,0.266667
251247,92.0,10.0,1.0,17.0,14.0,0.0,0.0,0.0,1.0,0.0,0.0,4.556740,0.0,0.229508
312786,26.0,26.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.613337,0.0,0.000000
605604,25.0,13.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.103465,0.0,0.000000


In [27]:
y_train

311683    1.0
609373    1.0
110835    1.0
486865    0.0
228527    0.0
         ... 
648599    1.0
251247    0.0
312786    1.0
605604    1.0
129504    0.0
Name: type, Length: 349037, dtype: float32

In [28]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [29]:
rf_classifier.fit(X_train, y_train)

In [30]:
y_pred = rf_classifier.predict(X_test)
y_pred

array([1., 0., 1., ..., 0., 0., 0.], dtype=float32)

In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95    141516
         1.0       0.94      0.87      0.90     73378

    accuracy                           0.93    214894
   macro avg       0.94      0.92      0.93    214894
weighted avg       0.93      0.93      0.93    214894



In [32]:
def extract_features(url):
    features_dict = {
        'url_length': get_url_len(url),
        'domain_length': extract_domain_length(url),
        'http_check': check_http(url),
        'special_char_count': count_special_chars(url),
        'digit_count': count_digits(url),
        'has_ip_address': has_ip_address(url),
        'url_parameter_count': count_url_parameters(url),
        'has_php': check_php_in_url(url),
        'has_html': check_html_in_url(url),
        'mal_tld': check_mal_tld(url),
        'shortened': is_shortened_url(url),
        'entropy': calculate_entropy(url),
        'keyword_count': count_keywords(url),
        'digit_to_letter_ratio': digit_to_letter_ratio(url),
    }
    
    # Create DataFrame with the same column names as the training data
    return pd.DataFrame([features_dict])


# Testing on singular links

## URL 1

#### Prediction : `benign` Actual : `benign`

In [33]:
url = '192.com/atoz/people/oakley/patrick/'

In [34]:
features = extract_features(url)
features

Unnamed: 0,url_length,domain_length,http_check,special_char_count,digit_count,has_ip_address,url_parameter_count,has_php,has_html,mal_tld,shortened,entropy,keyword_count,digit_to_letter_ratio
0,35,7,0,6,3,0,0,0,0,0,0,3.932874,0,0.115385


In [35]:
predicted_class = rf_classifier.predict(features)

In [36]:
predicted_class

array([0.], dtype=float32)

## URL 2

#### Prediction : `malicious` Actual : `malicious`

In [59]:
url_2 = 'http://www.garage-pirenne.be/index.php?option=com_content&view=article&id=70&vsig70_0=15'

In [60]:
features_2 = extract_features(url_2)
features_2

Unnamed: 0,url_length,domain_length,http_check,special_char_count,digit_count,has_ip_address,url_parameter_count,has_php,has_html,mal_tld,shortened,entropy,keyword_count,digit_to_letter_ratio
0,88,21,1,16,7,0,4,1,0,0,0,4.660343,0,0.111111


In [61]:
predicted_class_2 = rf_classifier.predict(features_2)

In [62]:
predicted_class_2

array([1.], dtype=float32)

# Save the model

In [41]:
from joblib import dump

In [42]:
dump(rf_classifier, 'random_forest_model.joblib')

['random_forest_model.joblib']