# Malicious URL Detection with Machine Learning


**Libraries**

In [None]:
!pip install tld

Collecting tld
  Downloading tld-0.13-py2.py3-none-any.whl (263 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.8/263.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tld
Successfully installed tld-0.13


In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from urllib.parse import urlparse
from tld import get_tld
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

**Loading and analysing data**

In [None]:
# Load the labelled URL dataset (columns: url, type) into a DataFrame.
# NOTE(review): the /content/drive/... path presumably requires Google Drive to
# be mounted in Colab first; no mount step is visible in this notebook — confirm.
data = pd.read_csv('/content/drive/MyDrive/malicious-url/malicious_phish.csv')

In [None]:
# Summarise columns, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651191 entries, 0 to 651190
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     651191 non-null  object
 1   type    651191 non-null  object
dtypes: object(2)
memory usage: 9.9+ MB


In [None]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [None]:
# (rows, columns) of the raw frame.
data.shape

(651191, 2)

In [None]:
# Per-column dtypes of the raw frame.
data.dtypes

url     object
type    object
dtype: object

In [None]:
# Missing values per column (isna is the same check as isnull).
data.isna().sum()

url     0
type    0
dtype: int64

In [None]:
# Distinct class labels present in the `type` column.
data['type'].unique()

array(['phishing', 'benign', 'defacement', 'malware'], dtype=object)

In [None]:
# Number of URLs per class label, shown as a one-column table.
# The `axis` keyword is omitted: rows are grouped by default and passing
# `axis` to DataFrame.groupby is deprecated in recent pandas.
types = data.groupby('type')
types.size().to_frame('Types')

Unnamed: 0_level_0,Types
type,Unnamed: 1_level_1
benign,428103
defacement,96457
malware,32520
phishing,94111


**Feature Extraction**

In [None]:
# Total number of characters in each URL (value is stringified first).
data['url_length'] = [len(str(raw_url)) for raw_url in data['url']]

In [None]:
# Length of the network-location part as split out by urlparse.
# NOTE(review): for URLs without a scheme (e.g. 'br-icloud.com.br') urlparse
# leaves netloc empty, so this feature is 0 for them — see the preview table
# further down. Confirm that is intended.
data['hostname_length'] = [len(urlparse(raw_url).netloc) for raw_url in data['url']]

In [None]:
# Length of the path component as split out by urlparse.
# NOTE(review): for scheme-less URLs the whole string is treated as the path,
# so path_length equals url_length for those rows (visible in the preview table).
data['path_length'] = [len(urlparse(raw_url).path) for raw_url in data['url']]

In [None]:
#First Directory Length
def fd_length(url):
    """Return the length of the first path segment of ``url``.

    The path (as produced by ``urlparse``) is split on '/'; the segment right
    after the first slash is measured. When the path contains no '/', there is
    no such segment and 0 is returned.

    Parameters
    ----------
    url : str
        URL to inspect.

    Returns
    -------
    int
        Character count of the first directory, or 0 if there is none.
    """
    segments = urlparse(url).path.split('/')
    # An explicit bounds check replaces the former bare `except:`, which would
    # also have hidden unrelated errors.
    if len(segments) < 2:
        return 0
    return len(segments[1])

# First-directory length for every URL.
data['fd_length'] = data['url'].apply(fd_length)

In [None]:
#Length of Top Level Domain
# Temporary helper column holding get_tld's result for each URL; with
# fail_silently=True the call does not raise on URLs it cannot handle.
data['tld'] = data['url'].map(lambda raw_url: get_tld(raw_url, fail_silently=True))
def tld_length(tld):
    """Return ``len(tld)``, or -1 when ``tld`` has no length (e.g. ``None``).

    Parameters
    ----------
    tld : str or None
        Top-level domain extracted for a URL, or a missing value.

    Returns
    -------
    int
        Number of characters in ``tld``; -1 as a sentinel for missing values.
    """
    try:
        return len(tld)
    except TypeError:
        # Only "object has no len()" is expected here; anything else should surface.
        return -1

# Derive the numeric feature, then discard the helper string column.
data['tld_length'] = data['tld'].apply(tld_length)
# Use the `columns=` keyword: passing the axis positionally (`drop("tld", 1)`)
# triggers a FutureWarning on older pandas and is rejected by pandas 2.x.
data = data.drop(columns="tld")

  data = data.drop("tld",1)


In [None]:
# Substrings whose number of occurrences in the URL becomes a `count_<token>` column.
# Counting is plain substring matching, so 'http' is also counted inside 'https'.
feature = ['@','?','-','=','.','#','%','+','$','!','*',',','//', 'http', 'https', 'www']
for token in feature:
    data[f'count_{token}'] = data['url'].map(lambda raw_url, token=token: raw_url.count(token))

In [None]:
def digit_count(url):
    """Return how many characters of ``url`` satisfy ``str.isnumeric``."""
    return sum(1 for ch in url if ch.isnumeric())

def letter_count(url):
    """Return how many characters of ``url`` satisfy ``str.isalpha``."""
    return sum(1 for ch in url if ch.isalpha())

def no_of_dir(url):
    """Return the number of '/' characters in the path component of ``url``."""
    parsed = urlparse(url)
    return parsed.path.count('/')

In [None]:
# Character-class and directory counts per URL, using the helpers defined above.
data['count-digits'] = data['url'].apply(digit_count)
data['count-letters'] = data['url'].apply(letter_count)
data['count_dir'] = data['url'].apply(no_of_dir)

**Label Encoding**

In [None]:
# Replace the string class labels with integer codes; the fitted encoder is
# kept so codes can be mapped back via label_encoder.classes_.
label_encoder = LabelEncoder().fit(data['type'])
data['type'] = label_encoder.transform(data['type'])

In [None]:
# Preview the frame after feature extraction and label encoding.
data.head()

Unnamed: 0,url,type,url_length,hostname_length,path_length,fd_length,tld_length,count_@,count_?,count_-,...,count_!,count_*,"count_,",count_//,count_http,count_https,count_www,count-digits,count-letters,count_dir
0,br-icloud.com.br,3,16,0,16,0,-1,0,0,1,...,0,0,0,0,0,0,0,0,13,0
1,mp3raid.com/music/krizz_kaliko.html,0,35,0,35,5,-1,0,0,0,...,0,0,0,0,0,0,0,1,29,2
2,bopsecrets.org/rexroth/cr/1.htm,0,31,0,31,7,-1,0,0,0,...,0,0,0,0,0,0,0,1,25,3
3,http://www.garage-pirenne.be/index.php?option=...,1,88,21,10,9,2,0,1,1,...,0,0,0,1,1,0,1,7,63,1
4,http://adventure-nicaragua.net/index.php?optio...,1,235,23,10,9,3,0,1,1,...,0,0,0,1,1,0,0,22,199,1


**Separation of dependent and independent variables**

In [None]:
# Features: every engineered numeric column. The raw URL string and the label
# are excluded by name rather than by position, so the selection does not
# silently change if columns are added or reordered upstream.
X = data.drop(columns=['url', 'type'])
# Target: the integer-encoded class label.
y = data['type']

**Splitting of training and test sets**

In [None]:
# Train on 10% of the rows and evaluate on 25%; the two fractions sum to 0.35,
# so the remaining rows are used by neither set. Fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    train_size=0.1,
    random_state=1337,
)

In [None]:
# Report the dimensions of each split.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {label}: ", part.shape)

Shape of X_train:  (65119, 24)
Shape of X_test:  (19536, 24)
Shape of y_train:  (65119,)
Shape of y_test:  (19536,)


**Standardization**

In [None]:
# Learn mean/variance on the training features only, then apply the same
# scaling to both splits so no test-set statistics leak into training.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### ML Algorithms

Decision Tree

In [None]:
# Fit a decision tree on the scaled training features and report per-class
# precision/recall/F1 on the test split.
# random_state is fixed so the tree (and therefore the report) is reproducible
# across runs.
model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.97    106837
           1       0.95      0.96      0.96     24009
           2       0.90      0.90      0.90      8082
           3       0.82      0.78      0.80     23870

    accuracy                           0.94    162798
   macro avg       0.91      0.90      0.91    162798
weighted avg       0.94      0.94      0.94    162798



Random Forest Classifier

In [None]:
# Fit a 100-tree random forest (seeded) and report per-class metrics on the
# test split.
# y_train is already one-dimensional, so it is passed directly; calling
# Series.ravel() is unnecessary and deprecated in recent pandas.
forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
y_pred = forest.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97    106837
           1       0.96      0.99      0.97     24009
           2       0.98      0.90      0.94      8082
           3       0.89      0.82      0.85     23870

    accuracy                           0.96    162798
   macro avg       0.95      0.92      0.93    162798
weighted avg       0.95      0.96      0.95    162798



Logistic Regression

In [None]:
# Fit logistic regression on the scaled training features and report
# per-class metrics on the test split.
logisticRegression = LogisticRegression()
logisticRegression.fit(X_train, y_train)
# Predict with THIS model before reporting; without this line the report
# would be computed from whatever y_pred an earlier cell left behind.
y_pred = logisticRegression.predict(X_test)
print(classification_report(y_test, y_pred))