In [2]:
import pandas as pd

import xgboost as xgb 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)


In [3]:
# Data processing, metrics and modeling
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, confusion_matrix,  roc_curve, precision_recall_curve, accuracy_score, roc_auc_score
from sklearn.ensemble import AdaBoostClassifier

# Stats
import scipy.stats as ss
from scipy import interp
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

from sklearn.metrics import accuracy_score,classification_report, f1_score, precision_score, recall_score
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_curve
from sklearn.metrics import ConfusionMatrixDisplay

In [8]:
df = pd.read_csv("malicious_phish.csv")
print (len(df))
df.type.value_counts()

651191


type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

In [10]:
Label = LabelEncoder()
df['type'] = Label.fit_transform(df['type'])
print (df.type.value_counts())
df.head()

type
0    428103
1     96457
3     94111
2     32520
Name: count, dtype: int64


Unnamed: 0,url,type
0,br-icloud.com.br,3
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,1
4,http://adventure-nicaragua.net/index.php?optio...,1


In [11]:
df['url_length'] = df['url'].apply(len)
df['num_dots'] = df['url'].apply(lambda x: x.count('.'))
df['num_slash'] = df['url'].apply(lambda x: x.count('/'))
df['num_redir'] = df['url'].apply(lambda x: x.count('//'))
df['num_dash'] = df['url'].apply(lambda x: x.count('-'))
df['contains_anchor'] = df['url'].str.contains('#')
df['has_https'] = df['url'].str.contains("https")
df.head()

Unnamed: 0,url,type,url_length,num_dots,num_slash,num_redir,num_dash,contains_anchor,has_https
0,br-icloud.com.br,3,16,2,0,0,1,False,False
1,mp3raid.com/music/krizz_kaliko.html,0,35,2,2,0,0,False,False
2,bopsecrets.org/rexroth/cr/1.htm,0,31,2,3,0,0,False,False
3,http://www.garage-pirenne.be/index.php?option=...,1,88,3,3,1,1,False,False
4,http://adventure-nicaragua.net/index.php?optio...,1,235,2,3,1,1,False,False


In [12]:
def contains_unicode(url):
    for char in url:
        if ord(char) > 127:
            return True
    return False

df['contains_unicode'] = df['url'].apply(contains_unicode)
df.head()

Unnamed: 0,url,type,url_length,num_dots,num_slash,num_redir,num_dash,contains_anchor,has_https,contains_unicode
0,br-icloud.com.br,3,16,2,0,0,1,False,False,False
1,mp3raid.com/music/krizz_kaliko.html,0,35,2,2,0,0,False,False,False
2,bopsecrets.org/rexroth/cr/1.htm,0,31,2,3,0,0,False,False,False
3,http://www.garage-pirenne.be/index.php?option=...,1,88,3,3,1,1,False,False,False
4,http://adventure-nicaragua.net/index.php?optio...,1,235,2,3,1,1,False,False,False


In [13]:
input = df.drop(['url', 'type'], axis='columns')
target = df.type
print(target.head())
input.head()

0    3
1    0
2    0
3    1
4    1
Name: type, dtype: int32


Unnamed: 0,url_length,num_dots,num_slash,num_redir,num_dash,contains_anchor,has_https,contains_unicode
0,16,2,0,0,1,False,False,False
1,35,2,2,0,0,False,False,False
2,31,2,3,0,0,False,False,False
3,88,3,3,1,1,False,False,False
4,235,2,3,1,1,False,False,False


In [14]:
len (input)

651191

In [15]:
X_train, X_test, y_train, y_test = train_test_split(input, target, test_size=0.2)
print (len (X_test))
print (len (X_train))

130239
520952


In [18]:
#XGBOOST
# Create the XGBoost classifier model
model = xgb.XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=6, objective='multi:softmax')

# Fit the model on the training data
model.fit(X_train, y_train)


In [19]:
# Predict the target labels for the test data
y_pred = model.predict(X_test)


In [21]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ",accuracy)

Accuracy:  0.8972120486183094


In [22]:
model.score(X_test, y_test)

0.8972120486183094

In [24]:
score = classification_report(y_test, y_pred)
print("Score:", score)

Score:               precision    recall  f1-score   support

           0       0.93      0.96      0.95     85601
           1       0.85      0.93      0.89     19313
           2       0.86      0.76      0.81      6555
           3       0.78      0.61      0.69     18770

    accuracy                           0.90    130239
   macro avg       0.86      0.82      0.83    130239
weighted avg       0.89      0.90      0.89    130239



In [26]:
print("f1_score", f1_score(y_test, y_pred,average="weighted"))
print("Recall", recall_score(y_test,y_pred,average="weighted"))
print("Precision", precision_score(y_test, y_pred,average="weighted"))

f1_score 0.8932147851459079
Recall 0.8972120486183094
Precision 0.8932758602851693


In [27]:
cm = confusion_matrix(y_test, y_pred)
print("Matrix:" , cm )

Matrix: [[82333   599    66  2603]
 [  644 18031   356   282]
 [  215   950  4993   397]
 [ 5291  1607   377 11495]]


In [28]:
print("Mean_Squared_Error",mean_squared_error(y_test, y_pred))
print("G-Mean", geometric_mean_score(y_test, y_pred))
print("kappa", cohen_kappa_score( y_test, y_pred))
print("MCC", matthews_corrcoef(y_test, y_pred))

Mean_Squared_Error 0.637666136871444
G-Mean 0.8044974780308705
kappa 0.798751097596563
MCC 0.8003120548053472


In [29]:
#WITH SAMPLING
from imblearn.over_sampling import SMOTE

# Instantiate SMOTE with desired settings
smote = SMOTE(sampling_strategy='auto',k_neighbors=3, random_state=42)

# Apply SMOTE to training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [31]:
from xgboost import XGBClassifier

# Instantiate the XGBoost model
model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6)

# Fit the model on the resampled training data
model.fit(X_train_resampled, y_train_resampled)


In [33]:
# Predict on test data
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.8625910825482382


In [35]:
model.score(X_test, y_test)

0.8625910825482382

In [37]:
score = classification_report(y_test, y_pred)
print("Score:", score)

Score:               precision    recall  f1-score   support

           0       0.97      0.87      0.92     85601
           1       0.84      0.92      0.88     19313
           2       0.71      0.85      0.77      6555
           3       0.59      0.78      0.67     18770

    accuracy                           0.86    130239
   macro avg       0.78      0.85      0.81    130239
weighted avg       0.88      0.86      0.87    130239



In [39]:
print("f1_score", f1_score(y_test, y_pred,average="weighted"))
print("Recall", recall_score(y_test, y_pred,average="weighted"))
print("Precision", precision_score(y_test, y_pred,average="weighted"))

f1_score 0.8690030363815892
Recall 0.8625910825482382
Precision 0.883884043130552


In [40]:
cm = confusion_matrix(y_test, y_pred)
print("Matrix:" , cm )

Matrix: [[74362  1407   552  9280]
 [  188 17784   952   389]
 [   66   622  5552   315]
 [ 2006  1358   761 14645]]


In [44]:
print("Mean_Squared_Error",mean_squared_error(y_test, y_pred))
print("G-Mean", geometric_mean_score(y_test, y_pred))
print("kappa", cohen_kappa_score(y_test, y_pred))
print("MCC", matthews_corrcoef(y_test, y_pred))

Mean_Squared_Error 0.8851342531806908
G-Mean 0.8526846588558603
kappa 0.754162232779592
MCC 0.7602831793471297
