In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the URL dataset from a CSV file; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv('malicious_phish.csv')

In [3]:
# Sanity-check the load: print the (rows, columns) tuple, then display the
# first rows of the frame as the cell's rich output.
print(df.shape)
df.head()

(651191, 2)


Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [4]:
# Class distribution of the target column. Bracket access is used so the
# column lookup reads the same way as the other column selections in this notebook.
df['type'].value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
benign,428103
defacement,96457
phishing,94111
malware,32520


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Features are the raw URL strings.
X = df['url']

# Targets: encode the string class labels as integer ids. The fitted encoder
# is kept in `le` so predictions can be mapped back to label names later.
le = LabelEncoder()
y = le.fit_transform(df['type'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each URL into a TF-IDF vector, capping the vocabulary at 5000 terms.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# Fit on the training split only, then apply the already-fitted vectorizer to
# the test split so both share one vocabulary and the test data never
# influences what was learned.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [12]:
# Train four classifiers on the same TF-IDF training matrix and keep each
# model's predictions on the held-out test matrix for later comparison.

# Logistic Regression
# max_iter is set explicitly to 1000 to give the solver more iterations to converge.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_vec, y_train)
y_pred_logreg = logreg.predict(X_test_vec)

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_vec, y_train)
y_pred_dt = dt.predict(X_test_vec)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vec, y_train)
y_pred_rf = rf.predict(X_test_vec)

# XGBoost
# `use_label_encoder=False` has been removed: the installed XGBoost does not
# use that parameter and only emits a "Parameters ... are not used" warning.
# The labels in y_train are already integer-encoded by LabelEncoder.
xgboost = xgb.XGBClassifier(eval_metric='mlogloss', random_state=42)
xgboost.fit(X_train_vec, y_train)
y_pred_xgb = xgboost.predict(X_test_vec)

# Evaluation for all models
def evaluate_model(y_true, y_pred, model_name):
    """Print accuracy, a per-class report and the confusion matrix for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth encoded class labels.
    y_pred : array-like
        Encoded class labels predicted by the model.
    model_name : str
        Label used as a prefix on every printed heading.

    Notes
    -----
    Reads the notebook-level label encoder ``le`` to show the original class
    names in the classification report. Returns nothing; all results go to
    standard output.
    """
    accuracy = accuracy_score(y_true, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    print(f"{model_name} Classification Report:")
    report = classification_report(y_true, y_pred, target_names=le.classes_)
    print(report)

    print(f"{model_name} Confusion Matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)

# Report every model against the same held-out labels, in training order.
for model_label, model_predictions in (
    ("Logistic Regression", y_pred_logreg),
    ("Decision Tree", y_pred_dt),
    ("Random Forest", y_pred_rf),
    ("XGBoost", y_pred_xgb),
):
    evaluate_model(y_test, model_predictions, model_label)

Parameters: { "use_label_encoder" } are not used.



Logistic Regression Accuracy: 0.9322860279946867
Logistic Regression Classification Report:
              precision    recall  f1-score   support

      benign       0.93      0.98      0.96     85778
  defacement       0.96      0.98      0.97     19104
     malware       0.98      0.92      0.95      6521
    phishing       0.87      0.66      0.75     18836

    accuracy                           0.93    130239
   macro avg       0.94      0.89      0.91    130239
weighted avg       0.93      0.93      0.93    130239

Logistic Regression Confusion Matrix:
[[84243    82    29  1424]
 [   35 18715     5   349]
 [  243   114  6002   162]
 [ 5680   625    71 12460]]
Decision Tree Accuracy: 0.9483948740392663
Decision Tree Classification Report:
              precision    recall  f1-score   support

      benign       0.95      0.98      0.97     85778
  defacement       0.98      0.98      0.98     19104
     malware       0.98      0.94      0.96      6521
    phishing       0.88      

In [19]:
# Re-print the Random Forest metrics on their own.
# NOTE(review): the "best accuracy" statement is hard-coded rather than
# computed; confirm it against the per-model accuracies before relying on it.
print('This was best accuracy among all models')
evaluate_model(y_true=y_test, y_pred=y_pred_rf, model_name="Random Forest")

This was best accuracy among all models
Random Forest Accuracy: 0.953370342217001
Random Forest Classification Report:
              precision    recall  f1-score   support

      benign       0.95      0.98      0.97     85778
  defacement       0.98      0.99      0.98     19104
     malware       0.99      0.95      0.97      6521
    phishing       0.90      0.78      0.83     18836

    accuracy                           0.95    130239
   macro avg       0.96      0.92      0.94    130239
weighted avg       0.95      0.95      0.95    130239

Random Forest Confusion Matrix:
[[84488    14    16  1260]
 [   34 18864     6   200]
 [  137    34  6165   185]
 [ 3825   323    39 14649]]


In [16]:
# Spot-check the chosen classifier on a few hand-written URLs.
sample_links = [
    "http://example.com/phishing-link",
    "https://secure-site.com",
    "http://malicious-malware-download.com",
    "https://safe-site.com",
]
model = rf

# Reuse the already-fitted vectorizer so the samples are projected into the
# same feature space the model was trained on.
sample_links_vectorized = vectorizer.transform(sample_links)
predictions = model.predict(sample_links_vectorized)

# Map the predicted integer class ids back to their string labels.
decoded_predictions = le.inverse_transform(predictions)

# One output line per sample, pairing each URL with its decoded label.
for position, link in enumerate(sample_links):
    prediction = decoded_predictions[position]
    print(f"Link: {link} -> Predicted Label: {prediction}")

Link: http://example.com/phishing-link -> Predicted Label: phishing
Link: https://secure-site.com -> Predicted Label: phishing
Link: http://malicious-malware-download.com -> Predicted Label: benign
Link: https://safe-site.com -> Predicted Label: phishing


In [17]:
import joblib
# Persist every fitted object needed to serve predictions outside this
# notebook:
#   - the classifier itself,
#   - the label encoder, without which the integer predictions cannot be
#     turned back into label strings via inverse_transform,
#   - the TF-IDF vectorizer that must transform incoming URLs before prediction.
# The vectorizer dump stays last so the cell's displayed value is unchanged.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(rf, 'link_detector.pkl')
joblib.dump(le, 'label_encoder.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']