# Malicious URL Detection Jupyter Notebook
Detect whether a given URL is malicious or benign using a machine learning model trained on your dataset.

In [1]:
# 1. Imports
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

In [2]:
# 2. Load Dataset
# Change the path below if the dataset lives elsewhere.
# Rows containing any missing value are discarded right after loading.
df = pd.read_csv('abc.dataset').dropna()
print(df.head())

## Feature Engineering: URL Features

In [3]:
# 3. Feature Engineering Functions
def count_digits(url):
    """Return how many characters of ``url`` are digits (per ``str.isdigit``)."""
    digit_total = 0
    for ch in url:
        if ch.isdigit():
            digit_total += 1
    return digit_total

def count_special_chars(url):
    """Return the number of characters in ``url`` that are not ASCII letters or digits."""
    non_alnum_hits = re.finditer(r'[^a-zA-Z0-9]', url)
    return sum(1 for _ in non_alnum_hits)

# One IPv4 octet: 0-255, with optional leading zeros up to three digits.
_IPV4_OCTET = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
# Dotted quad that is not glued to further digits on either side, so a slice
# of a longer number (e.g. "1.2.3.4567") is not mistaken for an address.
_IPV4_RE = re.compile(
    r'(?<![0-9])(?:' + _IPV4_OCTET + r'\.){3}' + _IPV4_OCTET + r'(?![0-9])'
)


def has_ip(url):
    """Return 1 if ``url`` contains a dotted-quad IPv4 address, else 0.

    Every octet must be in the range 0-255 and the address must not be
    embedded in a longer run of digits. Strings such as ``999.999.999.999``
    are therefore not reported as IP addresses.

    Args:
        url: URL string to inspect.

    Returns:
        ``1`` when an IPv4 literal is found anywhere in ``url``, ``0`` otherwise.
    """
    return int(_IPV4_RE.search(url) is not None)

def url_length(url):
    """Return the total number of characters in ``url``."""
    n_chars = len(url)
    return n_chars

def count_subdomains(url):
    """Return the number of ``.`` characters in ``url``.

    The count covers the whole string, not only the host part.
    """
    # Splitting on "." yields one more piece than there are dots.
    pieces = url.split(".")
    return len(pieces) - 1

def suspicious_keywords(url):
    """Return 1 if ``url`` contains any watched keyword (case-insensitive), else 0."""
    lowered = url.lower()
    watched = ('login', 'verify', 'update', 'secure', 'account', 'webscr', 'banking', 'confirm')
    for word in watched:
        if word in lowered:
            return 1
    return 0

In [4]:
# 4. Add Features to DataFrame
# Each handcrafted extractor is applied to the raw URL column and stored
# under its own column name; the dict order fixes the column insertion order.
_FEATURE_EXTRACTORS = {
    'url_length': url_length,
    'count_digits': count_digits,
    'count_special': count_special_chars,
    'has_ip': has_ip,
    'count_subdomains': count_subdomains,
    'suspicious_kw': suspicious_keywords,
}
for _column, _extractor in _FEATURE_EXTRACTORS.items():
    df[_column] = df['url'].apply(_extractor)

## Combine Handcrafted Features with URL Text Features (TF-IDF)

In [5]:
# 5. Prepare Features and Labels
# Target column holding the class of each URL, e.g. 'malicious', 'benign', etc.
y = df['type']

# Handcrafted numeric columns, in the order they appear in the feature matrix.
X_numeric = df[[
    'url_length',
    'count_digits',
    'count_special',
    'has_ip',
    'count_subdomains',
    'suspicious_kw',
]]

# TF-IDF vectorizer for URL text: tokens are runs of ASCII letters/digits and
# the vocabulary is capped at 100 terms.
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split performed later in this file, so test rows contribute to
# the vocabulary and IDF weights.
tfidf = TfidfVectorizer(
    token_pattern=r'[a-zA-Z0-9]+',
    max_features=100,
)
X_tfidf = tfidf.fit_transform(df['url'])

# Combine numeric and text features: numeric columns first, TF-IDF columns after.
from scipy.sparse import hstack
X_all = hstack((X_numeric, X_tfidf))

In [6]:
# 6. Train-Test Split
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions aligned between the two parts; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [7]:
# 7. Model Training
# Random forest with 100 trees and a fixed seed, fitted on the training split.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
clf.fit(X_train, y_train)

In [8]:
# 8. Evaluation
# Predict on the held-out split, compute the metrics, then print them.
y_pred = clf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(per_class_report)
print("Confusion Matrix:\n", conf_matrix)

## Prediction for User-Input URLs
*Works for URLs not seen during training, because every feature is computed from the URL string itself.*

In [9]:
# 9. Single URL Prediction Function
def predict_url(url):
    """Classify a single URL with the trained model.

    Builds one feature row laid out like the training matrix (the six
    handcrafted numeric features followed by the TF-IDF token features)
    and scores it with the module-level ``clf``.

    Args:
        url: URL string to classify. Leading and trailing whitespace is
            ignored.

    Returns:
        A ``(label, confidence)`` tuple: the predicted class label and the
        probability the model assigns to that class.

    Raises:
        TypeError: If ``url`` is not a string.
    """
    if not isinstance(url, str):
        raise TypeError(f"url must be a str, got {type(url).__name__}")
    # Typed or pasted input often carries stray spaces or newlines, which
    # would otherwise inflate the length and special-character features.
    url = url.strip()

    # Order must match the numeric columns used to build the training matrix.
    feats = [
        url_length(url),
        count_digits(url),
        count_special_chars(url),
        has_ip(url),
        count_subdomains(url),
        suspicious_keywords(url),
    ]
    X_num = np.array(feats).reshape(1, -1)
    X_txt = tfidf.transform([url])
    features = hstack([X_num, X_txt])

    # Score once: the label is the class with the highest probability, so the
    # label and its confidence come from the same pass through the forest.
    proba = clf.predict_proba(features)[0]
    best = proba.argmax()
    return clf.classes_[best], proba[best]

In [10]:
# 10. Interactive Prediction
# Read one URL from the user, classify it, and print the label together with
# the model's confidence rounded to two decimals.
user_url = input("Enter a URL to check if it's malicious or benign: ")
prediction = predict_url(user_url)
label, confidence = prediction
print(f"Prediction: {label} (Confidence: {confidence:.2f})")

## Save Model for Later Use (Optional)

In [11]:
# Persist the fitted classifier and the fitted TF-IDF vectorizer; both are
# used when scoring a URL, so they are saved side by side.
for _artifact, _path in (
    (clf, 'url_detector_model.pkl'),
    (tfidf, 'url_detector_tfidf.pkl'),
):
    joblib.dump(_artifact, _path)