<a href="https://colab.research.google.com/github/godkarmik/mal-url/blob/main/mal_url.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install python-whois
!pip install tldextract
!pip install requests
!pip install pandas numpy scikit-learn




In [None]:
import os
import re
from datetime import datetime
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import requests
import tldextract
import whois
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


In [None]:
import kagglehub

# Download the latest version of the "malicious-urls-dataset" via kagglehub and
# keep the returned location in `path` (printed below as the dataset path).
path = kagglehub.dataset_download("sid321axn/malicious-urls-dataset")

print("Path to dataset files:", path)

In [None]:
!mv /root/.cache/kagglehub/datasets/sid321axn/malicious-urls-dataset/versions/1 /content/

In [None]:
import pandas as pd

# Load the dataset CSV from /content/1 into `df` and preview the first rows.
df = pd.read_csv("/content/1/malicious_phish.csv")
df.head()


Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [None]:
# Integer code assigned to each URL category found in the `type` column.
label_mapping = {
    'benign': 0,
    'defacement': 1,
    'phishing': 2,
    'malware': 3
}

# Keep only the url/type columns, drop incomplete rows, and attach the numeric
# `label` column derived from `type`.
df = (
    df[['url', 'type']]
    .dropna()
    .assign(label=lambda frame: frame['type'].map(label_mapping))
)
df.head()


Unnamed: 0,url,type,label
0,br-icloud.com.br,phishing,2
1,mp3raid.com/music/krizz_kaliko.html,benign,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1


In [None]:
import re
import tldextract
from urllib.parse import urlparse

def lexical_features(url):
    """Compute purely lexical (string-based) features for a URL.

    Parameters
    ----------
    url : str
        The URL to analyse. It may or may not start with a scheme.

    Returns
    -------
    dict
        Feature name -> integer value, with the keys (in insertion order)
        'url_length', 'dots', 'digits', 'special_chars', 'suspicious_words',
        'has_ip' and 'subdomains'.
    """
    f = {}
    f['url_length'] = len(url)
    f['dots'] = url.count('.')
    f['digits'] = sum(c.isdigit() for c in url)
    f['special_chars'] = len(re.findall(r'[^\w]', url))

    suspicious = ["login","verify","secure","account","bank","update","free","click","win"]
    # Lower-case once instead of once per keyword.
    lowered = url.lower()
    f['suspicious_words'] = sum(w in lowered for w in suspicious)

    # IP in domain.
    # The scheme is optional because many URLs in the dataset carry none, and
    # the lookahead requires the host to END after the fourth octet, so that
    # "1.2.3.4.example.com" is not mistaken for an IP address.
    ip_host = r'^(?:https?://)?\d{1,3}(?:\.\d{1,3}){3}(?=[:/?#]|$)'
    f['has_ip'] = 1 if re.match(ip_host, url, re.IGNORECASE) else 0

    # Subdomains: number of dot-separated labels in the subdomain part.
    ext = tldextract.extract(url)
    f['subdomains'] = len(ext.subdomain.split('.')) if ext.subdomain else 0

    return f


In [None]:
def https_check(url):
    """Return 1 when ``url`` explicitly uses the ``https`` scheme, else 0.

    URLs without a scheme, and inputs that ``urlparse`` cannot handle, yield 0.
    """
    try:
        scheme = urlparse(url).scheme
    except (ValueError, TypeError, AttributeError):
        # Only swallow parsing failures (malformed URL, non-string input);
        # a bare ``except`` would also hide KeyboardInterrupt/SystemExit.
        return 0
    return 1 if scheme == "https" else 0


In [None]:
def extract_features(url):
    """Build the full feature dict for ``url``.

    The lexical features come first, followed by the ``'https'`` flag.
    """
    return {**lexical_features(url), 'https': https_check(url)}


In [None]:
# Read the Google Safe Browsing key from the environment instead of committing
# it to the notebook. The key that used to be hard-coded here has been published
# with the notebook and should be revoked.
API_KEY = os.environ.get("GOOGLE_SAFE_BROWSING_API_KEY", "")

def google_safebrowsing_check(url):
    """Ask the Google Safe Browsing v4 Lookup API whether ``url`` is flagged.

    Parameters
    ----------
    url : str
        The URL to look up.

    Returns
    -------
    int
        1 when the API response contains a ``"matches"`` entry, otherwise 0.
        This is a best-effort check: a missing API key, a network failure, a
        timeout or an unparsable response all yield 0.
    """
    if not API_KEY:
        # Without a key the request can only fail; skip the network round trip.
        return 0

    api_url = "https://safebrowsing.googleapis.com/v4/threatMatches:find"

    payload = {
        "client": {"clientId": "url-checker", "clientVersion": "1.0"},
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING",
                            "UNWANTED_SOFTWARE", "POTENTIALLY_HARMFUL_APPLICATION"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url}]
        }
    }

    try:
        # `params` URL-encodes the key; the timeout keeps a stalled connection
        # from blocking the caller indefinitely.
        response = requests.post(
            api_url, params={"key": API_KEY}, json=payload, timeout=10
        )
        result = response.json()
    except (requests.RequestException, ValueError):
        # Network problems or a non-JSON body: treat as "not flagged".
        return 0

    # If Google flags it -> malicious
    return 1 if isinstance(result, dict) and "matches" in result else 0


In [None]:
# Work on a reproducible random sample of 200,000 rows.
df_small = df.sample(n=200000, random_state=42)

# One feature dict per sampled URL, in sample order.
features = [extract_features(u) for u in df_small['url']]

# Feature matrix and the matching target labels.
X = pd.DataFrame(features)
y = df_small['label']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the sample for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 250-tree random forest on the training portion.
model = RandomForestClassifier(n_estimators=250, random_state=42).fit(X_train, y_train)

# Score the held-out portion and report overall and per-class metrics.
pred = model.predict(X_test)
print("ML Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))


ML Accuracy: 0.870825
              precision    recall  f1-score   support

           0       0.90      0.93      0.92     26558
           1       0.81      0.83      0.82      5831
           2       0.74      0.63      0.68      5659
           3       0.93      0.87      0.90      1952

    accuracy                           0.87     40000
   macro avg       0.85      0.81      0.83     40000
weighted avg       0.87      0.87      0.87     40000



In [None]:
def predict_with_google_boost(url):
    """Classify ``url`` with the trained model, then apply the Google override.

    Parameters
    ----------
    url : str
        The URL to classify.

    Returns
    -------
    tuple
        ``(final_pred, ml_pred, google_flag)`` where ``ml_pred`` is the label
        the model considers most probable, ``google_flag`` is the 0/1 result of
        ``google_safebrowsing_check`` and ``final_pred`` is 3 when Google
        flagged the URL, otherwise ``ml_pred``.
    """
    # --- ML Feature Extraction ---
    features = extract_features(url)
    X_in = pd.DataFrame([features])

    # ML prediction probabilities (one column per entry of model.classes_)
    ml_probs = model.predict_proba(X_in)[0]

    # ML raw predicted class. argmax() is a column index, so translate it back
    # to the actual label through classes_; the two only coincide when every
    # label 0..3 was present in the training data.
    ml_pred = model.classes_[ml_probs.argmax()]

    # --- Google Safe Browsing Check ---
    google_flag = google_safebrowsing_check(url)

    # -------- Boosting Logic --------
    # If Google flags it -> override ML with MALICIOUS (label 3)
    final_pred = 3 if google_flag == 1 else ml_pred

    return final_pred, ml_pred, google_flag


In [None]:
# Display name for each numeric label, indexed by label value.
label_names = ["Benign", "Defacement", "Phishing", "Malware"]

# Keep prompting until the user types "exit" (case-insensitive).
while (test_url := input("\nEnter URL to check (or 'exit'): ")).lower() != "exit":
    final_pred, ml_only, google_flag = predict_with_google_boost(test_url)

    # Report the model's verdict, Google's verdict and the combined result.
    print("\n==========================")
    print("ML Prediction:", label_names[ml_only])
    print("Google Flag:", "Malicious" if google_flag == 1 else "Safe")
    print("FINAL RESULT:", label_names[final_pred])
    print("==========================")


Enter URL to check (or 'exit'): google.com

ML Prediction: Phishing
Google Flag: Safe
FINAL RESULT: Phishing

Enter URL to check (or 'exit'): www.facebook.com

ML Prediction: Phishing
Google Flag: Safe
FINAL RESULT: Phishing

Enter URL to check (or 'exit'): corporationwiki.com/Ohio/Columbus/frank-s-benson-P3333917.aspx

ML Prediction: Benign
Google Flag: Safe
FINAL RESULT: Benign

Enter URL to check (or 'exit'): http://larcadelcarnevale.com/catalogo/palloncini

ML Prediction: Benign
Google Flag: Safe
FINAL RESULT: Benign

Enter URL to check (or 'exit'): http://www.824555.com/app/member/SportOption.php?uid=guest&langx=gb

ML Prediction: Malware
Google Flag: Safe
FINAL RESULT: Malware


In [None]:
!apt-get install git


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.15).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [None]:
# Set the global git identity (e-mail and user name) for this runtime.
!git config --global user.email "bnegi8190@gmail.com"
!git config --global user.name "godkarmik"


In [None]:
# Clone the project repository into ./mal-url.
!git clone https://github.com/godkarmik/mal-url


Cloning into 'mal-url'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (6/6), done.
Resolving deltas: 100% (1/1), done.
