In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from lightgbm import LGBMClassifier
import warnings
import time
import joblib
warnings.filterwarnings('ignore')


start_time = time.time() #Start timer

#Loading dataset
df = pd.read_csv('data/urls.csv')
print(f"Dataset shape: {df.shape}")
print(f"Time: {time.time() - start_time:.2f}s\n")

# Visualization
sample_size = min(5000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)
plt.figure(figsize=(10, 6))
sns.countplot(x='type', data=df_sample, palette='Set2')
plt.title('Distribution of URL Types', fontsize=12)
plt.xticks(rotation=15)
plt.tight_layout()
plt.savefig('url_distribution.png', dpi=80)
plt.close()

#Encoding labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['type'])
print("Extracting features (vectorized)...")
feature_time = time.time()

#Easy patterns
suspicious_words = ['login', 'signin', 'verify', 'update', 'banking', 'account', 
                   'secure', 'paypal', 'confirm', 'password', 'admin']
common_tlds = {'com', 'org', 'net', 'edu', 'gov', 'uk', 'us', 'in'}

#Feature extraction
urls = df['url'].astype(str)
df['url_length'] = urls.str.len() #Length

#Count operations
df['num_dots'] = urls.str.count(r'\.')
df['num_slashes'] = urls.str.count('/')
df['num_dashes'] = urls.str.count('-')
df['num_underscores'] = urls.str.count('_')
df['num_digits'] = urls.str.count(r'\d')
df['num_params'] = urls.str.count(r'\?')
df['num_ampersands'] = urls.str.count('&')
df['num_equals'] = urls.str.count('=')
df['num_at'] = urls.str.count('@')
df['digit_ratio'] = df['num_digits'] / df['url_length'].replace(0, 1) #Ratio features

#Boolean features
df['has_https'] = urls.str.contains('https', regex=False, case=False).astype(int)
df['has_ip'] = urls.str.contains(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', regex=True).astype(int)
df['has_hex'] = urls.str.contains(r'%[0-9a-fA-F]{2}', regex=True).astype(int)
df['repeated_chars'] = urls.str.contains(r'(.)\1{3,}', regex=True).astype(int)
df['has_double_slash'] = urls.str[8:].str.contains('//', regex=False).fillna(0).astype(int)

#Suspicious words
suspicious_pattern = '|'.join(suspicious_words)
df['has_suspicious'] = urls.str.contains(suspicious_pattern, case=False, regex=True).astype(int)

#TLD extraction
tld_series = urls.str.extract(r'\.([a-zA-Z]{2,})(?:[/?#]|$)', expand=False).fillna('')
df['tld_length'] = tld_series.str.len()
df['is_common_tld'] = tld_series.str.lower().isin(common_tlds).astype(int)
df['num_subdomains'] = (df['num_dots'] - 1).clip(lower=0) #Subdomain count

#Special characters count
special_chars = set(string.punctuation)
df['num_special'] = urls.apply(lambda x: sum(1 for c in x if c in special_chars))

# Shortener detection
shorteners = ['bit.ly', 'goo.gl', 'tinyurl', 't.co', 'ow.ly']
shortener_pattern = '|'.join(shorteners)
df['is_shortened'] = urls.str.contains(shortener_pattern, case=False, regex=True).astype(int)
print(f"Features extracted in {time.time() - feature_time:.2f}s")
print(f"Total time: {time.time() - start_time:.2f}s\n")

#Prepare features
feature_cols = ['url_length', 'num_dots', 'num_slashes', 'num_dashes', 'num_underscores',
                'num_digits', 'num_params', 'num_ampersands', 'num_equals', 'num_at',
                'digit_ratio', 'has_https', 'has_ip', 'has_hex', 'repeated_chars',
                'has_double_slash', 'has_suspicious', 'tld_length', 'is_common_tld',
                'num_subdomains', 'num_special', 'is_shortened']
X = df[feature_cols]
y = df['label_encoded']
print(f"Features: {X.shape[1]}")
print(f"Samples: {X.shape[0]}\n")

#Train-test split
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

#Single best model
print("Training LightGBM...")
train_start = time.time()
model = LGBMClassifier(
    n_estimators=100,
    max_depth=7,
    learning_rate=0.1,
    num_leaves=31,
    random_state=42,
    verbose=-1,
    n_jobs=-1,
    force_col_wise=True
)
model.fit(X_train, y_train)
train_time = time.time() - train_start

#Predictions
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Training completed in {train_time:.2f}s")
print(f"\n{'='*60}")
print(f"ACCURACY: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"{'='*60}\n")

#Classification
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

#Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix', fontsize=12, fontweight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=80)
plt.close()
print("Confusion matrix saved\n")

#Save model and label encoder
joblib.dump(model, 'lightgbm_model.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

#Summary
total_time = time.time() - start_time
print("\n" + "="*60)
print("PERFORMANCE SUMMARY")
print("="*60)
print(f"Dataset size:      {len(df):,} URLs")
print(f"Features:          {len(feature_cols)}")
print(f"Accuracy:          {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Training time:     {train_time:.2f}s")
print(f"Total time:        {total_time:.2f}s ({total_time/60:.2f} min)")
print(f"Processing speed:  {len(df)/total_time:.0f} URLs/second")
print("="*60)

if total_time < 60:
    print(f"\nSUCCESS! Completed in {total_time:.2f} seconds")
else:
    print(f"\nTook {total_time/60:.2f} minutes")

Dataset shape: (99999, 2)
Time: 0.22s

Extracting features (vectorized)...
Features extracted in 2.22s
Total time: 2.64s

Features: 22
Samples: 99999

Splitting data...
Training LightGBM...
Training completed in 1.67s

ACCURACY: 0.9662 (96.62%)

Classification Report:
              precision    recall  f1-score   support

      benign       0.98      0.99      0.98     14652
  defacement       0.93      0.94      0.93      3670
     malware       0.91      0.74      0.82       486
    phishing       0.93      0.89      0.91      1192

    accuracy                           0.97     20000
   macro avg       0.94      0.89      0.91     20000
weighted avg       0.97      0.97      0.97     20000

Confusion matrix saved


PERFORMANCE SUMMARY
Dataset size:      99,999 URLs
Features:          22
Accuracy:          0.9662 (96.62%)
Training time:     1.67s
Total time:        4.84s (0.08 min)
Processing speed:  20682 URLs/second

SUCCESS! Completed in 4.84 seconds
