In [None]:
import os
import re
import warnings
from urllib.parse import urlparse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

# Default plot styling; individual figures below may override the size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Load data (later cells read the 'url' and 'type' columns from this frame)
df = pd.read_csv('../data/malicious_phish.csv')

# Initial inspection
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nClass Distribution:")
print(df['type'].value_counts())

In [None]:
# Bar chart of how many URLs fall into each class
class_counts = df['type'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
class_counts.plot(kind='bar', color='steelblue', ax=ax)
ax.set_title('Distribution of URL Types', fontsize=16, fontweight='bold')
ax.set_xlabel('URL Type')
ax.set_ylabel('Count')
# Tilt the class names so they stay readable under the bars
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# The same distribution expressed as percentages of all rows
class_share = df['type'].value_counts(normalize=True) * 100
print("\nPercentage Distribution:")
print(class_share)

In [None]:
# Ordered (feature name, character) pairs for the plain character-count features.
_CHAR_COUNT_FEATURES = (
    ('num_dots', '.'),
    ('num_hyphens', '-'),
    ('num_underscores', '_'),
    ('num_slashes', '/'),
    ('num_questionmarks', '?'),
    ('num_equals', '='),
    ('num_at', '@'),
    ('num_ampersand', '&'),
    ('num_exclamation', '!'),
    ('num_space', ' '),
    ('num_tilde', '~'),
    ('num_comma', ','),
    ('num_plus', '+'),
    ('num_asterisk', '*'),
    ('num_hashtag', '#'),
    ('num_dollar', '$'),
    ('num_percent', '%'),
)

# Substrings whose presence anywhere in the lower-cased URL sets has_suspicious_words.
_SUSPICIOUS_WORDS = (
    'login', 'signin', 'account', 'verify', 'secure', 'update', 'confirm',
    'banking', 'paypal', 'ebay', 'amazon',
)

# Every key of the returned dict, in column order.
_FEATURE_NAMES = (
    ('url_length', 'domain_length', 'path_length')
    + tuple(name for name, _char in _CHAR_COUNT_FEATURES)
    + ('has_ip', 'has_port', 'is_https',
       'num_digits', 'num_letters',
       'has_suspicious_words',
       'num_subdomains',
       'num_params')
)

# Matches a leading "scheme://" prefix such as "http://" or "ftp://".
_EXPLICIT_SCHEME = re.compile(r'^[A-Za-z][A-Za-z0-9+.\-]*://')


def _parse_url(url):
    """Parse ``url`` with urlparse, returning None when it cannot be parsed."""
    if _EXPLICIT_SCHEME.match(url) or url.startswith('//'):
        candidates = (url,)
    else:
        # urlparse only recognises a host when it is introduced by '//';
        # without the prefix a bare "example.com/path" lands entirely in
        # `.path` and `.netloc` is empty. Fall back to the raw string if the
        # prefixed form is rejected.
        candidates = ('//' + url, url)

    for candidate in candidates:
        try:
            return urlparse(candidate)
        except ValueError:
            # e.g. unbalanced '[' / ']' in the host ("Invalid IPv6 URL")
            continue
    return None


def extract_features(url):
    """Extract numeric lexical features from a URL string.

    Parameters
    ----------
    url : str
        The URL to analyse. It may or may not start with a scheme; a URL
        without one is parsed as if it began with ``//`` so that its host is
        still recognised.

    Returns
    -------
    dict
        A mapping with exactly the keys in ``_FEATURE_NAMES`` (always in the
        same order): length features, per-character counts, ``has_ip`` /
        ``has_port`` / ``is_https`` flags, digit and letter counts,
        ``has_suspicious_words``, ``num_subdomains`` and ``num_params``.
        If ``url`` is not a string or cannot be parsed, every value is 0, so
        the resulting DataFrame never contains missing columns or NaN rows.
    """
    features = dict.fromkeys(_FEATURE_NAMES, 0)

    # Missing values (None / NaN) and other non-string cells have no features.
    if not isinstance(url, str):
        return features

    parsed = _parse_url(url)
    if parsed is None:
        return features

    # Basic features
    features['url_length'] = len(url)
    features['domain_length'] = len(parsed.netloc)
    features['path_length'] = len(parsed.path)

    # Special characters (counted over the raw URL string)
    for feature_name, char in _CHAR_COUNT_FEATURES:
        features[feature_name] = url.count(char)

    # URL components
    # has_ip looks for a dotted-quad pattern anywhere in the URL, not only in the host.
    features['has_ip'] = 1 if re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', url) else 0
    # A ':' in the netloc is treated as a port unless userinfo ('@') is present.
    features['has_port'] = 1 if ':' in parsed.netloc and '@' not in parsed.netloc else 0
    features['is_https'] = 1 if parsed.scheme == 'https' else 0

    # Digits and letters
    features['num_digits'] = sum(c.isdigit() for c in url)
    features['num_letters'] = sum(c.isalpha() for c in url)

    # Suspicious patterns
    lowered = url.lower()
    features['has_suspicious_words'] = 1 if any(word in lowered for word in _SUSPICIOUS_WORDS) else 0

    # Domain features: everything beyond the last two dot-separated labels
    domain_tokens = parsed.netloc.split('.')
    features['num_subdomains'] = len(domain_tokens) - 2 if len(domain_tokens) > 2 else 0

    # Query parameters
    features['num_params'] = len(parsed.query.split('&')) if parsed.query else 0

    return features

# Apply feature extraction
print("Extracting features from URLs...")
# Build the frame once from the list of per-URL dicts. The previous
# `.apply(pd.Series)` constructed a separate Series for every row, which is
# needlessly slow on a large URL list.
feature_records = df['url'].apply(extract_features).tolist()
features_df = pd.DataFrame(feature_records, index=df.index)
# Carry the label across; the shared index keeps rows aligned with `df`.
features_df['type'] = df['type']

print("\nFeatures extracted:")
print(features_df.head())
print("\nFeature statistics:")
print(features_df.describe())

In [None]:
# Correlation heatmap
plt.figure(figsize=(16, 12))
# Drop both label columns so the heatmap stays feature-only even when this
# cell is re-run after `type_encoded` has been added further down.
correlation = features_df.drop(columns=['type', 'type_encoded'], errors='ignore').corr()
sns.heatmap(correlation, annot=False, cmap='coolwarm', center=0)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Top correlations with target
# First encode target
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
features_df['type_encoded'] = le.fit_transform(features_df['type'])

# Calculate correlations
# The encoded target is excluded from the candidate columns: it correlates
# perfectly with itself and would otherwise take the top slot of the ranking.
# NOTE(review): the integer codes are an arbitrary ordering of the classes, so
# this correlation is only a rough ranking heuristic - confirm it is intended.
correlations = (
    features_df
    .drop(columns=['type', 'type_encoded'])
    .corrwith(features_df['type_encoded'])
    .abs()
)
top_features = correlations.sort_values(ascending=False).head(15)

plt.figure(figsize=(10, 8))
top_features.plot(kind='barh', color='teal')
plt.title('Top 15 Features Correlated with Phishing', fontsize=14, fontweight='bold')
plt.xlabel('Absolute Correlation')
plt.tight_layout()
plt.show()

In [None]:
# Plot distributions for top features
# Exclude the encoded target if it is present in `correlations`; plotting the
# label against itself carries no information.
top_5_features = (
    correlations
    .drop(labels=['type_encoded'], errors='ignore')
    .sort_values(ascending=False)
    .head(5)
    .index.tolist()
)

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for idx, feature in enumerate(top_5_features):
    sns.boxplot(data=features_df, x='type', y=feature, ax=axes[idx])
    axes[idx].set_title(f'{feature} by URL Type')
    axes[idx].tick_params(axis='x', rotation=45)

# The 2x3 grid has more cells than features; hide the ones left empty.
for unused_ax in axes[len(top_5_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import joblib

# Prepare features and target
# Rows whose URL could not be featurised may surface as NaN; treat them as
# all-zero features (the extraction step's stated fallback) so model fitting
# does not fail on missing values.
X = features_df.drop(['type', 'type_encoded'], axis=1).fillna(0)
y = features_df['type_encoded']

# Split data (80-20 split), stratified so each class keeps its share in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Scale features: fit on the training split only, then reuse for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save scaler for deployment
# Create the output directory first; joblib.dump does not create missing folders.
os.makedirs('../models', exist_ok=True)
joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(le, '../models/label_encoder.pkl')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import time

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42, eval_metric='mlogloss')
}

# Train and evaluate each model
# results[name] holds the fitted model, its test accuracy, the test
# predictions / class probabilities, and the time spent in fit().
results = {}

for name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")
    print(f"{'='*50}")

    # Train - time only the fit call so the reported "Training Time" is not
    # inflated by prediction and scoring on the test set.
    start_time = time.perf_counter()
    model.fit(X_train_scaled, y_train)
    training_time = time.perf_counter() - start_time

    # Predict
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)

    # Store results
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
        'training_time': training_time
    }

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Training Time: {training_time:.2f} seconds")
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))

In [None]:
# Compare accuracies
# One row per trained model, in the order the models were trained.
comparison_df = pd.DataFrame({
    'Model': list(results),
    'Accuracy': [results[m]['accuracy'] for m in results],
    'Training Time (s)': [results[m]['training_time'] for m in results]
})

print("\n" + "="*50)
print("MODEL COMPARISON")
print("="*50)
print(comparison_df.to_string(index=False))

# Plot comparison
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Accuracy comparison
axes[0].bar(comparison_df['Model'], comparison_df['Accuracy'], color='skyblue')
axes[0].set_title('Model Accuracy Comparison', fontweight='bold')
axes[0].set_ylabel('Accuracy')
# Zoom in on 0.8-1.0 when every model clears 0.8; otherwise lower the floor so
# a weaker model's bar is not clipped out of view.
lowest_accuracy = comparison_df['Accuracy'].min()
accuracy_floor = 0.8 if lowest_accuracy > 0.8 else max(0.0, lowest_accuracy - 0.05)
axes[0].set_ylim([accuracy_floor, 1.0])
axes[0].tick_params(axis='x', rotation=45)

# Training time comparison
axes[1].bar(comparison_df['Model'], comparison_df['Training Time (s)'], color='salmon')
axes[1].set_title('Training Time Comparison', fontweight='bold')
axes[1].set_ylabel('Time (seconds)')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Select best model (highest accuracy)
# NOTE(review): the winner is picked using accuracy on the test split, so the
# reported score is optimistic as an estimate for unseen data - confirm this
# is acceptable or select on a separate validation split.
best_model_name = max(results, key=lambda x: results[x]['accuracy'])
best_model = results[best_model_name]['model']
y_pred_best = results[best_model_name]['y_pred']

print(f"\nBest Model: {best_model_name}")
print(f"Accuracy: {results[best_model_name]['accuracy']:.4f}")

# Confusion Matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred_best)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=le.classes_, 
            yticklabels=le.classes_)
plt.title(f'Confusion Matrix - {best_model_name}', fontsize=16, fontweight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# Save best model
# Create the output directory first; joblib.dump does not create missing folders.
os.makedirs('../models', exist_ok=True)
joblib.dump(best_model, '../models/best_model.pkl')
print(f"\nBest model saved to '../models/best_model.pkl'")

In [None]:
# Importances are only plotted when one of the two tree-based models won
if best_model_name in ('Random Forest', 'XGBoost'):
    importance = best_model.feature_importances_

    # Rank features by importance and keep the 15 strongest
    feature_importance_df = (
        pd.DataFrame({'Feature': X.columns, 'Importance': importance})
        .sort_values('Importance', ascending=False)
        .head(15)
    )

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(
        feature_importance_df['Feature'],
        feature_importance_df['Importance'],
        color='green',
    )
    ax.set_xlabel('Importance')
    ax.set_title(f'Top 15 Feature Importance - {best_model_name}', fontweight='bold')
    # Flip the axis so the most important feature sits at the top
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()