# URL Scanner ML Model

This notebook contains sample code for the machine learning model used for URL scanning: feature extraction, model training, and prediction.

## Instructions

1. Replace this notebook with your trained model
2. Update the API endpoint in the environment variables
3. Configure the model settings in `ml-config.ts`

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Sample code for URL feature extraction
def extract_features(url):
    """Build a dictionary of simple lexical features for a URL.

    Args:
        url: The URL string to analyse.

    Returns:
        A dict mapping feature name to an integer. ``url_length`` is the
        number of characters in ``url``; every other entry is a 0/1 flag.
    """
    # Imported locally because the notebook's import cell does not bring in
    # ``re``; without it the ``has_ip`` check raised NameError on every call.
    import re

    features = {
        'url_length': len(url),
        # Four dot-separated digit groups anywhere in the URL (the octets
        # are not range-checked).
        'has_ip': 1 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 0,
        'has_at_symbol': 1 if '@' in url else 0,
        # Only look past the first 7 characters -- presumably so that the
        # "//" of a leading "http://" is not counted.
        'has_double_slash': 1 if '//' in url[7:] else 0,
        'has_dash': 1 if '-' in url else 0,
        # More than one dot in the URL; stored as 0/1 like the other flags
        # rather than as a bool.
        'has_subdomain': 1 if url.count('.') > 1 else 0,
        # Add more features as needed
    }
    return features

# Sample code for model training
def train_model(X, y):
    """Fit a random-forest classifier and print a hold-out report.

    Args:
        X: Feature matrix passed to ``train_test_split``.
        y: Labels aligned with ``X``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # Hold out 20% of the rows for evaluation; the seed is fixed so the
    # split is reproducible.
    partitions = train_test_split(X, y, test_size=0.2, random_state=42)
    features_train, features_test, labels_train, labels_test = partitions

    # 100 trees, seeded for reproducibility.
    classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    classifier.fit(features_train, labels_train)

    # Report per-class metrics on the held-out rows.
    held_out_predictions = classifier.predict(features_test)
    report = classification_report(labels_test, held_out_predictions)
    print(report)

    return classifier

# Sample code for prediction
def predict(url, model):
    """Classify a single URL with a fitted model.

    Args:
        url: The URL string to score.
        model: An object exposing ``predict`` and ``predict_proba``.

    Returns:
        A dict with ``is_malicious`` (the predicted label cast to bool),
        ``confidence`` (the second column of ``predict_proba`` for this row,
        as a float) and ``features`` (the extracted feature dict).
    """
    url_features = extract_features(url)

    # Single-row frame so the model sees named feature columns.
    row = pd.DataFrame([url_features])

    label = model.predict(row)[0]
    # Index 1 selects the second class column of the probability output.
    second_class_probability = model.predict_proba(row)[0][1]

    result = {}
    result['is_malicious'] = bool(label)
    result['confidence'] = float(second_class_probability)
    result['features'] = url_features
    return result