In [25]:
import numpy as np
import pandas as pd
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split

In [15]:
# Load the data
df = pd.read_csv("./data/malicious_phish.csv")

In [20]:
# View the data 
print(df.shape)

# Rename type column to target
df = df.rename(columns={'type': 'target'})

# There appears to be a large class imbalance between phishing and other.
print(df['target'].value_counts())

# We will replace "malware", "phishing", and "defacement" with "suspicious"
df['target'] = df['target'].replace(['phishing', 'malware', 'defacement'], 'suspicious')

print(df['target'].value_counts())


(651191, 2)
benign        428103
suspicious    223088
Name: target, dtype: int64
benign        428103
suspicious    223088
Name: target, dtype: int64


In [27]:
# We will extract features from the URL

# Function to extract features from URLs
def extract_url_features(url):
    parsed_url = urlparse(url)
    features = {
        'url_length': len(url),
        'domain_length': len(parsed_url.hostname) if parsed_url.hostname else 0,
        'path_length': len(parsed_url.path),
        'num_dots': url.count('.'),
        'num_special_chars': sum([url.count(char) for char in '/-_?=']),
        'is_ip_address': int(parsed_url.hostname.replace('.', '').isdigit()) if parsed_url.hostname else 0,
        'num_subdomains': len(parsed_url.hostname.split('.')) - 2 if parsed_url.hostname else 0,
    }
    return pd.Series(features)

# Apply feature extraction function to 'url' column
url_features = df['url'].apply(extract_url_features)

# Concatenate extracted features with original DataFrame
df = pd.concat([df, url_features], axis=1)

# Drop the original 'url' column 
df = df.drop(columns=['url'])

df.head()




Unnamed: 0,target,url_length,domain_length,path_length,num_dots,num_special_chars,is_ip_address,num_subdomains
0,suspicious,16,0,16,2,1,0,0
1,benign,35,0,35,2,3,0,0
2,benign,31,0,31,2,3,0,0
3,suspicious,88,21,10,3,11,0,1
4,suspicious,235,23,10,2,9,0,0


In [30]:
# Split the dataset
train_df, test_df = train_test_split(df, test_size=0.30)

X_train = train_df.drop(columns="target")
y_train = train_df["target"]

X_test = test_df.drop(columns="target")
y_test = test_df["target"]



In [37]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.dummy import DummyClassifier

# Dummy classifier
dummy_clf = DummyClassifier(strategy='most_frequent')

# Fit the DummyClassifier on the training data
dummy_clf.fit(X_train, y_train)

# Get dummy prediction on training set
dummy_pred = dummy_clf.predict(X_train)

dummy_score = accuracy_score(dummy_pred, y_train)

print(dummy_score)

0.6572077932049676


In [31]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the parameter distributions for Random Search
param_dist = {
    'n_estimators': randint(50, 100),  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30, 40, 50],  # Maximum depth of the trees
    'min_samples_split': randint(2, 20),  # Minimum number of samples required to split an internal node
    'min_samples_leaf': randint(1, 20),  # Minimum number of samples required to be a leaf node
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider at every split
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Instantiate the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_dist, n_iter=100, cv=5, 
                                   scoring='accuracy', random_state=42, n_jobs=-1)

# Fit the RandomizedSearchCV object to the data
random_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(f'Best parameters: {random_search.best_params_}')
print(f'Best cross-validation score: {random_search.best_score_:.2f}')

140 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/envs/cpsc330/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/envs/cpsc330/lib/python3.10/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/opt/miniconda3/envs/cpsc330/lib/python3.10/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/miniconda3/envs/cpsc330/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 95

Best parameters: {'bootstrap': False, 'max_depth': 40, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 13, 'n_estimators': 79}
Best cross-validation score: 0.93


In [33]:
# Best parameters: {'bootstrap': False, 'max_depth': 40, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 13, 'n_estimators': 79}

# Get model with best hyperparameters
best_rf = random_search.best_estimator_
best_rf.fit(X_train, y_train)


In [35]:
# Predict on test data
y_pred = best_rf.predict(X_test)

# Calculate accuracy (or any other metric)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on test set: {accuracy:.2f}')

Accuracy on test set: 0.93


In [41]:
# Fit model on entire training set before saving it
deploy_model = best_rf
X_df = df.drop(columns = "target")
y_df = df["target"]
deploy_model.fit(X_df, y_df)


In [45]:
# Export model using joblib
import joblib

joblib.dump(deploy_model, 'malicious_url_model.pkl')


['malicious_url_model.pkl']