# Machine Learning-based Phishing Detection

In [None]:
import pandas as pd

# Read the raw URL dataset from CSV into a pandas DataFrame.
# NOTE(review): the path is absolute ("/content/..."), presumably the Colab
# working directory — confirm or adjust before running in another environment.
df_raw = pd.read_csv("/content/High_Risk_URL_Dataset.csv")

In [None]:
# Preview the first 10 rows of the raw data.
df_raw.head(10)

In [None]:
# Read the structured and labeled data into a DataFrame.
# Later cells treat the 'label' column as the target and every other column as a feature.
# NOTE(review): absolute "/content/..." path, presumably Colab — confirm before running elsewhere.
df = pd.read_csv("/content/structured-labeled-phishing-URL-data.csv")

In [None]:
# Print a summary of the labeled frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Preview the first 10 rows of the structured, labeled data.
df.head(10)

In [None]:
# List the column names of the labeled frame.
df.columns

In [None]:
# Count how many rows belong to each value of the 'label' column.
df['label'].value_counts()

## Between raw data & structured-labeled data
- Data cleaning
  - Duplicates, missing values
- Feature extraction
  - Run feature functions and obtain a numerical value for each feature
  - e.g., the number of digits, words, or '?' characters; whether the URL has a query or a subdomain; the length of the path, subdomain, host, etc.
- Labeling
  - Binary: 1 (Phishing) | 0 (Legitimate)

## Prepare train and test data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# One seed for both the shuffle and the split, so a fresh "Restart & Run All"
# reproduces exactly the same train/test partition.
RANDOM_STATE = 42

# Shuffle the DataFrame.
# The shuffle must be seeded too: an unseeded shuffle changes the row order on
# every run, so the seeded split below would still select different rows.
# NOTE: this rebinds `df`; re-running only this cell shuffles the already
# shuffled frame, so restart the kernel for a reproducible result.
df = shuffle(df, random_state=RANDOM_STATE)

# Separate features (X) and target variable (y)
X = df.drop(columns='label')
y = df['label']

# Split the data into training and testing sets, maintaining class proportions
# (stratify=y keeps the label ratio the same in both splits).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Verify class distribution in the splits (optional)
print("\nClass Distribution in Training Set:")
print(y_train.value_counts())

print("\nClass Distribution in Testing Set:")
print(y_test.value_counts())


In [None]:
# Show the dimensions of the training features and the training labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

## Train two ML models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Build and train the Random Forest classifier in one step (fit returns the estimator itself).
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Build and train the Logistic Regression classifier; max_iter is raised to 1000.
lr_classifier = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Display the fitted Logistic Regression estimator as the cell output.
lr_classifier


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def _evaluate_predictions(title, y_true, y_pred):
    """Print and return the headline classification metrics for one model.

    Args:
        title: Header line printed before the metrics.
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model for the same rows.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(title)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    return accuracy, precision, recall, f1


# Make predictions on the test set for Random Forest and evaluate them
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy, rf_precision, rf_recall, rf_f1 = _evaluate_predictions(
    "Random Forest Performance:", y_test, rf_predictions
)

# Make predictions on the test set for Logistic Regression and evaluate them
# (the leading newline in the title separates the two reports in the output)
lr_predictions = lr_classifier.predict(X_test)
lr_accuracy, lr_precision, lr_recall, lr_f1 = _evaluate_predictions(
    "\nLogistic Regression Performance:", y_test, lr_predictions
)


In [None]:
# prompt: classification report

from sklearn.metrics import classification_report

# Generate and print the classification report for each model, Random Forest first.
for report_title, model_predictions in (
    ("\nRandom Forest Classification Report:", rf_predictions),
    ("\nLogistic Regression Classification Report:", lr_predictions),
):
    print(report_title)
    print(classification_report(y_test, model_predictions))


## Save the model

In [None]:
# prompt: save the rf_classifier

import joblib

# Save the trained Random Forest classifier to a file.
# The relative filename means it is written to the current working directory.
joblib.dump(rf_classifier, 'rf_classifier_model.pkl')


In [None]:
# Load the saved Random Forest model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a source you trust.
loaded_rf_model = joblib.load('rf_classifier_model.pkl')

In [None]:
sample_index = 54  # Choose an index from X_train
# The list indexer [[...]] keeps the row as a one-row DataFrame (not a Series),
# which is the 2-D shape passed to predict() below.
# NOTE(review): this row comes from the training split, so the model has already
# seen it — consider picking from X_test for a more meaningful sanity check.
sample = X_train.iloc[[sample_index]]
sample

In [None]:
# True label of the same row as `sample`: reuse `sample_index` rather than
# repeating the literal position, so the sample and its label cannot drift apart.
sample_label = y_train.iloc[sample_index]
sample_label

In [None]:
# Make a prediction using the loaded model
# predict() returns one entry per input row; `sample` has a single row, so [0] is its label.
prediction = loaded_rf_model.predict(sample)
print(f"Prediction for the sample: {prediction[0]}")

What is the **problem** in this process?

Extracting some of the features is too costly for real-time applications. How can we minimize the total cost of the feature extraction process within the context of an ML pipeline?