In [None]:
import pandas as pd
import re
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
from sklearn.inspection import permutation_importance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report

# Seed constant for reproducible runs (referenced by the XGBoost cell).
RANDOM_STATE = 30

# Apply the custom matplotlib style sheet; the path is relative to the
# directory the notebook is run from.
plt.style.use('./deeplearning.mplstyle')

In [None]:
# Load the dataset; later cells read its 'Type' (label) and 'Content' (text) columns.
# Alternative input file: "training_malicious_url.csv"
dataset_path = "combined_dataset.csv"
df = pd.read_csv(dataset_path)

In [None]:
# Encode the 'Type' labels as integers. The fitted encoder is kept so that
# model predictions can be mapped back to label names later on.
label_encoder = LabelEncoder().fit(df['Type'])
y = label_encoder.transform(df['Type'])

# Show the learned label order (index in this array == encoded integer).
print(label_encoder.classes_)

In [None]:
# Hold out 30% of the rows for testing. The split is done on the raw text so
# the vectorizer below never sees test documents while it is being fitted.
X_train, X_test, y_train, y_test = train_test_split(df[['Content']], y, test_size=0.3, random_state=42)

# Learn the TF-IDF vocabulary and weights from the training text only, then
# apply that fitted transform to the test text.
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train['Content'])
X_test_transformed = vectorizer.transform(X_test['Content'])

# Optional labelled views of the TF-IDF matrices (one column per vocabulary term).
# They are built as sparse-backed DataFrames: calling .toarray() would allocate a
# dense (n_documents x n_terms) array, which can exhaust memory on a text corpus.
feature_names = vectorizer.get_feature_names_out()
X_train_df = pd.DataFrame.sparse.from_spmatrix(X_train_transformed, columns=feature_names)
X_test_df = pd.DataFrame.sparse.from_spmatrix(X_test_transformed, columns=feature_names)

In [None]:
# Baseline classifier: random forest with default hyperparameters and a fixed seed.
model = RandomForestClassifier(random_state=42)

# 5-fold cross-validated accuracy on the training matrix.
# NOTE(review): the TF-IDF vectorizer was fitted on the whole training split
# before these folds are made, so each validation fold contributed to the
# vocabulary/IDF weights. Consider cross-validating a vectorizer+model Pipeline
# on the raw text if a stricter estimate is needed.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train_transformed,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

# Per-fold scores followed by their mean.
print(f"Cross-validation scores: {cv_scores}")
print(f"Average cross-validation score: {cv_scores.mean():.4f}")

In [None]:

# Fit the forest on the full training matrix; this fitted `model` is the one
# the single-sample prediction cells call later.
model.fit(X_train_transformed, y_train)

# Out-of-fold predictions (5 folds) for every training row.
predictions = cross_val_predict(estimator=model, X=X_train_transformed, y=y_train, cv=5)

# Confusion matrix of true vs. out-of-fold labels (kept for inspection, not printed).
conf_matrix = confusion_matrix(y_true=y_train, y_pred=predictions)

# Per-class precision / recall / F1 computed from the out-of-fold predictions.
class_report = classification_report(y_true=y_train, y_pred=predictions)
print("Classification Report:\n", class_report)

In [None]:
# Positional 80/20 split of the training data: the first `n` rows are used to
# fit the XGBoost model, the remaining rows to monitor it during training.
n = int(X_train_transformed.shape[0] * 0.8)

# Feature rows
X_train_fit = X_train_transformed[:n]
X_train_eval = X_train_transformed[n:]

# Matching label rows
y_train_fit = y_train[:n]
y_train_eval = y_train[n:]


In [None]:
# Validation data that XGBoost scores after each boosting round.
eval_set = [(X_train_eval, y_train_eval)]

# Gradient-boosted trees: at most 200 rounds at learning rate 0.05. Training
# stops early once the eval_set score has not improved for 10 consecutive
# rounds, which limits overfitting to the fitting rows.
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    verbosity=1,
    random_state=RANDOM_STATE,
    early_stopping_rounds=10,
)

# Pass the eval_set defined above (previously an identical list was rebuilt
# inline and the variable went unused).
xgb_model.fit(X_train_fit, y_train_fit, eval_set=eval_set)

In [None]:
# Display the boosting round that early stopping selected as best on the eval_set.
xgb_model.best_iteration

In [None]:
# Accuracy of the XGBoost model on the training and held-out test matrices.
# scikit-learn metrics take (y_true, y_pred) in that order; the arguments were
# previously reversed, which happens to be harmless for accuracy only.
# Note: X_train_transformed also contains the rows used as the early-stopping
# eval_set, so the "train" figure covers both fitting and evaluation rows.
train_accuracy = accuracy_score(y_train, xgb_model.predict(X_train_transformed))
print(f"Metrics train:\n\tAccuracy score: {train_accuracy:.4f}")

test_accuracy = accuracy_score(y_test, xgb_model.predict(X_test_transformed))
print(f"Metrics test:\n\tAccuracy score: {test_accuracy:.4f}")

In [None]:
# Confusion matrix of the XGBoost predictions on the held-out test set.
y_test_pred = xgb_model.predict(X_test_transformed)
cm = confusion_matrix(y_test, y_test_pred)

# Annotated heatmap with the original label names on both axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Per-class precision / recall / F1 for XGBoost on the held-out test set.
# Rows are labelled with the original class names, matching the confusion-matrix
# heatmap, instead of the bare encoded integers. `labels` pins the row order to
# the encoder's integer codes so each name lines up with its class.
class_names = [str(name) for name in label_encoder.classes_]
print(
    classification_report(
        y_test,
        xgb_model.predict(X_test_transformed),
        labels=list(range(len(class_names))),
        target_names=class_names,
    )
)

In [148]:
# Single-URL check: push one raw string through the fitted vectorizer and both models.
# url = "https://eezxodullgeunn.webflow.io"  # Example phishing URL
url = "www.google.com"  # Sample URL to classify

# The vectorizer expects an iterable of documents, hence the one-element list.
url_transformed = vectorizer.transform([url])

# Encoded class predictions from the random forest and the XGBoost model.
random_prediction = model.predict(url_transformed)
xgb_prediction = xgb_model.predict(url_transformed)

# Map the encoded predictions back to the original label names.
random_predicted_class = label_encoder.inverse_transform(random_prediction)
xgb_predicted_class = label_encoder.inverse_transform(xgb_prediction)

# Report the predicted label from each model.
# NOTE(review): the recorded output labels this URL 'harmful' for both models —
# worth checking whether the training data covers bare benign domains.
print(f"Random Forest prediction for: '{url}' -> {random_predicted_class[0]}")
print(f"XGBoost Prediction for '{url}' -> {xgb_predicted_class[0]}")

Random Forest prediction for: 'www.google.com' -> harmful
XGBoost Prediction for 'www.google.com' -> harmful


In [None]:
# Single-message check: classify one SMS-style text with both fitted models.
text = "u r subscribed 2 TEXTCOMP 250 wkly comp. 1st wk?s free question follows, subsequent wks charged@150p/msg.2 unsubscribe txt STOP 2 84128,custcare 08712405020"

# Requires the vectorizer and both models fitted in the earlier cells.
# The vectorizer expects an iterable of documents, hence the one-element list.
text_transformed = vectorizer.transform([text])

# Encoded class predictions from the random forest and the XGBoost model.
random_prediction = model.predict(text_transformed)
xgb_prediction = xgb_model.predict(text_transformed)

# Map the encoded predictions back to the original label names.
random_predicted_class = label_encoder.inverse_transform(random_prediction)
xgb_predicted_class = label_encoder.inverse_transform(xgb_prediction)

# Report the predicted label from each model.
print(f"Random Forest prediction for: '{text}' -> {random_predicted_class[0]}")
print(f"XGBoost Prediction for '{text}' -> {xgb_predicted_class[0]}")