In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Assuming df is your initial DataFrame
# Load the dataset (replace with your actual dataset)
df = pd.read_csv("malicious_phish.csv")

# Feature Engineering and Data Cleaning
df['http_presence'] = df['url'].apply(lambda x: 1 if 'http' in x else 0)
df['url_length'] = df['url'].apply(len)
df['count_digits'] = df['url'].apply(lambda x: sum(c.isdigit() for c in x))
df['count_letters'] = df['url'].apply(lambda x: sum(c.isalpha() for c in x))

# Feature Extraction
X = df[['http_presence', 'url_length', 'count_digits', 'count_letters']]
y = df['type']

# Encode the 'type' labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display classification report
print(classification_report(y_test, y_pred))

# Function to predict the type of a new URL
def predict_url_type(model, input_url):
    http_presence = 1 if 'http' in input_url else 0
    url_length = len(input_url)
    count_digits = sum(c.isdigit() for c in input_url)
    count_letters = sum(c.isalpha() for c in input_url)

    # Make prediction
    new_data = [[http_presence, url_length, count_digits, count_letters]]
    prediction = model.predict(new_data)

    # Decode the label back to 'type'
    predicted_type = label_encoder.inverse_transform(prediction)[0]

    return predicted_type

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.79
              precision    recall  f1-score   support

           0       0.84      0.93      0.89     85778
           1       0.63      0.95      0.75     19104
           2       0.80      0.51      0.62      6521
           3       0.39      0.05      0.09     18836

    accuracy                           0.79    130239
   macro avg       0.66      0.61      0.59    130239
weighted avg       0.74      0.79      0.74    130239



In [12]:
while True:
    # User input for a new URL
    user_input_url = input("Enter a URL (or 'exit' to quit): ")

    # Check if the user wants to exit
    if user_input_url.lower() == 'exit':
        print("Exiting...")
        break

    # Get prediction using the best model
    prediction_result = predict_url_type(model, user_input_url)

    # Display the prediction with colored text
    if prediction_result == 'benign':
        print("The model predicts that the URL is \033[92m NON-MALICIOUS\033[0m")  # Green color
    else:
        print("The model predicts that the URL is \033[91m MALICIOUS\033[0m")  # Red color


Enter a URL (or 'exit' to quit): www.apple.com




The model predicts that the URL is [92m NON-MALICIOUS[0m
Enter a URL (or 'exit' to quit): exit
Exiting...
