In [1]:
# Load CSV file containing the database into Python
import pandas as pd

# Define the name of the file containing the database
filename = 'malicious_phish.csv'

# Load the CSV file into Python
print('Loading the dataset...')

data = pd.read_csv(filename)

print('Dataset Loaded Successfully!')

print(data.head())

Loading the dataset...
Dataset Loaded Successfully!
                                                 url        type
0                                   br-icloud.com.br    phishing
1                mp3raid.com/music/krizz_kaliko.html      benign
2                    bopsecrets.org/rexroth/cr/1.htm      benign
3  http://www.garage-pirenne.be/index.php?option=...  defacement
4  http://adventure-nicaragua.net/index.php?optio...  defacement


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Preprocess the URLs (e.g., tokenization)
# Here, we'll use TF-IDF vectorization to convert the URLs into numerical features
print('Preprocessing the URLs...')
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['url'])
y = data['type']
print('Preprocessing Completed!')
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression model
print('Training the model...')
model = LogisticRegression(max_iter=1000, verbose=1)
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print('Model trained successfully!')
print(f'Accuracy: {accuracy:.4f}')

# Print classification report
print(classification_report(y_test, y_pred))

# Save the trained model for later use
model_path = 'logistic_regression_model.pkl'
joblib.dump(model, model_path)


Preprocessing the URLs...
Preprocessing Completed!
Training the model...
Model trained successfully!
Accuracy: 0.9489
              precision    recall  f1-score   support

      benign       0.94      0.99      0.96     85778
  defacement       0.99      1.00      0.99     19104
     malware       0.99      0.95      0.97      6521
    phishing       0.92      0.73      0.81     18836

    accuracy                           0.95    130239
   macro avg       0.96      0.91      0.93    130239
weighted avg       0.95      0.95      0.95    130239



['logistic_regression_model.pkl']