In [None]:
!pip install scikit-learn==1.2.1



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Train and evaluate a TF-IDF + logistic-regression classifier that maps a raw
# URL string to the label stored in the `type` column, then score two sample
# URLs with the fitted model.

# Tunables shared by the split and the solver so a re-run is repeatable.
SEED = 42
TEST_FRACTION = 0.2

# Load the dataset. Rows without a URL or a label cannot be vectorised or
# used as a target, so drop them here rather than failing later during fit.
data = pd.read_csv("/content/malicious_phish.csv").dropna(subset=['url', 'type'])

# Feature extraction: the URL text is the only input, `type` is the target.
X = data['url']
y = data['type']

# Splitting the dataset into training and testing sets.
# The classes are imbalanced, so stratify on the label to keep every class at
# the same proportion in both partitions; otherwise minority-class metrics
# depend on how the random split happened to fall.
# NOTE: this changes which rows land in the test set, so metrics saved from an
# earlier (non-stratified) run will differ slightly after re-running.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SEED,
    stratify=y,
)

# Convert URLs into features using TF-IDF.
# The vocabulary and IDF weights are fitted on the training split only, and the
# test split is merely transformed, so no test information leaks into training.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train Logistic Regression classifier (max_iter raised above the default so
# the solver has room to converge on the large sparse feature matrix).
lr_model = LogisticRegression(max_iter=1000, random_state=SEED)
lr_model.fit(X_train_vect, y_train)

# Evaluate Logistic Regression model on the held-out split.
lr_y_pred = lr_model.predict(X_test_vect)
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_y_pred))
print(classification_report(y_test, lr_y_pred))

# Predicting new URLs: they must go through the *same fitted* vectorizer so the
# feature columns line up with what the model was trained on.
# NOTE(review): a previously saved run labelled "http://example.com" as
# 'defacement'. Presumably the scheme/host tokens are correlated with
# non-benign classes in the training data -- verify before relying on
# predictions for URLs formatted differently from the dataset.
new_urls = ["http://example.com", "http://malicious-site.com"]
new_urls_vect = vectorizer.transform(new_urls)
lr_predictions = lr_model.predict(new_urls_vect)
print("Logistic Regression Predictions:", lr_predictions)


Logistic Regression Accuracy: 0.9488555655372046
              precision    recall  f1-score   support

      benign       0.95      0.98      0.97     85778
  defacement       0.96      1.00      0.98     19104
     malware       0.99      0.95      0.97      6521
    phishing       0.90      0.75      0.82     18836

    accuracy                           0.95    130239
   macro avg       0.95      0.92      0.93    130239
weighted avg       0.95      0.95      0.95    130239

Logistic Regression Predictions: ['defacement' 'phishing']


In [None]:
import joblib

# Persist the two fitted artefacts side by side. Both are required at
# inference time: the vectorizer turns a URL into the feature row that the
# classifier expects, so they should always be saved and loaded as a pair.
model_file = 'logistic_regression_model.pkl'
vectorizer_file = 'tfidf_vectorizer.pkl'

joblib.dump(lr_model, model_file)
# Kept as the final expression so the cell still echoes the written path.
joblib.dump(vectorizer, vectorizer_file)

['tfidf_vectorizer.pkl']