In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import joblib
from sklearn.metrics import precision_score, recall_score, f1_score

csv_file_path = 'combined_data.csv'
df = pd.read_csv(csv_file_path)
# X = df.drop(['Domain','Label'], axis=1)
# y = df['Label']

# Extract features from URLs (example: using TF-IDF)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Domain'])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

# Train a Decision Tree model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = model.predict(X_train)

# Evaluate the model on the training set
accuracy_train = accuracy_score(y_train, y_train_pred)

# Make predictions on the test set
y_test_pred = model.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
print(f"Training Accuracy: {accuracy_train * 100:.2f}%")
print(f"Test Accuracy: {accuracy_test * 100:.2f}%")

precision_dt = precision_score(y_test, y_test_pred)
recall_dt = recall_score(y_test, y_test_pred)
f1_dt = f1_score(y_test, y_test_pred)
print(f"Decision Tree Precision: {precision_dt:.2f}, Recall: {recall_dt:.2f}, F1 Score: {f1_dt:.2f}")

Training Accuracy: 100.00%
Test Accuracy: 91.00%
Decision Tree Precision: 0.85, Recall: 0.99, F1 Score: 0.91


In [58]:
#joblib.dump(model, 'decision_tree_model.joblib')

['decision_tree_model.joblib']

In [77]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

csv_file_path = 'combined_data.csv'
df = pd.read_csv(csv_file_path)
X = df.drop(['Domain','Label'], axis=1)
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm = SVC(kernel='linear', C=1.0, random_state=12)
svm.fit(X_train, y_train)

y_test_svm = svm.predict(X_test)
y_train_svm = svm.predict(X_train)

acc_train_svm = accuracy_score(y_train,y_train_svm)
acc_test_svm = accuracy_score(y_test,y_test_svm)

print(f"Training Accuracy: {acc_train_svm * 100:.2f}%")
print(f"Test Accuracy: {acc_test_svm * 100:.2f}%")


precision_svm = precision_score(y_test, y_test_svm)
recall_svm = recall_score(y_test, y_test_svm)
f1_svm = f1_score(y_test, y_test_svm)

print(f"SVM Precision: {precision_svm:.2f}, Recall: {recall_svm:.2f}, F1 Score: {f1_svm:.2f}")

Training Accuracy: 86.84%
Test Accuracy: 88.00%
SVM Precision: 1.00, Recall: 0.74, F1 Score: 0.85


In [78]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

csv_file_path = 'combined_data.csv'
df = pd.read_csv(csv_file_path)
X = df.drop(['Domain','Label'], axis=1)
y = df['Label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
y_train_pred = rf_classifier.predict(X_train)
y_test_pred = rf_classifier.predict(X_test)

# Calculate and print the training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")

# Calculate and print the test accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

precision_rf = precision_score(y_test, y_test_pred)
recall_rf = recall_score(y_test, y_test_pred)
f1_rf = f1_score(y_test, y_test_pred)

print(f"Random Forest Precision: {precision_rf:.2f}, Recall: {recall_rf:.2f}, F1 Score: {f1_rf:.2f}")


Training Accuracy: 92.86%
Test Accuracy: 91.00%
Random Forest Precision: 0.96, Recall: 0.84, F1 Score: 0.90


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

csv_file_path = 'combined_data.csv'
df = pd.read_csv(csv_file_path)
X = df.drop(['Domain', 'Label'], axis=1)
y = df['Label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Replace RandomForestClassifier with Logistic Regression
logistic_classifier = LogisticRegression(random_state=42)
logistic_classifier.fit(X_train, y_train)

y_train_pred = logistic_classifier.predict(X_train)
y_test_pred = logistic_classifier.predict(X_test)

# Calculate and print the training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")

# Calculate and print the test accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Calculate and print precision, recall, and F1 score for Logistic Regression
precision_logistic = precision_score(y_test, y_test_pred)
recall_logistic = recall_score(y_test, y_test_pred)
f1_logistic = f1_score(y_test, y_test_pred)

print(f"Logistic Regression Precision: {precision_logistic:.2f}, Recall: {recall_logistic:.2f}, F1 Score: {f1_logistic:.2f}")

Training Accuracy: 87.59%
Test Accuracy: 90.00%
Logistic Regression Precision: 1.00, Recall: 0.79, F1 Score: 0.88


In [63]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

csv_file_path = 'combined_data.csv'
df = pd.read_csv(csv_file_path)
X = df.drop(['Domain','Label'], axis=1)
y = df['Label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate the XGBoost classifier
xgb = XGBClassifier(learning_rate=0.4, max_depth=7)

# Fit the model to the training data
xgb.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = xgb.predict(X_train)

# Evaluate the model on the training set
accuracy_train = accuracy_score(y_train, y_train_pred)

# Make predictions on the test set
y_test_pred = xgb.predict(X_test)

# Evaluate the model on the test set
accuracy_test = accuracy_score(y_test, y_test_pred)
print(f"Training Accuracy: {accuracy_train * 100:.2f}%")
print(f"Test Accuracy: {accuracy_test * 100:.2f}%")

ModuleNotFoundError: No module named 'xgboost'