# Exercise 1

In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load data
# The last column of the CSV is used as the label and every other column as a
# feature (see the iloc slices below).
url = "https://pages.mtu.edu/~cai/sat4520/phishing.csv"
data = pd.read_csv(url)

# Split data into training and testing sets
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target variable
# 80/20 hold-out split; the fixed random_state keeps the partition stable
# across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Implement Classifiers
# All models use library defaults. The random forest is the only estimator
# here that draws random numbers during fitting, so it is seeded explicitly;
# otherwise its reported metrics would drift from run to run even though the
# split above is fixed.
classifiers = {
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression()
}

# Train and Evaluate Classifiers
# results maps classifier name -> {"accuracy": float, "classification_report": str}
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    results[name] = {
        "accuracy": accuracy,
        "classification_report": report
    }

# Compare Results
for name, result in results.items():
    print(f"Classifier: {name}")
    print(f"Accuracy: {result['accuracy']}")
    print(f"Classification Report:\n{result['classification_report']}\n")

Classifier: KNN
Accuracy: 0.9407507914970602
Classification Report:
              precision    recall  f1-score   support

          -1       0.94      0.92      0.93       956
           1       0.94      0.95      0.95      1255

    accuracy                           0.94      2211
   macro avg       0.94      0.94      0.94      2211
weighted avg       0.94      0.94      0.94      2211


Classifier: Naive Bayes
Accuracy: 0.5829941203075532
Classification Report:
              precision    recall  f1-score   support

          -1       0.51      1.00      0.67       956
           1       1.00      0.27      0.42      1255

    accuracy                           0.58      2211
   macro avg       0.75      0.63      0.55      2211
weighted avg       0.79      0.58      0.53      2211


Classifier: Random Forest
Accuracy: 0.9665309814563546
Classification Report:
              precision    recall  f1-score   support

          -1       0.97      0.95      0.96       956
           1 

In [11]:
# Train Random Forest model
# The forest is seeded so that the fitted trees, and therefore the importance
# ranking derived from them, are identical on every Restart & Run All.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Get feature importances
feature_importances = rf_classifier.feature_importances_
# argsort is ascending, so reverse it to list the most important feature first.
sorted_indices = feature_importances.argsort()[::-1]

# Print feature importances
print("Feature Importances:")
for idx in sorted_indices:
    print(f"{X.columns[idx]}: {feature_importances[idx]}")

Feature Importances:
SSLfinal_State: 0.32157265319037426
URL_of_Anchor: 0.25920993815091126
having_Sub_Domain: 0.06241913496245572
web_traffic: 0.06036375044240204
Links_in_tags: 0.0430283581862625
Prefix_Suffix: 0.0399380756681938
Request_URL: 0.021623754492056182
SFH: 0.020076671817605623
Links_pointing_to_page: 0.018802491178314052
Domain_registeration_length: 0.017162288177665963
age_of_domain: 0.015825444285369906
Google_Index: 0.013951042285615023
having_IP_Address: 0.013279761655169984
DNSRecord: 0.012952476780661953
Page_Rank: 0.011820067369800773
URL_Length: 0.008945655162490298
HTTPS_token: 0.00660206963206831
having_At_Symbol: 0.005486357337551864
Redirect: 0.005347072390022387
Statistical_report: 0.005237996974752645
Submitting_to_email: 0.004969713774677718
popUpWidnow: 0.0049417919896409465
Abnormal_URL: 0.004520157222581501
Shortining_Service: 0.004477736395913798
Favicon: 0.004219336734353279
on_mouseover: 0.0036681311105155024
double_slash_redirecting: 0.00356426311782

In [12]:
# Identify the top ten features based on feature importance
# sorted_indices is ordered most-important-first, so the first ten positions
# are the ten highest-ranked features.
top_ten_features_indices = sorted_indices[:10]
top_ten_features = X.columns[top_ten_features_indices]

# Create a new dataset with only the top ten features
X_top_ten = X[top_ten_features]

# Split the data into training and testing sets
# Same test_size and random_state as the full-feature split, applied to the
# same y, so the comparison with the full-feature results uses the same
# hold-out configuration.
X_train_top_ten, X_test_top_ten, y_train, y_test = train_test_split(X_top_ten, y, test_size=0.2, random_state=42)

# Implement Classifiers
# Fresh, unfitted estimators with library defaults. The random forest is
# seeded because it is the only model here that is stochastic during fitting;
# without a seed its metrics would change on every run.
classifiers = {
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression()
}

# Train and Evaluate Classifiers
# results_top_ten maps classifier name -> {"accuracy": float, "classification_report": str}
results_top_ten = {}
for name, clf in classifiers.items():
    clf.fit(X_train_top_ten, y_train)
    y_pred = clf.predict(X_test_top_ten)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    results_top_ten[name] = {
        "accuracy": accuracy,
        "classification_report": report
    }

# Compare Results
for name, result in results_top_ten.items():
    print(f"Classifier: {name}")
    print(f"Accuracy: {result['accuracy']}")
    print(f"Classification Report:\n{result['classification_report']}\n")

Classifier: KNN
Accuracy: 0.9362279511533242
Classification Report:
              precision    recall  f1-score   support

          -1       0.95      0.90      0.92       956
           1       0.93      0.96      0.94      1255

    accuracy                           0.94      2211
   macro avg       0.94      0.93      0.93      2211
weighted avg       0.94      0.94      0.94      2211


Classifier: Naive Bayes
Accuracy: 0.5725915875169606
Classification Report:
              precision    recall  f1-score   support

          -1       0.50      1.00      0.67       956
           1       1.00      0.25      0.40      1255

    accuracy                           0.57      2211
   macro avg       0.75      0.62      0.53      2211
weighted avg       0.79      0.57      0.51      2211


Classifier: Random Forest
Accuracy: 0.9479873360470376
Classification Report:
              precision    recall  f1-score   support

          -1       0.95      0.93      0.94       956
           1 

# Exercise 2

In [9]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import TensorDataset, DataLoader

# Preprocess data
# One-hot encode categorical features
# The encoded frames get their own names instead of overwriting X_train /
# X_test, so this cell can be re-run and the earlier cells that expect the raw
# feature frames keep working.
categorical_columns = X_train.columns[X_train.dtypes == 'int64']
X_train_encoded = pd.get_dummies(X_train, columns=categorical_columns, dtype="float32")
X_test_encoded = pd.get_dummies(X_test, columns=categorical_columns, dtype="float32")

# Encoding the two splits independently can yield different column sets when a
# category level is present in only one of them. Force the test frame onto the
# training layout: levels missing from the test split become all-zero columns,
# and levels never seen in training are dropped.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0.0)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train_encoded.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_encoded.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Adjust input size for the model
# Number of columns after one-hot encoding; this is the width of the first
# linear layer's input.
input_size = X_train_tensor.shape[1]

# Define the Neural Network Architecture
class NeuralNetwork(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    The network is Linear(input_size -> hidden_size), ReLU, then
    Linear(hidden_size -> output_size). The forward pass returns the raw
    output of the last linear layer; no softmax is applied here.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear layers and the activation between them.

        Args:
            input_size: number of input features per sample.
            hidden_size: number of units in the hidden layer.
            output_size: number of values produced per sample.
        """
        super().__init__()
        # Sub-modules are registered in the order they are applied.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the output of fc2 applied to the ReLU-activated fc1 output."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

hidden_size = 64
output_size = 2

# Seed PyTorch before the model is built: weight initialisation and the
# shuffled DataLoader below both draw from the global torch RNG, so without a
# seed the reported accuracy changes on every run.
torch.manual_seed(42)

model = NeuralNetwork(input_size, hidden_size, output_size)

# Define Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the Model
batch_size = 64
# (label + 1) // 2 turns the -1 / 1 labels into the class indices 0 / 1 that
# are fed to the loss; the inverse mapping is applied to the predictions below.
train_dataset = TensorDataset(X_train_tensor, (y_train_tensor + 1) // 2)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

epochs = 20
model.train()
for epoch in range(epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluate the Model
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    # Index of the larger of the two outputs is the predicted class (0 or 1).
    _, predicted = torch.max(outputs, 1)
    # Map class indices 0 / 1 back to the original -1 / 1 labels.
    predicted_labels = predicted * 2 - 1

# Hand plain NumPy arrays to scikit-learn rather than torch tensors.
y_true = y_test_tensor.numpy()
y_hat = predicted_labels.numpy()
accuracy = accuracy_score(y_true, y_hat)
report = classification_report(y_true, y_hat)

# Report Results
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.9552238805970149
Classification Report:
              precision    recall  f1-score   support

          -1       0.96      0.94      0.95       956
           1       0.95      0.97      0.96      1255

    accuracy                           0.96      2211
   macro avg       0.96      0.95      0.95      2211
weighted avg       0.96      0.96      0.96      2211



In [17]:
from torchsummary import summary

# Print the model architecture summary
# torchsummary builds its probe input on the device named by `device`, which
# defaults to "cuda". Pass the device the model's parameters actually live on
# so the call also works on a CUDA-capable machine while the model is on CPU.
summary(model, (input_size,), device=next(model.parameters()).device.type)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 64]           4,416
              ReLU-2                   [-1, 64]               0
            Linear-3                    [-1, 2]             130
Total params: 4,546
Trainable params: 4,546
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.02
Estimated Total Size (MB): 0.02
----------------------------------------------------------------
