In [None]:
pip install pandas scikit-learn




In [None]:
import pandas as pd

# Remote location of the Spambase CSV on the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

# The file is read without a header row, so column labels are generated here:
# 57 placeholder feature names followed by the 'spam' column.
column_names = [f'feature_{i}' for i in range(1, 58)]
column_names.append('spam')
data = pd.read_csv(url, names=column_names, header=None)

# Preview the first few rows as a quick sanity check
print(data.head())


   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0       0.00       0.64       0.64        0.0       0.32       0.00   
1       0.21       0.28       0.50        0.0       0.14       0.28   
2       0.06       0.00       0.71        0.0       1.23       0.19   
3       0.00       0.00       0.00        0.0       0.63       0.00   
4       0.00       0.00       0.00        0.0       0.63       0.00   

   feature_7  feature_8  feature_9  feature_10  ...  feature_49  feature_50  \
0       0.00       0.00       0.00        0.00  ...        0.00       0.000   
1       0.21       0.07       0.00        0.94  ...        0.00       0.132   
2       0.19       0.12       0.64        0.25  ...        0.01       0.143   
3       0.31       0.63       0.31        0.63  ...        0.00       0.137   
4       0.31       0.63       0.31        0.63  ...        0.00       0.135   

   feature_51  feature_52  feature_53  feature_54  feature_55  feature_56  \
0         0.0       0

In [None]:
from sklearn.model_selection import train_test_split

# Separate the 'spam' label column from the remaining feature columns
y = data['spam']
X = data.drop(columns='spam')

# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


In [None]:
from sklearn.svm import SVC
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# SVM kernel functions to compare; each one gets its own grid search below.
kernels = ['linear', 'poly', 'rbf']


# Feature scaling: the scaler is fit on the training split only and the same
# transformation is then applied to the test split.
# NOTE(review): the scaler is fit on the whole training split before
# cross-validation, so every validation fold is scaled with statistics that
# include it. Wrapping the scaler and SVC in a Pipeline would avoid that --
# confirm whether it matters for this comparison.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter grid shared by the kernels
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']
}

for kernel in kernels:
  # gamma is a coefficient of the poly/rbf kernels; the linear kernel does not
  # use it. Searching over gamma for 'linear' only repeats identical fits, so
  # that kernel is tuned over C alone (same selected C, half the CV fits).
  if kernel == 'linear':
    kernel_grid = {'C': param_grid['C']}
  else:
    kernel_grid = param_grid

  # 5-fold cross-validated search, selecting the candidate with best accuracy
  clf = GridSearchCV(SVC(kernel=kernel), kernel_grid, scoring='accuracy', cv=5)
  clf.fit(X_train_scaled, y_train)
  best_svm = clf.best_estimator_

  # Evaluate the selected model on the held-out test split
  predictions = best_svm.predict(X_test_scaled)
  accuracy = metrics.accuracy_score(y_test, predictions)
  print("Accuracy for", kernel,"kernel:", accuracy)
  print(confusion_matrix(y_test, predictions))
  print(classification_report(y_test, predictions))



Accuracy for linear kernel: 0.9256089532587228
[[844  42]
 [ 71 562]]
              precision    recall  f1-score   support

           0       0.92      0.95      0.94       886
           1       0.93      0.89      0.91       633

    accuracy                           0.93      1519
   macro avg       0.93      0.92      0.92      1519
weighted avg       0.93      0.93      0.93      1519

Accuracy for poly kernel: 0.8505595786701777
[[863  23]
 [204 429]]
              precision    recall  f1-score   support

           0       0.81      0.97      0.88       886
           1       0.95      0.68      0.79       633

    accuracy                           0.85      1519
   macro avg       0.88      0.83      0.84      1519
weighted avg       0.87      0.85      0.85      1519

Accuracy for rbf kernel: 0.9335088874259381
[[853  33]
 [ 68 565]]
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       886
           1       0.94      0.8