In [None]:
import tarfile
import os
import pandas as pd
import numpy as np
import time

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import precision_score, recall_score
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.utils.class_weight import compute_class_weight


import warnings
warnings.filterwarnings("ignore", category=UserWarning)

### 1. Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/ML-Sec

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/ML-Sec


In [None]:
# # Only required to run once to extract files
# !tar -xvf lingspam_public.tar.gz

In [None]:
data_path = 'lingspam_public/lemm_stop/'

x_train = []
x_test = []
y_train = []
y_test = []

for subdirectory in sorted(os.listdir(data_path)):
  subdirectory_path = os.path.join(data_path, subdirectory)
  if os.path.isdir(subdirectory_path):
    if "part10" not in str(subdirectory_path).split('/'):
      for filename in os.listdir(subdirectory_path):
        with open(os.path.join(subdirectory_path, filename), "r") as file:
          content = file.read()
          label = 1 if "spmsg" in filename else 0
          y_train.append(label)
          x_train.append(content)
      print(f"Successfully loaded {subdirectory_path} into training data")
    else:
      for filename in os.listdir(subdirectory_path):
        with open(os.path.join(subdirectory_path, filename), "r") as file:
          content = file.read()
          label = 1 if "spmsg" in filename else 0
          y_test.append(label)
          x_test.append(content)
      print(f"Successfully loaded {subdirectory_path} into testing data")
  else:
    print(f"Error: {subdirectory_path} does not exist")

Successfully loaded lingspam_public/lemm_stop/part1 into training data
Successfully loaded lingspam_public/lemm_stop/part10 into testing data
Successfully loaded lingspam_public/lemm_stop/part2 into training data
Successfully loaded lingspam_public/lemm_stop/part3 into training data
Successfully loaded lingspam_public/lemm_stop/part4 into training data
Successfully loaded lingspam_public/lemm_stop/part5 into training data
Successfully loaded lingspam_public/lemm_stop/part6 into training data
Successfully loaded lingspam_public/lemm_stop/part7 into training data
Successfully loaded lingspam_public/lemm_stop/part8 into training data
Successfully loaded lingspam_public/lemm_stop/part9 into training data


In [None]:
# # Printing a positive (spam) sample:
# for i in range(100):
#   if y_train[i] == 1:
#     print(f"Sample: \n{x_train[i]}")
#     print(f"Label: {y_train[i]}\n")

In [None]:
# Creating a dataset

X = x_train + x_test
y = y_train + y_test

print(f"x_train: {len(x_train)}, x_test: {len(x_test)}\ny_train: {len(y_train)}, y_test: {len(y_test)}\n\nX: {len(X)}, y: {len(y)}")

x_train: 2602, x_test: 291
y_train: 2602, y_test: 291

X: 2893, y: 2893


### 2. Feature Selection using Information Gain

In [None]:
# Binary Features
count_vectorizer = CountVectorizer(binary=True)
X_binary = count_vectorizer.fit_transform(X)

# Term Frequency Features
tfidf_vectorizer = TfidfVectorizer()
X_tf = tfidf_vectorizer.fit_transform(X)

In [None]:
# Information Gain (Mutual Information)
mi = mutual_info_classif(X_binary, y)
feature_indices = np.argsort(mi)[::-1]

In [None]:
N = [10, 100, 1000]

X_selected = {'binary': {}, 'tf': {}}

for n in N:
  top_N_features = feature_indices[:n]
  X_selected['binary'][n] = X_binary[:, top_N_features]

In [None]:
mi_tf = mutual_info_classif(X_tf, y)
tf_feature_indices = np.argsort(mi_tf)[::-1]

In [None]:
for n in N:
  top_N_features = feature_indices[:n]
  X_selected['tf'][n] = X_tf[:, top_N_features]

In [None]:
feature_names = count_vectorizer.get_feature_names_out()
sorted_features = [feature_names[i] for i in np.argsort(mi)[::-1]]

top_N = 10
# Print the top N words
print(f"Top {top_N} words from Part(1):")
for i, word in enumerate(sorted_features[:top_N]):
    print(i+1, word)

Top 10 words from Part(1):
1 language
2 remove
3 linguistic
4 university
5 free
6 money
7 click
8 our
9 today
10 sell


### 3. Naive Bayes
##### 3.1 Bernoulli NB classifier with binary features;
##### 3.2. Multinomial NB with binary features; and
##### 3.3. Multinomial NB with term frequency (TF) features.

In [None]:
classifiers = [
  ("Bernoulli NB", BernoulliNB()),
  ("Multinomial NB (Binary)", MultinomialNB()),
  ("Multinomial NB (TF)", MultinomialNB())
]

for n in N:
  for name, classifier in classifiers:
    vectorizer = 'tf' if '(TF)' in name.split() else 'binary'

    train_start = time.time()
    classifier.fit(X_selected[vectorizer][n][:2602], y_train)
    train_latency = time.time() - train_start
    test_start = time.time()
    # Predict on the test data
    y_pred = classifier.predict(X_selected[vectorizer][n][2602:])
    test_latency = time.time() - test_start

    # Calculate spam precision and recall
    precision = precision_score(y_test, y_pred, pos_label=1)
    recall = recall_score(y_test, y_pred, pos_label=1)

    print(f"{name} - Top-{n} Features:")
    print(f"Spam Precision: {precision:.6f}")
    print(f"Spam Recall: {recall:.6f}")
    print(f"Training Latency: {train_latency:.6f}s")
    print(f"Inference Latency: {test_latency:.6f}s\n")

Bernoulli NB - Top-10 Features:
Spam Precision: 0.928571
Spam Recall: 0.795918
Training Latency: 0.005000s
Inference Latency: 0.000742s

Multinomial NB (Binary) - Top-10 Features:
Spam Precision: 0.928571
Spam Recall: 0.795918
Training Latency: 0.005762s
Inference Latency: 0.000393s

Multinomial NB (TF) - Top-10 Features:
Spam Precision: 0.000000
Spam Recall: 0.000000
Training Latency: 0.003252s
Inference Latency: 0.000522s

Bernoulli NB - Top-100 Features:
Spam Precision: 1.000000
Spam Recall: 0.714286
Training Latency: 0.004330s
Inference Latency: 0.000912s

Multinomial NB (Binary) - Top-100 Features:
Spam Precision: 0.978723
Spam Recall: 0.938776
Training Latency: 0.002305s
Inference Latency: 0.000320s

Multinomial NB (TF) - Top-100 Features:
Spam Precision: 1.000000
Spam Recall: 0.367347
Training Latency: 0.003072s
Inference Latency: 0.000365s

Bernoulli NB - Top-1000 Features:
Spam Precision: 1.000000
Spam Recall: 0.755102
Training Latency: 0.004656s
Inference Latency: 0.000865s



### 4. Support Vector Machines (SVM)

In [None]:
n = 100 # Number of features used
vectorizers = ['binary', 'tf'] #using and testing both vectorization methods

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.1, 1, 10]
}

svm_model = SVC()

for vectorizer in vectorizers:
  grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='recall')
  grid_search.fit(X_selected[vectorizer][n][:2602], y_train)

  best_params = grid_search.best_params_

  best_svm_model = SVC(**best_params)
  best_svm_model.fit(X_selected[vectorizer][n][:2602], y_train)

  y_pred_svm = best_svm_model.predict(X_selected['tf'][n][2602:])

  precision_svm = precision_score(y_test, y_pred_svm, pos_label=1)
  recall_svm = recall_score(y_test, y_pred_svm, pos_label=1)

  print(f"Best SVM Model with Cross-Validation for {vectorizer} vectorizer:")
  print("Best Hyperparameters:", best_params)
  print(f"Spam Precision (SVM): {precision_svm}")
  print(f"Spam Recall (SVM): {recall_svm}\n")

Best SVM Model with Cross-Validation for binary vectorizer:
Best Hyperparameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Spam Precision (SVM): 1.0
Spam Recall (SVM): 0.02040816326530612

Best SVM Model with Cross-Validation for tf vectorizer:
Best Hyperparameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Spam Precision (SVM): 0.9583333333333334
Spam Recall (SVM): 0.9387755102040817



In [None]:
n = 100 # Number of features used
vectorizers = ['binary', 'tf'] #using and testing both vectorization methods

# Calculated class weights since the dataset has large class imbalance and SVM does not perform well on large class imbalances
class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.1, 1, 10]
}

svm_model = SVC(class_weight={0: class_weights[0], 1: class_weights[1]})

for vectorizer in vectorizers:
  grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='recall')
  grid_search.fit(X_selected[vectorizer][n][:2602], y_train)

  best_params = grid_search.best_params_

  best_svm_model = SVC(**best_params, class_weight={0: class_weights[0], 1: class_weights[1]})
  best_svm_model.fit(X_selected[vectorizer][n][:2602], y_train)

  y_pred_svm = best_svm_model.predict(X_selected['tf'][n][2602:])

  precision_svm = precision_score(y_test, y_pred_svm, pos_label=1)
  recall_svm = recall_score(y_test, y_pred_svm, pos_label=1)

  print(f"Best SVM Model with Cross-Validation for {vectorizer} vectorizer:")
  print("Best Hyperparameters:", best_params)
  print(f"Spam Precision (SVM): {precision_svm}")
  print(f"Spam Recall (SVM): {recall_svm}\n")

Best SVM Model with Cross-Validation for binary vectorizer:
Best Hyperparameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Spam Precision (SVM): 1.0
Spam Recall (SVM): 0.6326530612244898

Best SVM Model with Cross-Validation for tf vectorizer:
Best Hyperparameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Spam Precision (SVM): 0.9019607843137255
Spam Recall (SVM): 0.9387755102040817

