In [40]:
import numpy as np
import pandas as pd

In [41]:
# Load the dataset; 'email.csv' is resolved relative to the notebook's working directory.
df = pd.read_csv('email.csv')

In [42]:
df.shape

(5572, 2)

In [43]:
df.isna().sum()

Category    0
Message     0
dtype: int64

In [44]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [45]:
df['Category']

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Category, Length: 5572, dtype: object

In [46]:
# Encode the target as integers: ham -> 0, spam -> 1.
# The identity entries (0 -> 0, 1 -> 1) make this cell idempotent: re-running it on
# the already-encoded column leaves the labels unchanged instead of mapping every
# value to NaN (Series.map yields NaN for keys missing from the dict).
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})

In [47]:
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [51]:
# Hold out 30% of the rows for evaluation; random_state=42 makes the shuffle reproducible.
# NOTE(review): the split is not stratified even though the classes are imbalanced
# (4825 ham vs 747 spam in the value_counts above) — consider stratify=df['Category'].
X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['Category'], test_size=0.3, random_state=42)

In [52]:
# Learn the TF-IDF vocabulary (capped at 500 features) from the training messages
# only, then reuse that fitted vocabulary for the test messages so no information
# from the held-out set leaks into the features.
vectorizer = TfidfVectorizer(max_features=500)
# .toarray() densifies the sparse matrices because the hand-written models in this
# notebook operate on plain NumPy arrays.
X_train_vec = vectorizer.fit_transform(X_train).toarray()
X_test_vec = vectorizer.transform(X_test).toarray()
# Convert the label Series to positional NumPy arrays (KNN indexes labels by
# position). np.asarray is used instead of Series.to_numpy() so the cell can be
# re-run: on a second execution y_train is already an ndarray, which has no
# .to_numpy method.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# KNN

In [53]:
import numpy as np
from collections import Counter

class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote on each prediction. If k exceeds the
        number of training samples, every training sample votes.
    """

    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        """Store the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer, ``X`` is not 2-D, or ``y`` is
            not a 1-D sequence with one label per row of ``X``.
        """
        k_is_int = isinstance(self.k, (int, np.integer)) and not isinstance(self.k, bool)
        if not k_is_int or self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k!r}")

        # Coerce to arrays so plain Python lists work and labels are always
        # looked up by position (never by a pandas index label).
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError(
                f"y must be 1-D with one label per row of X; got X {X.shape}, y {y.shape}"
            )

        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
        """
        return np.array([self._predict(x) for x in np.asarray(X)])

    def _predict(self, x):
        """Predict the label for a single feature vector ``x``."""
        # Euclidean distance from x to every stored training row.
        distances = np.linalg.norm(self.X_train - x, axis=1)

        # A stable sort orders equidistant neighbours by training position, so
        # the selected neighbours do not depend on the sort implementation.
        k_indices = np.argsort(distances, kind="stable")[:self.k]

        # Counter preserves insertion order, so a vote tie goes to the label
        # whose first occurrence is nearest to x.
        k_nearest_labels = self.y_train[k_indices].tolist()
        return Counter(k_nearest_labels).most_common(1)[0][0]

In [54]:
# Use the default k=3. KNN is a lazy learner: fit() keeps the training data and the
# distance computations happen in predict().
knn = KNN()
knn.fit(X_train_vec, y_train)

In [55]:
from sklearn.metrics import classification_report

# Classify every held-out message and report per-class precision / recall / F1.
pred = knn.predict(X_test_vec)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.94      0.99      0.97      1448
           1       0.89      0.62      0.73       224

    accuracy                           0.94      1672
   macro avg       0.92      0.80      0.85      1672
weighted avg       0.94      0.94      0.93      1672



# Logistic regression

In [12]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to each gradient update.
    n_iterations : int, default 1000
        Number of full passes of gradient descent.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features,) or None before fit
    bias : float or None before fit
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Fit the model to the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of n_samples binary labels (0 or 1)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or ``y`` does not hold one label per row.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y so a column vector (n, 1) cannot broadcast against the (n,)
        # predictions into an (n, n) residual matrix.
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError(f"X has {n_samples} samples but y has {y.shape[0]} labels")

        # Start from the zero vector / zero intercept.
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iterations):
            # Residual between predicted probability and the true label.
            error = self.sigmoid(np.dot(X, self.weights) + self.bias) - y

            # Gradients of the mean log-loss with respect to weights and bias.
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """Predict binary labels for the provided data.

        Returns an integer array with 1 where the predicted probability is
        strictly greater than 0.5 and 0 otherwise.
        """
        X = np.asarray(X, dtype=float)
        probabilities = self.sigmoid(np.dot(X, self.weights) + self.bias)
        return (probabilities > 0.5).astype(int)

    def sigmoid(self, x):
        """Compute the logistic function 1 / (1 + exp(-x)) without overflow.

        exp is only ever evaluated on non-positive arguments: for x >= 0 the
        usual form is used, for x < 0 the algebraically equal exp(x) / (1 + exp(x)).
        """
        x = np.asarray(x, dtype=float)
        z = np.exp(-np.abs(x))  # always in (0, 1]
        return np.where(x >= 0, 1 / (1 + z), z / (1 + z))

In [13]:
# Train with the class defaults (learning_rate=0.01, n_iterations=1000).
log = LogisticRegression()
log.fit(X_train_vec,y_train)

In [14]:
# NOTE(review): the report stored under this cell covers 2997 samples, while every
# other report in this notebook covers the 1672-row test split, and this section's
# execution counts (12-14) are out of sequence with the surrounding cells. The
# output looks stale — re-run after "Restart Kernel & Run All" before trusting it.
pred = log.predict(X_test_vec)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.57      1.00      0.73      1582
           1       1.00      0.17      0.28      1415

    accuracy                           0.61      2997
   macro avg       0.79      0.58      0.51      2997
weighted avg       0.77      0.61      0.52      2997



# SVC

In [60]:
from sklearn.svm import SVC

# Fit scikit-learn's SVC with its default hyperparameters as a library baseline.
svc = SVC()
svc.fit(X_train_vec,y_train)

# Evaluate on the held-out vectors.
pred = svc.predict(X_test_vec)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1448
           1       1.00      0.90      0.94       224

    accuracy                           0.99      1672
   macro avg       0.99      0.95      0.97      1672
weighted avg       0.99      0.99      0.99      1672



# Gaussian Naive Bayes

In [58]:
import numpy as np

class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier evaluated in log space.

    Each feature is modelled per class as an independent normal distribution.

    Parameters
    ----------
    smoothing : float, default 1e-9
        Constant added to every per-class feature variance so that a feature
        with zero variance does not cause a division by zero.

    Attributes (set by ``fit``)
    ---------------------------
    classes : numpy.ndarray of the sorted unique labels
    mean, variance : dict mapping class label -> per-feature array
    prior : dict mapping class label -> log prior probability
    """

    def __init__(self, smoothing=1e-9):
        self.smoothing = smoothing

    def fit(self, X, y):
        """Estimate per-class feature means, variances and log priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``y`` does not hold one label per row.
        """
        # Coerce to arrays so boolean-mask row selection also works for lists.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError(
                f"y must be 1-D with one label per row of X; got X {X.shape}, y {y.shape}"
            )

        self.classes = np.unique(y)
        self.mean = {}
        self.variance = {}
        self.prior = {}

        for cls in self.classes:
            X_cls = X[y == cls]
            self.mean[cls] = np.mean(X_cls, axis=0)
            self.variance[cls] = np.var(X_cls, axis=0) + self.smoothing
            self.prior[cls] = np.log(X_cls.shape[0] / X.shape[0])  # log prior

    def gaussian_probability(self, x, mean, var):
        """Return the element-wise log of the normal density N(x; mean, var).

        Despite the name, the value is a log-probability density; inputs
        broadcast, so ``x`` may be a single sample or a 2-D batch.
        """
        return -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)

    def predict(self, X):
        """Return the class with the highest log posterior for every row of ``X``.

        Ties go to the class that appears first in ``self.classes``.
        """
        X = np.asarray(X, dtype=float)
        if X.shape[0] == 0:
            return np.array([], dtype=self.classes.dtype)

        # One column of joint log-likelihoods per class, computed for the whole
        # batch at once instead of looping over samples in Python.
        scores = np.column_stack([
            self.prior[cls]
            + np.sum(self.gaussian_probability(X, self.mean[cls], self.variance[cls]), axis=1)
            for cls in self.classes
        ])
        return self.classes[np.argmax(scores, axis=1)]

# Example usage: see the next cell, which follows the same fit / predict / classification_report pattern as the models above.


In [59]:
# Fit the hand-written Gaussian Naive Bayes on the dense TF-IDF features.
gnb = GaussianNaiveBayes()
gnb.fit(X_train_vec,y_train)

# Evaluate on the held-out vectors.
pred = gnb.predict(X_test_vec)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.98      0.75      0.85      1448
           1       0.36      0.92      0.52       224

    accuracy                           0.77      1672
   macro avg       0.67      0.84      0.68      1672
weighted avg       0.90      0.77      0.80      1672



# Voting ensemble (majority vote over the fitted models)

In [62]:
def bagged_model(models, X):
    """Combine binary classifiers by majority (hard) vote.

    Despite the name, no bootstrap resampling happens here: every model in
    ``models`` is assumed to be already fitted, and each one casts a 0/1 vote
    per sample.

    Parameters
    ----------
    models : sequence of objects with a ``predict(X)`` method returning 0/1 labels
    X : array-like of shape (n_samples, n_features)

    Returns
    -------
    list of int
        1 where strictly more than half of the models predict 1, else 0
        (so an exact tie resolves to 0).

    Raises
    ------
    ValueError
        If ``models`` is empty while ``X`` is not.
    """
    if len(X) == 0:
        return []
    if len(models) == 0:
        raise ValueError("models must contain at least one fitted model")

    # One batched predict call per model instead of one call per sample per
    # model; rows of `votes` are models, columns are samples.
    votes = np.array([np.asarray(model.predict(X)).ravel() for model in models])
    positive_share = votes.sum(axis=0) / len(models)
    return (positive_share > 0.5).astype(int).tolist()

In [63]:
# Combine the three already-fitted classifiers by majority vote and evaluate the result.
# NOTE(review): the LogisticRegression instance `log` is not part of the vote — confirm
# that this is intentional.
models = [knn, svc, gnb]
pred = bagged_model(models,X_test_vec)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1448
           1       0.96      0.88      0.92       224

    accuracy                           0.98      1672
   macro avg       0.97      0.94      0.95      1672
weighted avg       0.98      0.98      0.98      1672

