In [1]:
import pandas as pd
df_data = pd.read_csv("./data/spam_ham_dataset.csv")

## Preprocess data

- remove subject:
- separate subject and content
- in content - 
    - remove punctuations
    - convert email into lowercase
    - strip off email headers 
    - replace url with URL
    - replace numbers with NUMBER



In [2]:
def take_subject_content(x, subject_ = True):
    if subject_:
        x  = x.split("\r")[0]
        try:
            x = x.split("Subject:")[1]
        except:
            return x
        return x
    else:
        try:
            x  = "\r".join(x.split("\r")[1:])
            return x
        except:
            return x


df_data["Subject"] = df_data["text"].apply(lambda x: take_subject_content(x, True))
df_data["Content"] = df_data["text"].apply(lambda x: take_subject_content(x, False))

df_data["Content"]  = df_data["Content"]  + df_data["Subject"] 


In [3]:
import re
def strip_text(x , strips_ = ["\n", " ", "\r", '-']):
    for strip_ in strips_:
        x = x.strip(strip_)
        x = x.replace(strip_, " ")

    return x

def remove_punctuations_and_more(x):

    x = re.sub(r'[^\w\s]',' ',x)
    return x

def check_num(x):
    try:
        x = float(x)
        return True
    except:
        return False
    
def replace_num_url(x):
    words = x.split(" ")
    urls = [x for x in words if ("https://" in x) or ("http://" in x) or ("www." in x)]

    for url in urls:
        x = x.replace(url , " URL ")
    
    nums = [x for x in words if check_num(x)]


    for num in nums:
        x = x.replace(num , " NUMBER ")
    
    return x

In [4]:
df_data["Content"] = df_data["Content"].apply(lambda x: x.lower())
df_data["Content"] = df_data["Content"].apply(strip_text)
df_data["Content"] = df_data["Content"].apply(remove_punctuations_and_more)
df_data["Content"] = df_data["Content"].apply(replace_num_url)
df_data["Content"] = df_data["Content"].apply(strip_text)

In [5]:
#train test split using sklearn
from sklearn.model_selection import train_test_split
df_train, df_test= train_test_split(df_data, test_size=0.2, random_state=42, stratify=df_data["label"])


In [6]:
text = df_train["Content"].sum()
corpus = list(set(text.split(" ")))
corpus = [x for x in corpus if len(x)   > 2]  # filter out short words
corpus_len = [len(x) for x in corpus]
print(max(corpus_len), min(corpus_len), sum(corpus_len)/len(corpus_len))  # max, min, average length of words in corpus
print("Total unique words in corpus:", len(corpus))  # total unique words in corpus


31 3 7.497684586634071
Total unique words in corpus: 42541


In [7]:
import numpy as np
def create_features_from_corpus(df_, corpus, y_col = None):

    for word in corpus:
        df_[word] = np.where(df_["Content"].str.contains(word), 1, 0)
    if y_col is None:
        return df_[corpus]
    return df_[corpus + [y_col]]

In [10]:
#ignore warnings
import warnings
warnings.filterwarnings("ignore")
#creating features from corpus for train and test dataframes
df_train_ = create_features_from_corpus(df_train, corpus, y_col="label_num")
df_test_ = create_features_from_corpus(df_test, corpus, y_col="label_num")
#saving both dataframes in data folder as parquet files
df_train_.to_parquet("./data/spam_ham_train.parq", index=False)
df_test_.to_parquet("./data/spam_ham_test.parq", index=False)


In [11]:
#reading the train and test dataframes in parquet
df_train_ = pd.read_parquet("./data/spam_ham_train.parq")
df_test_ = pd.read_parquet("./data/spam_ham_test.parq")

In [12]:
X_train = df_train_[corpus]
X_test = df_test_[corpus]
y_train = df_train_["label_num"]
y_test = df_test_["label_num"]

# Traning a Random Forest

In [13]:
def model_evaluation(y_true, y_pred):
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average=None)[1]
    recall = recall_score(y_true, y_pred, average=None)[1]
    f1 = f1_score(y_true, y_pred, average=None)[1]
    #print confusion matrix
    from sklearn.metrics import confusion_matrix
    cm = pd.DataFrame(confusion_matrix(y_true, y_pred), 
                      columns = ["Predicted Ham", "Predicted Spam"], index=["Actual Ham", "Actual Spam"])
    print("Confusion Matrix:")
    print(cm)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

In [14]:
#trainng a random forest 
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200,  max_depth= 11, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
model_evaluation(y_test, y_pred)


Confusion Matrix:
             Predicted Ham  Predicted Spam
Actual Ham             730               5
Actual Spam            125             175
Accuracy: 0.8744
Precision: 0.9722
Recall: 0.5833
F1 Score: 0.7292


In [None]:
# using PCA for dimensionality reduction
from sklearn.decomposition import PCA
pca = PCA(n_components=500)  # reducing to 100 components
X_train_pca = pca.fit_transform(X_train)
# calculating the explained variance ratio
print("Explained variance ratio by PCA:", sum(pca.explained_variance_ratio_))
X_test_pca = pca.transform(X_test)

# training the model again with PCA transformed data
rf_pca = RandomForestClassifier(n_estimators=200, max_depth=11, random_state=42)
rf_pca.fit(X_train_pca, y_train)
y_pred_pca = rf_pca.predict(X_test_pca)
model_evaluation(y_test, y_pred_pca)


Explained variance ratio by PCA: 0.7865300366664402


In [None]:
#trainng a SVM with linear kernel
from sklearn.svm import SVC
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
model_evaluation(y_test, y_pred_svm)


Confusion Matrix:
             Predicted Ham  Predicted Spam
Actual Ham             707              28
Actual Spam             29             271
Accuracy: 0.9449
Precision: 0.9064
Recall: 0.9033
F1 Score: 0.9048


In [None]:
#train a Logistic Regression model with lasso 
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
model_evaluation(y_test, y_pred_log_reg)

Confusion Matrix:
             Predicted Ham  Predicted Spam
Actual Ham             710              25
Actual Spam             13             287
Accuracy: 0.9633
Precision: 0.9199
Recall: 0.9567
F1 Score: 0.9379


In [None]:
# train a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
model_evaluation(y_test, y_pred_nb)


Confusion Matrix:
             Predicted Ham  Predicted Spam
Actual Ham             716              19
Actual Spam             40             260
Accuracy: 0.9430
Precision: 0.9319
Recall: 0.8667
F1 Score: 0.8981


In [None]:
#train a gradient boosting algorithm using xgboost
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=200, max_depth=11, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
model_evaluation(y_test, y_pred_xgb)

Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x1635ea900>>
Traceback (most recent call last):
  File "/Users/raghavsharma/anaconda3/envs/python_env/lib/python3.12/site-packages/xgboost/core.py", line 585, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument

KeyboardInterrupt: 
