In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.sparse import csr_matrix
# from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import re
import string


from bs4 import BeautifulSoup
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D
import re
import string

from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


In [2]:
# Load the training and test articles from their CSV files
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [3]:
# Prepend each title to its article text, then drop the subject, date and
# title columns from both frames
train = (
    train
    .assign(text=train["title"] + " " + train["text"])
    .drop(columns=["subject", "date", "title"])
)
test = (
    test
    .assign(text=test["title"] + " " + test["text"])
    .drop(columns=["subject", "date", "title"])
)

In [4]:
# Cache for the stop-word set so the NLTK corpus is read once, not once per document
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if "english" not in _STOP_WORD_CACHE:
        # A frozenset gives constant-time membership tests; the raw NLTK value
        # is a list, which would be scanned linearly for every token.
        _STOP_WORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORD_CACHE["english"]


# Define a function to clean the text data
def clean_text_data(data_point):
    """Strip markup, non-letters and English stop words from one document.

    Parameters
    ----------
    data_point : str
        Raw text, possibly containing HTML markup. ``None`` and float NaN
        (a missing value) are treated as an empty document.

    Returns
    -------
    str
        The remaining lower-cased words joined by single spaces.
    """
    # A missing value has no text to clean and cannot be parsed as markup
    if data_point is None or (isinstance(data_point, float) and data_point != data_point):
        return ""

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines
    # (and a warning is emitted).
    review_soup = BeautifulSoup(data_point, "html.parser")

    # Extract the visible text and replace every non-letter with a space
    review_text = review_soup.get_text()
    review_letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    review_words = review_letters_only.lower().split()

    stop_words = _english_stop_words()
    meaningful_words = [word for word in review_words if word not in stop_words]

    # Join the meaningful words back into a single string and return the result
    return " ".join(meaningful_words)

# Clean the text column of both frames, then stack them into one combined frame
train["text"], test["text"] = (
    frame["text"].map(clean_text_data) for frame in (train, test)
)
data = pd.concat([train, test], ignore_index=True)



In [5]:
# Split the data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(
    train["text"], train["label"], test_size=0.2, random_state=42
)

# Bag-of-words vectorizer capped at 5000 features. Previously this configured
# vectorizer was immediately replaced by an unbounded CountVectorizer(), so the
# cap never took effect and the dense arrays below spanned the whole vocabulary.
vectorizer = CountVectorizer(analyzer="word", max_features=5000)

# Learn the vocabulary from the training split only, so validation and test
# text cannot influence which features exist (avoids data leakage).
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)

# Convert the sparse matrices to dense numpy arrays and add an extra trailing dimension
X_train_array = np.expand_dims(X_train.toarray(), axis=-1)
X_val_array = np.expand_dims(X_val.toarray(), axis=-1)

# Convert the labels to numpy arrays
Y_train = Y_train.to_numpy()
Y_val = Y_val.to_numpy()

In [6]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels and encode both splits with it
le = LabelEncoder()
Y_train, Y_val = le.fit_transform(Y_train), le.transform(Y_val)

In [12]:
#MULTINOMIAL NAIVE BAYES CLASS USED FOR ML MODEL
class MultinomialNaiveBayes:
    """Multinomial Naive Bayes classifier with additive smoothing.

    Works on dense NumPy arrays or SciPy sparse matrices whose rows are
    samples and whose columns are features.

    Attributes set by ``fit``:
        classes: sorted unique labels seen during training.
        num_classes: number of unique labels.
        feature_counts: per-class feature totals, already including ``alpha``.
        class_counts: number of training samples per class.
        feature_log_prob: log of each feature's share within its class.
        class_log_prior: log of each class's share of the training samples.
    """

    def __init__(self, alpha=1.0):
        """Store the smoothing constant added to every per-class feature total."""
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class priors and per-class feature log-probabilities.

        Parameters
        ----------
        X : array or sparse matrix with ``shape == (n_samples, n_features)``
        y : sequence of ``n_samples`` labels (list, ndarray, pandas Series, ...)

        Returns
        -------
        MultinomialNaiveBayes
            ``self``, so calls can be chained (``model.fit(X, y).predict(X)``).

        Raises
        ------
        ValueError
            If ``X`` and ``y`` disagree on the number of samples.
        """
        # Normalise labels to a flat ndarray so the per-class masks below are
        # plain positional boolean arrays whatever container was passed in.
        y = np.asarray(y).ravel()
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y have inconsistent numbers of samples: "
                f"{X.shape[0]} != {y.shape[0]}"
            )

        # Determine the unique classes and their count
        self.classes = np.unique(y)
        self.num_classes = len(self.classes)

        # Initialize arrays for feature counts and class counts
        self.feature_counts = np.zeros((self.num_classes, X.shape[1]))
        self.class_counts = np.zeros(self.num_classes)

        # Calculate feature counts and class counts for each class
        for i, c in enumerate(self.classes):
            X_c = X[y == c]
            # Sparse matrices return a 2-D (1, n_features) sum; flatten it so
            # dense and sparse inputs follow the same path.
            self.feature_counts[i, :] = np.asarray(X_c.sum(axis=0)).ravel() + self.alpha
            self.class_counts[i] = X_c.shape[0]

        # Compute the log probabilities for features and classes
        self.feature_log_prob = np.log(self.feature_counts / self.feature_counts.sum(axis=1, keepdims=True))
        self.class_log_prior = np.log(self.class_counts / self.class_counts.sum())
        return self

    def predict(self, X):
        """Return the most probable class label for each row of ``X``.

        Must be called after ``fit``. The result is a 1-D array of labels
        drawn from ``self.classes``.
        """
        # Joint log-score per class: feature log-likelihood plus class log prior
        joint_log_likelihood = np.asarray(X @ self.feature_log_prob.T) + self.class_log_prior
        # Map the best-scoring column index back to its class label
        return self.classes[np.argmax(joint_log_likelihood, axis=1)]

In [8]:
# Split the data into training and validation sets.
# X_train / X_val were rebound to vectorized matrices in an earlier cell, so the
# raw text is split again here with the same arguments (test_size=0.2,
# random_state=42). The labels are stored under lowercase y_train / y_val.
X_train, X_val, y_train, y_val = train_test_split(train["text"], train["label"], test_size=0.2, random_state=42)

In [9]:
# Create a CountVectorizer with binary=True (term presence rather than raw
# counts), scikit-learn's built-in English stop-word list, and at most 5000 features
vectorizer = CountVectorizer(binary=True, stop_words='english', max_features=5000)

In [10]:
# Fit the vectorizer on the training text only and encode it in the same step
X_train_counts = vectorizer.fit_transform(X_train)

# Encode the validation text with the already-fitted vectorizer (no refitting)
X_val_counts = vectorizer.transform(X_val)

In [13]:
# Fit the Naive Bayes classifier on the vectorized training split
naive_bayes = MultinomialNaiveBayes()
naive_bayes.fit(X_train_counts, y_train)

# Vectorize the test articles with the fitted vectorizer and take their labels
X_test_counts = vectorizer.transform(test["text"])
y_test = test["label"]

# Predict labels for the validation and test splits
y_val_pred = naive_bayes.predict(X_val_counts)
y_test_pred = naive_bayes.predict(X_test_counts)

# Report accuracy, validation first and then test
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation accuracy:", val_accuracy)

test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test accuracy:", test_accuracy)

Validation accuracy: 0.9605263157894737
Test accuracy: 0.9646


In [14]:
# Recall with 'real' as the positive label, for the validation and test data
val_recall, test_recall = (
    recall_score(truth, predicted, average='binary', pos_label='real')
    for truth, predicted in ((y_val, y_val_pred), (y_test, y_test_pred))
)
print("Validation recall:", val_recall)
print("Test recall:", test_recall)

Validation recall: 0.9618869936034116
Test recall: 0.9368


In [16]:
# F1 score with 'real' as the positive label, for the validation and test data
val_f1, test_f1 = (
    f1_score(truth, predicted, average='binary', pos_label='real')
    for truth, predicted in ((y_val, y_val_pred), (y_test, y_test_pred))
)
print("Validation F1 score:", val_f1)
print("Test F1 score:", test_f1)

Validation F1 score: 0.9581839904420549
Test F1 score: 0.9635877391483234
