In [1]:
class NaiveBayesClassifier:
    """Container for a Naive Bayes model's smoothing constant and probability tables."""

    def __init__(self, alpha=1):
        """Store the smoothing constant and start with no learned tables.

        Args:
            alpha: Additive smoothing constant kept on the instance (default 1).
        """
        # Both tables stay unset until something assigns them.
        self.class_probs = None
        self.word_probs = None
        self.alpha = alpha

In [2]:
    def fit(self, X, y):
        num_docs, vocab_size = X.shape
        unique_classes = np.unique(y)
        num_classes = len(unique_classes)

        # Calculate class probabilities
        self.class_probs = np.zeros(num_classes)
        for i, c in enumerate(unique_classes):
            self.class_probs[i] = np.sum(y == c) / num_docs

        # Calculate word probabilities with Laplace smoothing
        self.word_probs = np.zeros((num_classes, vocab_size))
        for i, c in enumerate(unique_classes):
            class_docs = X[y == c]
            total_words_in_class = np.sum(class_docs)
            self.word_probs[i] = (np.sum(class_docs, axis=0) + self.alpha) / (total_words_in_class + self.alpha * vocab_size)

In [3]:
def predict_proba(self, X):
    """Return per-class probabilities for every row of a sparse count matrix.

    Reads ``self.class_probs`` and ``self.word_probs``. Each row of ``X`` is
    densified with ``.toarray()``, so ``X`` must support that call on a row slice.

    Returns:
        Array of shape (rows of X, number of classes); each row sums to 1.
    """
    n_docs, _ = X.shape
    n_classes = self.word_probs.shape[0]

    # Work in log space so long documents do not underflow.
    log_prior = np.log(self.class_probs)
    log_word = np.log(self.word_probs)

    scores = np.zeros((n_docs, n_classes))
    for row in range(n_docs):
        counts = X[row].toarray().flatten()
        scores[row] = np.sum(log_word * counts, axis=1) + log_prior

    # Softmax; subtracting the row maximum keeps exp() from overflowing.
    row_max = np.max(scores, axis=1, keepdims=True)
    unnormalised = np.exp(scores - row_max)
    return unnormalised / np.sum(unnormalised, axis=1, keepdims=True)

In [10]:
def predict(self, X):
    """Return, for every row of a sparse count matrix, the index of the best class.

    Reads ``self.class_probs`` and ``self.word_probs``. The result holds argmax
    positions into those tables (not necessarily the original label values).
    """
    n_docs, _ = X.shape
    n_classes = self.word_probs.shape[0]

    # Work in log space so long documents do not underflow.
    log_prior = np.log(self.class_probs)
    log_word = np.log(self.word_probs)

    scores = np.zeros((n_docs, n_classes))
    for row in range(n_docs):
        counts = X[row].toarray().flatten()
        scores[row] = np.sum(log_word * counts, axis=1) + log_prior

    # The winning class is the one with the largest log score in each row.
    return np.argmax(scores, axis=1)

In [11]:
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

# Set up the spaCy pipeline plus the stop-word and punctuation filters used by
# spacy_tokenizer.
SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError as err:
    # spaCy reports a missing model package as OSError (E050). Re-raise the same
    # exception type with the install command so the failure is actionable.
    raise OSError(
        f"spaCy model '{SPACY_MODEL}' is not installed. "
        f"Install it with `python -m spacy download {SPACY_MODEL}` and re-run this cell."
    ) from err
stop_words = nlp.Defaults.stop_words
punctuations = string.punctuation

def spacy_tokenizer(sentence, count_words=False):
    """Lemmatise, lowercase and filter a text, returning it as a space-joined string.

    Args:
        sentence: Text to process. Float values (presumably NaN from a missing
            cell - confirm against the data) are passed through unchanged.
        count_words: When True, also return the number of tokens that were kept.

    Returns:
        The processed string (or the untouched float), or a
        ``(processed, word_count)`` tuple when ``count_words`` is True. A float
        input has a word count of 0.
    """
    if isinstance(sentence, float):
        # Nothing to tokenize. Defining the empty token list here keeps the
        # count_words branch from hitting an unassigned variable.
        mytokens = []
    else:
        doc = nlp(sentence)
        lemmas = (token.lemma_.lower().strip() for token in doc)
        # `punctuations` is a string, so `in` is a substring test; it also
        # discards lemmas that became empty after strip().
        mytokens = [
            word for word in lemmas
            if word not in stop_words and word not in punctuations
        ]
        sentence = " ".join(mytokens)

    if count_words:
        return sentence, len(mytokens)
    return sentence

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [6]:
# Load data
train_csv_path = "llm-detect-ai-generated-text/train_essays.csv"
data = pd.read_csv(train_csv_path)

# Tokenize and preprocess the text data: each essay yields a
# (processed text, token count) pair that is expanded into two new columns.
tokenized = data['text'].apply(
    lambda essay: pd.Series(spacy_tokenizer(essay, count_words=True))
)
data[['tokenized_Response', 'count']] = tokenized

NameError: name 'pd' is not defined

In [None]:
# Create a vocabulary and reverse the index.
vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(data['tokenized_Response'])
vocab = vectorizer.get_feature_names_out()
reverse_index = {word: index for index, word in enumerate(vocab)}

# Average number of occurrences of each vocabulary word per document.
total_documents = len(data)
word_occurrences = np.asarray(X_vec.sum(axis=0)).ravel()

occurrence_probs = word_occurrences / total_documents

# Per-class statistics (human or LLM).
# Rows of X_vec are addressed by position, so select them with positional
# indices derived from boolean masks rather than with DataFrame index labels,
# which only coincide with positions for a default 0..n-1 index.
llm_indices = np.flatnonzero(np.asarray(data['generated'] == 1))
human_indices = np.flatnonzero(np.asarray(data['generated'] == 0))
llm_documents = len(llm_indices)
human_documents = len(human_indices)

llm_word_occurrences = np.asarray(X_vec[llm_indices].sum(axis=0)).ravel()
human_word_occurrences = np.asarray(X_vec[human_indices].sum(axis=0)).ravel()

# Average occurrences per document within each class.
llm_probs = llm_word_occurrences / llm_documents
human_probs = human_word_occurrences / human_documents

In [None]:
# Naive Bayes Classifier Implementation
class NaiveBayesClassifier:
    """Multinomial Naive Bayes over word-count features with additive smoothing.

    Attributes set by ``fit``:
        classes_: Sorted unique labels seen in ``y``.
        class_probs: Prior probability of each class, aligned with ``classes_``.
        word_probs: Array of shape (n_classes, vocab_size) holding the smoothed
            probability of each word given the class.
    """

    def __init__(self, alpha=1):
        """Store the additive smoothing constant; nothing is learned yet.

        Args:
            alpha: Value added to every word count before normalising
                (1 corresponds to Laplace smoothing).
        """
        self.alpha = alpha
        self.classes_ = None
        self.class_probs = None
        self.word_probs = None

    def fit(self, X, y):
        """Estimate class priors and per-class word probabilities.

        Args:
            X: Count matrix of shape (n_docs, vocab_size); a SciPy sparse
                matrix or a dense NumPy array.
            y: One label per document (any array-like).
        """
        y = np.asarray(y)
        num_docs, vocab_size = X.shape

        # np.unique returns the labels sorted; every table below is aligned to it.
        self.classes_, class_counts = np.unique(y, return_counts=True)
        self.class_probs = class_counts / num_docs

        self.word_probs = np.zeros((len(self.classes_), vocab_size))
        for i, c in enumerate(self.classes_):
            class_docs = X[y == c]
            # A sparse .sum(axis=0) yields a 1 x V matrix; flatten it explicitly
            # rather than relying on broadcasting during assignment.
            word_counts = np.asarray(class_docs.sum(axis=0), dtype=float).ravel()
            self.word_probs[i] = (word_counts + self.alpha) / (
                word_counts.sum() + self.alpha * vocab_size
            )

    def _joint_log_likelihood(self, X):
        """Return log P(class) + sum_w count_w * log P(w | class) for each document."""
        # Log space avoids underflow from multiplying many small probabilities.
        log_class_probs = np.log(self.class_probs)
        log_word_probs = np.log(self.word_probs)
        # One matrix product scores every document against every class and
        # works for both sparse and dense X.
        return np.asarray(X @ log_word_probs.T) + log_class_probs

    def predict_proba(self, X):
        """Return class probabilities of shape (n_docs, n_classes).

        Columns follow the order of ``classes_`` (sorted labels).
        """
        scores = self._joint_log_likelihood(X)
        # Softmax; subtracting the row maximum keeps exp() from overflowing.
        shifted = np.exp(scores - scores.max(axis=1, keepdims=True))
        return shifted / shifted.sum(axis=1, keepdims=True)

    def predict(self, X):
        """Return the most likely class label for each document."""
        best = np.argmax(self._joint_log_likelihood(X), axis=1)
        classes = getattr(self, "classes_", None)
        if classes is None:
            # Tables were assigned by hand rather than via fit(): no label
            # mapping is available, so return the class positions.
            return best
        return classes[best]

In [None]:
# Split the data into training and testing sets
# `y` is the 0/1 `generated` label column; it must be defined before the split.
y = data['generated'].to_numpy()
# A fixed random_state makes the split (and every score below) reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Train the hand-written classifier on the training split with alpha = 1.
alpha_value = 1
custom_nb_classifier = NaiveBayesClassifier(alpha_value)
custom_nb_classifier.fit(X=x_train, y=y_train)

In [None]:
# Predict on the held-out split produced by train_test_split (x_test); no
# variable named x_dev is defined anywhere in this notebook.
y_pred_custom = custom_nb_classifier.predict(x_test)

In [None]:
print("Classification Report on Dev Set:")
# classification_report expects (y_true, y_pred); swapping them exchanges
# precision and recall. The held-out labels are y_test.
print(classification_report(y_test, y_pred_custom))

In [None]:
# Compare the effect of Smoothing
alpha_values = [0.1, 0.5, 1, 2, 5]
for alpha in alpha_values:
    # Use loop-local names so the sweep does not overwrite custom_nb_classifier
    # or y_pred_custom, which later cells still rely on.
    nb_for_alpha = NaiveBayesClassifier(alpha=alpha)
    nb_for_alpha.fit(x_train, y_train)
    # Evaluate on the held-out split from train_test_split (x_test / y_test).
    y_pred_alpha = nb_for_alpha.predict(x_test)
    accuracy = np.mean(y_pred_alpha == y_test)
    print(f"Accuracy with Smoothing (alpha={alpha}): {accuracy}")

In [None]:
# Derive Top 10 words that predict each class
# argsort is ascending, so the last ten positions are the ten largest values,
# listed from smallest to largest.
llm_top_positions = np.argsort(llm_probs)[-10:]
human_top_positions = np.argsort(human_probs)[-10:]
top_words_llm = [vocab[position] for position in llm_top_positions]
top_words_human = [vocab[position] for position in human_top_positions]

print("Top 10 words predicting LLM:")
print(top_words_llm)
print("\nTop 10 words predicting Human:")
print(top_words_human)

# Using the test dataset: same tokenizer and the already-fitted vectorizer.
test_df = pd.read_csv('llm-detect-ai-generated-text/test_essays.csv')
test_tokens = test_df['text'].apply(spacy_tokenizer)
test_df['tokenized_Response'] = test_tokens
test_vec = vectorizer.transform(test_df['tokenized_Response'])

In [None]:
submit = pd.read_csv('llm-detect-ai-generated-text/sample_submission.csv')

# custom_nb_classifier is the trained Naive Bayes classifier.
# predict_proba columns follow the class order from fit (np.unique of the
# labels), so with 0/1 `generated` labels column 1 is the probability that a
# text is generated; column 0 would be the probability that it is human.
# Scoring all rows in one call replaces the row-by-row loop.
test_probs = custom_nb_classifier.predict_proba(test_vec[:len(submit)])
# Replace the second column wholesale so its dtype becomes float.
submit[submit.columns[1]] = test_probs[:, 1]

submit.to_csv('submission.csv', index=False)
submit.head()
submit.head()