In [37]:
import warnings
import re
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB as MultiNB, ComplementNB as ComplNB, CategoricalNB as CatNB
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline

BALANCED_DATA_CSV = 'data/balanced.csv'
ORIGINAL_DATA_CSV = 'data/original.csv'
TEST_CSV = 'data/test.csv'

warnings.filterwarnings('ignore')

class BaseNB:
    def __init__(self):
        self.gs = None

    def fit(self, x_train, y_train):
        self.gs.fit(x_train, y_train)

    def predict(self, x_test):
        return self.gs.predict(x_test)
    
    def score(self, x_test, y_test):
        return self.gs.score(x_test, y_test)

class MultinomialNB(BaseNB):
    def __init__(self):
        super().__init__()
        self.gs = GridSearchCV(
            make_pipeline(TfidfVectorizer(), MultiNB()), 
            param_grid={
                'multinomialnb__fit_prior': [True, False],  
                'multinomialnb__force_alpha': [True, False],
                'multinomialnb__alpha': [i/10 for i in range(1, 30)]
            }, return_train_score=True
        )

class ComplementNB(BaseNB):
    def __init__(self):
        super().__init__()
        self.gs = GridSearchCV(
            make_pipeline(TfidfVectorizer(), ComplNB()), 
            param_grid={
                'complementnb__fit_prior': [True, False],  
                'complementnb__force_alpha': [True, False],
                'complementnb__alpha': [i/10 for i in range(1, 30)],
                'complementnb__norm': [True, False],
            }, return_train_score=True
        )

class CategoricalNB(BaseNB):
    """
    This variant doesn't work for this problem because it expects known categories and not random texts.
    """
    def __init__(self):
        super().__init__()
        self.gs = GridSearchCV(
            make_pipeline(OrdinalEncoder(), CatNB()), 
            param_grid={
                'categoricalnb__fit_prior': [True, False],  
                'categoricalnb__force_alpha': [True, False],
                'categoricalnb__alpha': [i/10 for i in range(1, 30)]
            }, return_train_score=True
        )
    
    def fit(self, x_train, y_train):
        return super().fit(np.reshape(x_train, (-1, 1)), y_train)
    
    def predict(self, x_test):
        return super().predict(np.reshape(x_test, (-1, 1)))
    
    def score(self, x_test, y_test):
        return super().score(np.reshape(x_test, (-1, 1)), y_test)

class TextClassifier:
    def __init__(self, naive_bayes_model_cls):
        self.labelEncoder = LabelEncoder()
        self.model = naive_bayes_model_cls()

    def _transform_text(self, text_array):
        def to_lower(text):
            return text.lower()
        
        def remove_spaces(text):
            return " ".join(text.split()).strip()
        
        def sanatize(text):
            return re.sub('[./:;$@&*\'"]', '', text) 
        
        return [to_lower(remove_spaces(sanatize(text))) for text in text_array]

    def train(self, csv_file, test_size=0.3):
        df = pd.read_csv(csv_file, header=0)

        x = self._transform_text(df[df.columns[0]])
        y = self.labelEncoder.fit_transform(df[df.columns[1]])

        xtrain, xtest, ytrain, ytest = x, x, y, y
        if (test_size>0):
            xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=test_size)
        
        self.model.fit(xtrain, ytrain)

        print("Training complete.")
        print("Best parameters:", self.model.gs.best_params_)
        print("Score on validation set:", self.model.score(xtest, ytest))

        return self.model.gs.best_params_, self.model.score(xtest, ytest)

    def test(self, csv_file):
        if not self.model.gs:
            raise ValueError("Model has not been trained yet.")

        df = pd.read_csv(csv_file, header=0)
        x_test = self._transform_text(df[df.columns[0]])
        y_test = df[df.columns[1]]

        y_test_pred = self.model.predict(x_test)

        print("Testing complete.")
        print("Best parameters:", self.model.gs.best_params_)
        print("Score on test set:", self.model.score(x_test, self.labelEncoder.transform(y_test)))

        table = pd.DataFrame(
            zip(x_test, [self.labelEncoder.classes_[result] for result in y_test_pred], y_test),
            columns=["Input", "Prediction", "Expected"]
        )
        print("Results: ")

        return table

In [38]:
import json
import os

CHOICES_FILE = 'previous_choices.json'

def load_previous_choices():
    if os.path.exists(CHOICES_FILE):
        with open(CHOICES_FILE, 'r') as file:
            return json.load(file)
    return {
        'nb_choice': None,
        'train_full': None,
        'data_choice': None
    }

def save_previous_choices(nb_choice, train_full, data_choice):
    choices = {
        'nb_choice': nb_choice,
        'train_full': train_full,
        'data_choice': data_choice
    }
    with open(CHOICES_FILE, 'w') as file:
        json.dump(choices, file)

def get_user_input():
    previous_choices = load_previous_choices()

    if previous_choices['nb_choice'] is not None:
        print("\nDo you want to reuse your previous choices?")
        print("1. Yes (default)")
        print("2. No")
        reuse_choice = input("Enter the number (1/2): ").strip()
        
        if reuse_choice != '2':
            return previous_choices['nb_choice'], previous_choices['train_full'], previous_choices['data_choice']

    print("\nChoose the Naive Bayes variant to use:")
    print("1. Multinomial")
    print("2. Complement")
    print("3. Categorical (don't work for this problem)")
    nb_choice = input("Enter the number (1/2/3): ").strip()

    print("\nDo you want to train with full data?")
    print("1. Yes")
    print("2. No (70% training, 30% testing)")
    full_data_choice = input("Enter the number (1/2): ").strip()

    print("\nWhich dataset do you want to use?")
    print("1. BALANCED")
    print("2. ORIGINAL")
    data_choice = input("Enter the number (1/2): ").strip()

    save_previous_choices(nb_choice, full_data_choice == '1', data_choice)

    return nb_choice, full_data_choice == '1', data_choice

def main():
    nb_choice, train_full, data_choice = get_user_input()

    if nb_choice == '1':
        NBVariantCls = MultinomialNB
    elif nb_choice == '2':
        NBVariantCls = ComplementNB
    elif nb_choice == '3':
        NBVariantCls = CategoricalNB
    else:
        print("Invalid choice for Naive Bayes variant. Exiting...")
        return

    classifier = TextClassifier(NBVariantCls)
    csv_file = BALANCED_DATA_CSV if data_choice == '1' else ORIGINAL_DATA_CSV

    if train_full:
        print("\nTraining with full data...")
        classifier.train(csv_file, test_size=0)  
    else:
        print("\nTraining with 70% data...")
        classifier.train(csv_file, test_size=0.3) 

    print("Testing the model...")
    results = classifier.test(TEST_CSV)
    display(results)

if __name__ == "__main__":
    main()



Do you want to reuse your previous choices?
1. Yes (default)
2. No

Choose the Naive Bayes variant to use:
1. Multinomial
2. Complement
3. Categorical (don't work for this problem)

Do you want to train with full data?
1. Yes
2. No (70% training, 30% testing)

Which dataset do you want to use?
1. BALANCED
2. ORIGINAL
Training with full data...
Training complete.
Best parameters: {'multinomialnb__alpha': 2.7, 'multinomialnb__fit_prior': False, 'multinomialnb__force_alpha': True}
Score on validation set: 0.9715719063545151
Testing the model...
Testing complete.
Best parameters: {'multinomialnb__alpha': 2.7, 'multinomialnb__fit_prior': False, 'multinomialnb__force_alpha': True}
Score on test set: 0.5454545454545454
Results: 


Unnamed: 0,Input,Prediction,Expected
0,todo refactor this method to reduce complexity,non-negative,non-negative
1,fixme this implementation causes memory leaks,non-negative,negative
2,"this approach works, but might not scale well",negative,negative
3,todo add proper error handling here,non-negative,non-negative
4,this code could be optimized for better perfor...,negative,non-negative
5,fixme potential race condition in multi-thread...,non-negative,negative
6,"this function is currently hard-coded, needs t...",negative,non-negative
7,need to improve logging for better debugging,non-negative,non-negative
8,fixme this workaround might break in future ve...,negative,negative
9,todo clean up deprecated api usage,negative,non-negative
