In [1]:
import warnings
import re
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB as MNB, ComplementNB as CNB
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline

BALANCED_DATA_CSV = 'data/balanced.csv'
ORIGINAL_DATA_CSV = 'data/original.csv'
TEST_CSV = 'data/test.csv'

warnings.filterwarnings('ignore')

class BaseNB:
    def __init__(self):
        self.gs = None

    def fit(self, x_train, y_train):
        self.gs.fit(x_train, y_train)

    def predict(self, x_test):
        return self.gs.predict(x_test)
    

class MultinomialNB(BaseNB):
    def __init__(self):
        super().__init__()
        self.gs = GridSearchCV(
            make_pipeline(TfidfVectorizer(), MNB()), 
            param_grid={
                'multinomialnb__fit_prior': [True, False],  
                'multinomialnb__force_alpha': [True, False],
                'multinomialnb__alpha': [i/10 for i in range(1, 30)]
            }, return_train_score=True
        )

class TextClassifier:
    def __init__(self, naive_bayes_model):
        self.labelEncoder = LabelEncoder()
        self.model = naive_bayes_model

    def _transform_text(self, text_array):
        def to_lower(text):
            return text.lower()
        
        def remove_spaces(text):
            return " ".join(text.split()).strip()
        
        def sanatize(text):
            return re.sub('[.:;/$@&*\'"]', '', text) 
        
        return [to_lower(remove_spaces(sanatize(text))) for text in text_array]
    
    def train(self, csv_file, test_size=0.3):
        df = pd.read_csv(csv_file, header=0)

        x = self._transform_text(df[df.columns[0]])
        y = self.labelEncoder.fit_transform(df[df.columns[1]])

        xtrain, xtest, ytrain, ytest = x, x, y, y
        if (test_size>0):
            xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=test_size)
        
        self.model.fit(xtrain, ytrain)

        print("Training complete.")
        print("Best parameters:", self.model.gs.best_params_)
        print("Score on validation set:", self.model.gs.score(xtest, ytest))

        return self.model.gs.best_params_, self.model.gs.score(xtest, ytest)

    def test(self, csv_file):
        if not self.model.gs:
            raise ValueError("Model has not been trained yet.")

        df = pd.read_csv(csv_file, header=0)
        x_test = self._transform_text(df[df.columns[0]])
        y_test = df[df.columns[1]]

        y_test_pred = self.model.predict(x_test)

        print("Testing complete.")
        print("Best parameters:", self.model.gs.best_params_)
        print("Score on test set:", self.model.gs.score(x_test, self.labelEncoder.transform(y_test)))

        table = pd.DataFrame(
            zip(x_test, [self.labelEncoder.classes_[result] for result in y_test_pred], y_test),
            columns=["Input", "Prediction", "Expected"]
        )
        print("Results: ")

        return table

## Train with balanced data

In [2]:
classifier = TextClassifier(MultinomialNB())
classifier.train(BALANCED_DATA_CSV)
display(classifier.test(TEST_CSV))
print()
classifier = TextClassifier(MultinomialNB())
classifier.train(BALANCED_DATA_CSV, test_size=0)
display(classifier.test(TEST_CSV))

Training complete.
Best parameters: {'multinomialnb__alpha': 1.4, 'multinomialnb__fit_prior': False, 'multinomialnb__force_alpha': True}
Score on validation set: 0.6111111111111112
Testing complete.
Best parameters: {'multinomialnb__alpha': 1.4, 'multinomialnb__fit_prior': False, 'multinomialnb__force_alpha': True}
Score on test set: 0.45454545454545453
Results: 


Unnamed: 0,Input,Prediction,Expected
0,todo refactor this method to reduce complexity,non-negative,non-negative
1,fixme this implementation causes memory leaks,non-negative,negative
2,"this approach works, but might not scale well",negative,negative
3,todo add proper error handling here,negative,non-negative
4,this code could be optimized for better perfor...,negative,non-negative
5,fixme potential race condition in multi-thread...,non-negative,negative
6,"this function is currently hard-coded, needs t...",negative,non-negative
7,need to improve logging for better debugging,non-negative,non-negative
8,fixme this workaround might break in future ve...,negative,negative
9,todo clean up deprecated api usage,negative,non-negative



Training complete.
Best parameters: {'multinomialnb__alpha': 2.7, 'multinomialnb__fit_prior': False, 'multinomialnb__force_alpha': True}
Score on validation set: 0.9715719063545151
Testing complete.
Best parameters: {'multinomialnb__alpha': 2.7, 'multinomialnb__fit_prior': False, 'multinomialnb__force_alpha': True}
Score on test set: 0.5454545454545454
Results: 


Unnamed: 0,Input,Prediction,Expected
0,todo refactor this method to reduce complexity,non-negative,non-negative
1,fixme this implementation causes memory leaks,non-negative,negative
2,"this approach works, but might not scale well",negative,negative
3,todo add proper error handling here,non-negative,non-negative
4,this code could be optimized for better perfor...,negative,non-negative
5,fixme potential race condition in multi-thread...,non-negative,negative
6,"this function is currently hard-coded, needs t...",negative,non-negative
7,need to improve logging for better debugging,non-negative,non-negative
8,fixme this workaround might break in future ve...,negative,negative
9,todo clean up deprecated api usage,negative,non-negative


## Train with full data

In [3]:
classifier = TextClassifier(MultinomialNB())
classifier.train(ORIGINAL_DATA_CSV)
display(classifier.test(TEST_CSV))
print()
classifier = TextClassifier(MultinomialNB())
classifier.train(ORIGINAL_DATA_CSV, test_size=0)
display(classifier.test(TEST_CSV))

Training complete.
Best parameters: {'multinomialnb__alpha': 1.8, 'multinomialnb__fit_prior': False, 'multinomialnb__force_alpha': True}
Score on validation set: 0.7023411371237458
Testing complete.
Best parameters: {'multinomialnb__alpha': 1.8, 'multinomialnb__fit_prior': False, 'multinomialnb__force_alpha': True}
Score on test set: 0.5454545454545454
Results: 


Unnamed: 0,Input,Prediction,Expected
0,todo refactor this method to reduce complexity,non-negative,non-negative
1,fixme this implementation causes memory leaks,non-negative,negative
2,"this approach works, but might not scale well",non-negative,negative
3,todo add proper error handling here,non-negative,non-negative
4,this code could be optimized for better perfor...,non-negative,non-negative
5,fixme potential race condition in multi-thread...,non-negative,negative
6,"this function is currently hard-coded, needs t...",negative,non-negative
7,need to improve logging for better debugging,non-negative,non-negative
8,fixme this workaround might break in future ve...,non-negative,negative
9,todo clean up deprecated api usage,non-negative,non-negative



Training complete.
Best parameters: {'multinomialnb__alpha': 1.6, 'multinomialnb__fit_prior': False, 'multinomialnb__force_alpha': True}
Score on validation set: 0.8450704225352113
Testing complete.
Best parameters: {'multinomialnb__alpha': 1.6, 'multinomialnb__fit_prior': False, 'multinomialnb__force_alpha': True}
Score on test set: 0.6363636363636364
Results: 


Unnamed: 0,Input,Prediction,Expected
0,todo refactor this method to reduce complexity,non-negative,non-negative
1,fixme this implementation causes memory leaks,non-negative,negative
2,"this approach works, but might not scale well",negative,negative
3,todo add proper error handling here,non-negative,non-negative
4,this code could be optimized for better perfor...,non-negative,non-negative
5,fixme potential race condition in multi-thread...,non-negative,negative
6,"this function is currently hard-coded, needs t...",negative,non-negative
7,need to improve logging for better debugging,non-negative,non-negative
8,fixme this workaround might break in future ve...,negative,negative
9,todo clean up deprecated api usage,negative,non-negative
