# Install sklearn

In [38]:
!pip install -U scikit-learn
!pip3 install -U scikit-learn

zsh:1: command not found: pip


In [116]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

data = pd.read_csv('trg.csv')
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(data.abstract)

data = data.rename(columns={'class': 'class_label'})
X = data.drop('class_label', axis=1)
y = data['class_label']
data  = pd.DataFrame(data= matrix.toarray(), columns = vectorizer.get_feature_names_out())
print(X)
# data.head()

        id                                           abstract
0        1  the 4 202 353 bp genome of the alkaliphilic ba...
1        2  the complete 1751377-bp sequence of the genome...
2        3  in 1992 we started assembling an ordered libra...
3        4  the aim of this study is to measure human mito...
4        5  the amino acid sequence of the spirulina maxim...
...    ...                                                ...
3995  3996  we have isolated and characterized two diureti...
3996  3997  myotonias are muscle diseases in which the fun...
3997  3998  cysteine synthase o-acetylserine sulfhydrylase...
3998  3999  a region of 25 nucleotides is highly conserved...
3999  4000  thermoanaerobacter tengcongensis is a rod-shap...

[4000 rows x 2 columns]


## Standard Naive Bayes Model

In [117]:
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X['abstract'], y, test_size=0.1, random_state=42)

class NaiveBayes:
    def __init__(self):
        self.word_counts = defaultdict(lambda: defaultdict(int))
        self.class_counts = defaultdict(int)
        self.vocab = set()
        self.class_priors = {}

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.vocab = set()
        for cls in self.classes:
            self.class_counts[cls] = sum(1 for label in y if label == cls)
        for abstract, label in zip(X, y):
            for word in set(abstract.split()):
                self.word_counts[label][word] += 1
                self.vocab.add(word)
        self.vocab_size = len(self.vocab)
        total_docs = len(X)
        self.class_priors = {cls: self.class_counts[cls] / total_docs for cls in self.classes}

    def predict(self, documents):
        predictions = []
        for document in documents:
            posterior_probs = {cls: np.log(self.class_priors[cls]) for cls in self.classes}
            document_words = document.split()
            word_counts = {word: document_words.count(word) for word in set(document_words)}
            for word, count in word_counts.items():
                for cls in self.classes:
                    word_count_in_class = self.word_counts[cls].get(word, 0)
                    total_words_in_class = sum(self.word_counts[cls].values())
                    conditional_prob = (word_count_in_class + 1) / (total_words_in_class + self.vocab_size)
                    posterior_probs[cls] += count * np.log(conditional_prob)
            predicted_class = max(posterior_probs, key=posterior_probs.get)
            predictions.append(predicted_class)
        return predictions

nb_clf = NaiveBayes()
nb_clf.fit(X_train, y_train)
y_pred = nb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.93


# Improved with preprocessing

In [118]:
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X['abstract'], y, test_size=0.1, random_state=42)

class NaiveBayes:
    def __init__(self):
        self.word_counts = defaultdict(lambda: defaultdict(int))
        self.class_counts = defaultdict(int)
        self.vocab = set()
        self.class_priors = {}

    def preprocess(self, text):
        words = text.split()
        processed_words = []
        skip_next = False
        skip_next_next = False

        for i, word in enumerate(words):
            if skip_next:
                skip_next = False
                if skip_next_next:
                    skip_next = True
                    skip_next_next = False
                continue

            if word == "homo" and i + 1 < len(words) and words[i + 1] == "sapiens":
                processed_words.append("homo-sapiens")
                skip_next = True
                continue

            if word == "escherichia" and i + 1 < len(words) and words[i + 1] == "coli":
                processed_words.append("escherichia-coli")
                skip_next = True
                continue

            if word == "human" and i + 2 < len(words) and words[i + 1] == "immunodeficiency" and words[i + 2] == "virus":
                processed_words.append("human-immunodeficiency-virus")
                skip_next = True
                skip_next_next = True
                continue

            processed_words.append(word)

        return " ".join(processed_words)

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.vocab = set()

        for cls in self.classes:
            self.class_counts[cls] = sum(1 for label in y if label == cls)

        for abstract, label in zip(X, y):
            abstract = self.preprocess(abstract)
            for word in set(abstract.split()):
                self.word_counts[label][word] += 1
                self.vocab.add(word)

        self.vocab_size = len(self.vocab)
        total_docs = len(X)
        self.class_priors = {cls: self.class_counts[cls] / total_docs for cls in self.classes}

    def predict(self, documents):
        predictions = []
        for document in documents:
            document = self.preprocess(document)
            posterior_probs = {cls: np.log(self.class_priors[cls]) for cls in self.classes}

            document_words = document.split()
            word_counts = {word: document_words.count(word) for word in set(document_words)}

            for word, count in word_counts.items():
                for cls in self.classes:
                    word_count_in_class = self.word_counts[cls].get(word, 0)
                    total_words_in_class = sum(self.word_counts[cls].values())
                    conditional_prob = (word_count_in_class + 1) / (total_words_in_class + self.vocab_size)
                    posterior_probs[cls] += count * np.log(conditional_prob)

            predicted_class = max(posterior_probs, key=posterior_probs.get)
            predictions.append(predicted_class)

        return predictions

nb_clf = NaiveBayes()
nb_clf.fit(X_train, y_train)
y_pred = nb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.93


## Used on tst.csv to for kaggle

In [148]:
# import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the data
data = pd.read_csv('tst.csv')

vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(data.abstract)
X = data.drop('id', axis=1)
print(X)

nb_clf = NaiveBayes()
y_pred = nb_clf.predict(X['abstract'])

df_predictions = pd.DataFrame({'id': range(1, len(y_pred) + 1), 'class': y_pred})
df_predictions.to_csv('predictions.csv', index=False)



                                              abstract
0    in a previous work all three components of com...
1    we compared morphology of two geographically c...
2    factor xiii mr 320000 is a blood coagulation f...
3    we report the characterisation of a human gene...
4    fat tissue plays a critical role in the regula...
..                                                 ...
995  the molecular chaperonins such as groel are no...
996  the cdna sequence of the flavoprotein subunit ...
997  the higher plant arabidopsis thaliana arabidop...
998  the hyperthermophilic euryarchaeon pyrococcus ...
999  the complete mitochondrial dna mtdna molecule ...

[1000 rows x 1 columns]


AttributeError: 'NaiveBayes' object has no attribute 'classes'