In [5]:
!pip install -U scikit-learn
!pip3 install -U scikit-learn

zsh:1: command not found: pip
Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.4.2-cp312-cp312-macosx_12_0_arm64.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.4.2 threadpoolctl-3.5.0


In [6]:
import numpy as np
import pandas as pd

In [32]:
df = pd.read_csv('trg.csv')
df.head()

Unnamed: 0,id,class,abstract
0,1,B,the 4 202 353 bp genome of the alkaliphilic ba...
1,2,A,the complete 1751377-bp sequence of the genome...
2,3,E,in 1992 we started assembling an ordered libra...
3,4,E,the aim of this study is to measure human mito...
4,5,B,the amino acid sequence of the spirulina maxim...


## Naive Bayes Model

In [41]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from collections import defaultdict

def preprocess_text(text):
    """
    Preprocess the text by converting it to lowercase and splitting it into words.
    """
    return text.lower().split()

def create_vocabulary(X):
    """
    Create a vocabulary (set of unique words) from the input text data.
    """
    vocab = set()
    for document in X:
        vocab.update(document)
    return vocab

def document_to_vector(document, vocabulary):
    """
    Convert a document into a vector representation based on the vocabulary.
    """
    vector = [0] * len(vocabulary)
    for word in document:
        if word in vocabulary:
            vector[list(vocabulary).index(word)] = 1
    return vector

class NaiveBayes:
    def __init__(self):
        self.vocabulary = None
        self.class_priors = {}
        self.feature_prob = {}
        self.encoder = None

    def fit(self, X, y):
        # Preprocess the text data
        X = [preprocess_text(text) for text in X]

        # Create the vocabulary
        self.vocabulary = create_vocabulary(X)

        # Encode labels
        self.encoder = LabelEncoder()
        y = self.encoder.fit_transform(y)

        # Calculate class priors
        total_instances = len(y)
        unique_classes, class_counts = np.unique(y, return_counts=True)
        self.class_priors = dict(zip(unique_classes, class_counts / total_instances))

        # Calculate feature probabilities
        self.feature_prob = {c: defaultdict(lambda: [0, 0]) for c in unique_classes}

        for c in unique_classes:
            class_indices = [i for i, label in enumerate(y) if label == c]
            for i in class_indices:
                for word in X[i]:
                    word_idx = list(self.vocabulary).index(word)
                    self.feature_prob[c][word_idx][1] += 1
                for word_idx in range(len(self.vocabulary)):
                    if self.feature_prob[c][word_idx][1] == 0:
                        self.feature_prob[c][word_idx][0] += 1

        # Normalize feature probabilities
        for c in self.feature_prob:
            total_features = sum(sum(counts) for counts in self.feature_prob[c].values())
            for feature, counts in self.feature_prob[c].items():
                self.feature_prob[c][feature] = [(count + 1) / (total_features + 2) for count in counts]

    def predict(self, X):
        # Preprocess the text data
        X = [preprocess_text(text) for text in X]

        # Convert text data to vector representation
        X = [document_to_vector(document, self.vocabulary) for document in X]

        predictions = []
        for instance in X:
            class_prob = {}
            for c in self.class_priors:
                class_prob[c] = self.class_priors[c]
                for feature_idx, value in enumerate(instance):
                    class_prob[c] *= self.feature_prob[c][feature_idx][value]
            predictions.append(self.encoder.inverse_transform([max(class_prob, key=class_prob.get)])[0])
        return predictions

if __name__ == '__main__':
    train_data = pd.DataFrame(pd.read_csv('trg.csv'), columns=['abstract', 'class'])
    print("Column names in train_data:", train_data.columns)

    X_train = train_data['abstract']
    y_train = train_data['class']

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    nb = NaiveBayes()
    nb.fit(X_train, y_train)
    y_pred_val = nb.predict(X_val)

    # Evaluate the model's performance
    y_val_enc = nb.encoder.transform(y_val)
    print(f'Validation Accuracy: {accuracy_score(y_val_enc, nb.encoder.transform(y_pred_val))}')

Column names in train_data: Index(['abstract', 'class'], dtype='object')


In [38]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from collections import defaultdict

class NaiveBayes:
    def __init__(self):
        self.class_priors = {}
        self.feature_prob = {}
        self.encoder = None
        self.vectorizer = None

    def fit(self, X, y):
        self.encoder = LabelEncoder()
        y = self.encoder.fit_transform(y)

        # Calculate class priors
        total_instances = len(y)
        unique_classes, class_counts = np.unique(y, return_counts=True)
        self.class_priors = dict(zip(unique_classes, class_counts / total_instances))

        # Convert text data to TF-IDF representation
        self.vectorizer = TfidfVectorizer()
        X = self.vectorizer.fit_transform(X)

        # Calculate feature probabilities
        self.feature_prob = {c: defaultdict(lambda: [0, 0]) for c in unique_classes}

        for c in unique_classes:
            class_indices = [i for i, label in enumerate(y) if label == c]
            for i in class_indices:
                for feature_idx, value in X[i].nonzero()[1]:
                    self.feature_prob[c][feature_idx][int(value != 0)] += 1

        # Normalize feature probabilities
        for c in self.feature_prob:
            total_features = sum(sum(counts) for counts in self.feature_prob[c].values())
            for feature, counts in self.feature_prob[c].items():
                self.feature_prob[c][feature] = [(count + 1) / (total_features + 2) for count in counts]

    def predict(self, X):
        # Convert text data to TF-IDF representation
        X = self.vectorizer.transform(X)

        predictions = []
        for instance in X:
            class_prob = {}
            for c in self.class_priors:
                class_prob[c] = self.class_priors[c]
                for feature_idx, value in instance.nonzero()[1]:
                    class_prob[c] *= self.feature_prob[c][feature_idx][int(value != 0)]
            predictions.append(self.encoder.inverse_transform([max(class_prob, key=class_prob.get)])[0])
        return predictions

if __name__ == '__main__':
    train_data = pd.DataFrame(pd.read_csv('trg.csv'), columns=['abstract', 'class'])
    print("Column names in train_data:", train_data.columns)

    X_train = train_data['abstract']
    y_train = train_data['class']

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    nb = NaiveBayes()
    nb.fit(X_train, y_train)
    y_pred_val = nb.predict(X_val)

    # Evaluate the model's performance
    y_val_enc = nb.encoder.transform(y_val)
    print(f'Validation Accuracy: {accuracy_score(y_val_enc, nb.encoder.transform(y_pred_val))}')

Column names in train_data: Index(['abstract', 'class'], dtype='object')


TypeError: cannot unpack non-iterable numpy.int32 object