In [1]:
# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd

import spacy

# Sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin


# Built-in library
import itertools
import re
import json
from typing import Union, Optional, Any
from pathlib import Path
import logging
import warnings

warnings.filterwarnings("ignore")

# pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
def extract_text_data(*, filepath: Path, label: int) -> pd.DataFrame:
    """This loads the text data, assigns a label and returns a DF."""
    with open(filepath, "r") as file:
        # Remove empty lines: using len(line) > 5
        data = [(line.strip(), label) for line in file.readlines() if len(line) > 5]
    # Convert to DF
    df = pd.DataFrame(data=data, columns=["text", "label"])
    return df


def remove_punctuations(text: str) -> str:
    import string

    """This returns the text without punctuations"""
    cleaned_text = re.compile(pattern=f"[{re.escape(string.punctuation)}]").sub(
        repl="", string=str(text)
    )
    return cleaned_text

In [3]:
fp = "edgar_allan_poe.txt"
fp1 = "robert_frost.txt"
label_0, label_1 = 0, 1

edgar_allan_poe = extract_text_data(filepath=fp, label=label_0)
robert_frost = extract_text_data(filepath=fp1, label=label_1)

edgar_allan_poe.head()

Unnamed: 0,text,label
0,LO! Death hath rear'd himself a throne,0
1,"In a strange city, all alone,",0
2,Far down within the dim west,0
3,"Where the good, and the bad, and the worst, and the best,",0
4,Have gone to their eternal rest.,0


In [4]:
robert_frost.head()

Unnamed: 0,text,label
0,"Two roads diverged in a yellow wood,",1
1,And sorry I could not travel both,1
2,"And be one traveler, long I stood",1
3,And looked down one as far as I could,1
4,To where it bent in the undergrowth;,1


In [14]:
# Create the data

RANDOM_STATE = 2
TEST_SIZE = 0.1

data = pd.concat([edgar_allan_poe, robert_frost], axis="rows").reset_index(drop=True)
# Remove punctuations
data = data.assign(text=data["text"].apply(remove_punctuations))

data.sample(n=10, random_state=RANDOM_STATE)

Unnamed: 0,text,label
1703,Not bluebells gracing a tunnel mouth,1
1275,Of door and headboard Where it wants to get,1
294,Hath ever told or is it of a thought,0
551,To the Lethean peace of the skies,0
426,Then desolately fall,0
662,That from new fountains overflow,0
2055,Whats this,1
611,Of its own fervour what had oer it power,0
1303,The way a man with one leg and a crutch,1
1641,How shall we,1


### Split The Data Into Train And Validation Sets

In [None]:
X = data["text"]
y = data["label"]

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

X_train.shape, X_valid.shape

In [9]:
from abc import ABC, abstractmethod


class BaseModel(ABC):
    """This is the blueprint used for building ML models."""

    @abstractmethod
    def __repr__(self) -> str:
        """This is the string representation of the model"""
        pass

    def fit(
        self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]
    ) -> None:
        """This is used to train the model."""
        return self

    def predict(
        self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]
    ) -> None:
        """This is used to make predictions using new data."""
        pass

In [19]:
data["text"].head()

0                     LO Death hath reard himself a throne
1                              In a strange city all alone
2                             Far down within the dim west
3    Where the good and the bad and the worst and the best
4                          Have gone to their eternal rest
Name: text, dtype: object

### Multi-Nomial Naive Bayes

* **Transition Matrix** $\mathbf{A_{ij}}$: This is the probability of a state, `t`, given a previous state, `t-1`. This is a matrix (2-D array).
  
$$
\mathbf{A_{ij}} = p(s_{t} = j | s_{t-1} = i)
$$

* Predicted/Estimate Transition Matrix

$$
\mathbf{\hat{A}_{ij}} = \frac{count(i \rightarrow j)}{count(i)}
$$

* **Initial State Distribution** $\mathbf{\pi_{i}} $: This is the probability of an initial state in a sequence. This is a vector (1-D array).

$$
\mathbf{\pi_{i}} = p(s_{1} = i)
$$

* Estimated Initial State Distribution

$$
\mathbf{\hat{\pi_{i}}} = \frac{count(s_{1} = i)}{N}
$$


```text
where
N = Number of sequences.
```

$$
\mathbf{p(y|X)} = \frac{p(X|y).p(y)}{p(X)}
$$

```text
where
p(y|X) = Posterior probability
p(X|y) = Class conditional probability or likelihood
p(y) = Prior probability of y
p(X) = Marginal probability of X
```

* Since $p(X)$ does not depend on `y`, we can ignore it.

$$
\mathbf{p(y|X)} = {p(X|y).p(y)}
$$

* If we have `n` features, it becomes:

$$
\mathbf{p(y|X)} = {p(x_{1}|y).p(x_{2}|y)...p(x_{n}|y).p(y)}
$$

* Taking the log, we have:

$$
\mathbf{p(y|X)} = {log(p(x_{1}|y))+log(p(x_{2}|y))+...+log(p(x_{n}|y))+log(p(y)})
$$


**Note**: $\hat{A}_{ij}$ and $\hat{\pi_{i}}$ will be used to model the `class conditional probability`.


<hr>

## Steps Required To Build The Model From Scratch

### Training:

1. Determine the vocabulary and tokenize the documents.
2. Initialize the parameters: $\hat{A}_{ij}$, $\hat{\pi_{i}}$, and $p(y)$ for each class label. \
i.e we need to initialize two variables per parameters since we have two class labels. e.g.  $\hat{A}0_{ij}$,  $\hat{A}1_{ij}$, etc.
3. Compute the count of A and Pi. i.e A_hat and Pi_hat.
4. Calculate the log probabilities of A_hat and Pi_hat.
5. 

### Making Predictions

$$
\mathbf{p(y|X)} = argmax({log(p(x_{1}|y))+log(p(x_{2}|y))+...+log(p(x_{n}|y))+log(p(y)}))
$$

1. Calculate the posteriors. i.e for the classes.
2. Find `y` by calculating the `argmax` of the class given the input over all classes. 

In [122]:
class MultiNomial_NB(BaseModel):
    def __init__(self) -> None:
        self.transition_matrix = None
        self.initial_state_distr = None
        self.priors = None
        self.vocab = {"unk": 0}

    def __repr__(self) -> str:
        return f"{__class__.__name__}()"

    def fit(
        self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]
    ) -> None:
        return self

    def _init_params(
        self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]
    ):
        """This is used to initialize the training parameters."""
        A_0, Pi_0 = MultiNomial_NB._init_A_and_Pi(X=X)
        A_1, Pi_1 = MultiNomial_NB._init_A_and_Pi(X=X)
        log_y1, log_y2 = MultiNomial_NB._log_priors(y=y)
        pass

    @staticmethod
    def _init_A_and_Pi(*, X: Union[np.ndarray, pd.Series]) -> tuple:
        """This is used to initialize the state transition matrix
        and the initial state distribution."""
        vocab = MultiNomial_NB._get_vacabulary(documents=X)
        V = len(vocab)
        # Add add-one smoothering
        # A is a matrix and Pi is a vector
        A = np.ones(shape=(V, V), dtype=float)
        Pi = np.ones(shape=(V), dtype=float)
        return (A, Pi)

    @staticmethod
    def _get_vacabulary(*, documents: pd.Series) -> dict:
        """This is used to create the vocabulary of the corpus."""
        # Create a dict that'll be used to store unique words and their integer
        # mappings as key-value pairs. Add a custom token `unk` with a value of 0
        # which will be used for tokens that are not present in the training data.
        vocab = {"unk": 0}
        count = 1

        for doc in documents:
            # Tokenize the document
            tokenized_doc = [x.lower() for x in doc.split()]
            for term in tokenized_doc:
                if term not in vocab:
                    vocab[term] = count
                    count += 1
        return vocab

    @staticmethod
    def _tokenize_document(*, X: Union[np.ndarray, pd.Series]) -> list[int]:
        """This is used to tokenize the documents.
        It returns the tokenized documents as list of integers."""
        tokenized_documents = []
        vocab = MultiNomial_NB._get_vacabulary(documents=X)
        for doc in X:
            # Tokenize the document
            tokenized_doc = [x.lower() for x in doc.split()]
            # Get the integer values of the tokens
            tokenized_doc = [vocab.get(term) for term in tokenized_doc]
            tokenized_documents.append(tokenized_doc)

        return tokenized_documents

    @staticmethod
    def _log_priors(*, y: np.ndarray) -> list[float]:
        """This returns the log priors of y.

        Returns:
            log_probs: A list containing the log probabilities
            of the class labels.
        """
        # Get the counts; calculate the log probabilities
        # using the probabilities obtained from the counts.
        counts = np.bincount(y)
        probs = counts / len(y)
        log_probs = [(np.log(p_i)) for p_i in probs if p_i > 0]
        return log_probs

    @staticmethod
    def _count_state_transitions(
        *, X: Union[np.ndarray, pd.Series]
    ) -> tuple[np.ndarray]:
        """This is used to count the occurrences of transitions.
        i.e calculate the counts of A_hat and Pi_hat.

        Returns:
            (A_hat, Pi_hat)
        """
        # Init params
        A_hat, Pi_hat = MultiNomial_NB._init_A_and_Pi(X=X)

        # To calculate the Pi_hat, we need to count the number of times
        # the initial state was `i` divided by the number of state sequences.
        # Pi_hat = (count(state = i) / N)
        # A_hat: count of the number of times we transitioned from the prev state `i`
        # to the current state `j` divided by the count of the prev state `i`.
        # i.e. A_hat = ( count(state_i to state_j) / (count(state_i)) )
        tokenized_documents = MultiNomial_NB._tokenize_document(X=X)
        for tokenized_doc in tokenized_documents:
            prev_token = None
            for token in tokenized_doc:
                if prev_token is None:
                    Pi_hat[token] += 1
                else:
                    A_hat[prev_token, token]
        return (A_hat, Pi_hat)

    @staticmethod
    def _calculate_log_likelihoods(
        *, X: Union[np.ndarray, pd.Series]
    ) -> tuple[np.ndarray]:
        """This is used to calculate the log of the class conditional
        probability given a specific class label.

        Returns:
            (log(A_hat), log(Pi_hat))
        """
        A_hat, Pi_hat = MultiNomial_NB._count_state_transitions(X=X)
        # Calculate the probabilities
        A_hat /= A_hat.sum(axis=1, keepdims=True)  # OR A_hat/ A_hat.shape[0]
        Pi_hat /= Pi_hat.sum(axis=0)
        return (np.log(A_hat), np.log(Pi_hat))

    def predict(
        self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]
    ) -> None:
        return

In [123]:
m_nb = MultiNomial_NB()
m_nb

MultiNomial_NB()

In [71]:
df = data.sample(n=20, random_state=1).copy()

df.head()

Unnamed: 0,text,label
301,And yet it need not be that object hid,0
1826,Had checked the pace,1
669,The rare and radiant flowers of song,0
737,And that has made all the difference,1
2053,The upper shelf the tin box Thats the one,1


In [124]:
A_hat, Pi_hat = m_nb._calculate_log_likelihoods(X=df["text"])

Pi_hat

array([-5.45532112, -3.15273602, -4.76217393, -4.06902675, -4.76217393,
       -4.35670883, -4.35670883, -4.06902675, -4.76217393, -4.76217393,
       -4.06902675, -4.76217393, -2.97041447, -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -3.8458832 , -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -4.06902675, -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.06902675,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.76217393,
       -4.35670883, -4.76217393, -4.76217393, -4.35670883, -3.8458832 ,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.35670883,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.76

In [126]:
A_hat, Pi_hat = m_nb._count_state_transitions(X=df["text"])
np.log(Pi_hat / 234)

array([-5.45532112, -3.15273602, -4.76217393, -4.06902675, -4.76217393,
       -4.35670883, -4.35670883, -4.06902675, -4.76217393, -4.76217393,
       -4.06902675, -4.76217393, -2.97041447, -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -3.8458832 , -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -4.06902675, -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.06902675,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.76217393,
       -4.35670883, -4.76217393, -4.76217393, -4.35670883, -3.8458832 ,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.35670883,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.76217393,
       -4.76217393, -4.76217393, -4.76217393, -4.76217393, -4.76

In [118]:
Pi_hat.sum()

234.0

234.0

In [81]:
14 / 20

0.7