# Text Classification Using Probalistic Models

* Markok Models

In [1]:
# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd

import spacy

# Sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin


# Custom imports
from src.text_summarizer import Tokenizer


# Built-in library
import itertools
import re
import json
from typing import Union, Optional, Any
from pathlib import Path
import logging
import warnings

warnings.filterwarnings("ignore")

# pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

### Download The Data.

In [2]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt

File ‘edgar_allan_poe.txt’ already there; not retrieving.



In [3]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

File ‘robert_frost.txt’ already there; not retrieving.



In [4]:
def extract_text_data(*, filepath: Path, label: int) -> pd.DataFrame:
    """This loads the text data, assigns a label and returns a DF."""
    with open(filepath, "r") as file:
        # Remove empty lines: using len(line) > 5
        data = [(line.strip(), label) for line in file.readlines() if len(line) > 5]
    # Convert to DF
    df = pd.DataFrame(data=data, columns=["text", "label"])
    return df


def remove_punctuations(text: str) -> str:
    import string

    """This returns the text without punctuations"""
    cleaned_text = re.compile(pattern=f"[{re.escape(string.punctuation)}]").sub(
        repl="", string=str(text)
    )
    return cleaned_text

In [5]:
fp = "edgar_allan_poe.txt"
fp1 = "robert_frost.txt"
label_0, label_1 = 0, 1

edgar_allan_poe = extract_text_data(filepath=fp, label=label_0)
robert_frost = extract_text_data(filepath=fp1, label=label_1)

edgar_allan_poe.head()

Unnamed: 0,text,label
0,LO! Death hath rear'd himself a throne,0
1,"In a strange city, all alone,",0
2,Far down within the dim west,0
3,"Where the good, and the bad, and the worst, and the best,",0
4,Have gone to their eternal rest.,0


In [6]:
robert_frost.head()

Unnamed: 0,text,label
0,"Two roads diverged in a yellow wood,",1
1,And sorry I could not travel both,1
2,"And be one traveler, long I stood",1
3,And looked down one as far as I could,1
4,To where it bent in the undergrowth;,1


In [7]:
RANDOM_STATE = 2
TEST_SIZE = 0.1

data = pd.concat([edgar_allan_poe, robert_frost], axis="rows").reset_index(drop=True)
data.sample(n=10, random_state=RANDOM_STATE)

Unnamed: 0,text,label
1703,Not bluebells gracing a tunnel mouth-,1
1275,Of door and headboard. Where it wants to get,1
294,Hath ever told- or is it of a thought,0
551,To the Lethean peace of the skies-,0
426,"Then desolately fall,",0
662,"That from new fountains overflow,",0
2055,'What's this?',1
611,Of its own fervour - what had o'er it power.,0
1303,"The way a man with one leg and a crutch,",1
1641,'How shall we?',1


In [8]:
# Remove punctuations
data = data.assign(text=data["text"].apply(remove_punctuations))

data.sample(n=10, random_state=RANDOM_STATE)

Unnamed: 0,text,label
1703,Not bluebells gracing a tunnel mouth,1
1275,Of door and headboard Where it wants to get,1
294,Hath ever told or is it of a thought,0
551,To the Lethean peace of the skies,0
426,Then desolately fall,0
662,That from new fountains overflow,0
2055,Whats this,1
611,Of its own fervour what had oer it power,0
1303,The way a man with one leg and a crutch,1
1641,How shall we,1


### Split The Data Into Train And Validation Sets

In [9]:
X = data["text"]
y = data["label"]

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

X_train.shape, X_valid.shape

((1936,), (216,))

### Create Bag of Words (word2idx)

In [10]:
# Add an index for unknown words. This will be used if the word
# is present in the validation set but NOT in the train set.
word2idx = {"<unk>": 0}
idx = 1  # Initialize the index

# Populate word2idx using the train set. Tokenize each sentence,
# if the token isn't in word2idx, add it and increment the idx.
for txt_ in X_train:
    tokenizer = Tokenizer()
    tokenized_text = tokenizer(doc=txt_)
    for token in tokenized_text:
        if token not in word2idx:
            word2idx[token] = idx
            idx += 1

In [11]:
# Slice the dictionary
dict(itertools.islice(word2idx.items(), 10))

{'<unk>': 0,
 'in': 1,
 'the': 2,
 'realms': 3,
 'of': 4,
 'boreal': 5,
 'pole': 6,
 'i': 7,
 'had': 8,
 'a': 9}

### Vectorize The Train And Validation Set

In [12]:
def vectorize_text(*, word_2_idx: dict, data: pd.Series) -> list[list[int]]:
    """This is used to convert text to vectors."""
    vectorized_doc = []
    data = data.copy()

    for txt_ in data:
        tokenizer = Tokenizer()
        tokenized_text = tokenizer(doc=txt_)
        # Extract the index from the dict. i.e convert each token to int
        sent_idx = [word_2_idx.get(token, 0) for token in tokenized_text]
        vectorized_doc.append(sent_idx)
    return vectorized_doc

In [13]:
#  Vectorize the data
X_train_vec = vectorize_text(word_2_idx=word2idx, data=X_train)
X_valid_vec = vectorize_text(word_2_idx=word2idx, data=X_valid)

X_train_vec[:4]

[[1, 2, 3, 4, 2, 5, 6],
 [7, 8, 9, 10, 4, 11, 12, 13],
 [14, 15, 16],
 [17, 18, 19, 20]]

### Initialize State Transition Matrix (A) and Initial State Distribution (Pi)

1. **A**: This is a matrix (2-D array) that's used to store the probability that a state at time **`t`** is **`j`**, given that the state at time **`t-1`** was **`i`**. i.e the probability of a **state** given the **previous state**.

$$
A_{ij} = p(s_{t} = j | s_{t-1} = i) 
$$

>Estimated **`A`** is the number of times we transition from state **`i`** to state **`j`** divided by the total number of times we were in state **`i`**.

$$
\hat{A}_{ij} = \frac{count(i \rightarrow j)}{count(i)}
$$

2. **Pi**: This is a vector that's used to store the probability of the **inital state** in a **sequence**.

$$
Pi_{i} = p(s_{1} = i) 
$$

>Estimated **`Pi`** is the number of times the sequence started at state **`i`** divided by the number of sequences in the dataset, **`N`**.

$$
\hat{Pi}_{i} = \frac{count(s_{1} = i)}{N}
$$

In [14]:
# Since we have 2 classes (edgar_allan_poe, robert_frost),
# we need to build 2 models. i.e A_0, Pi_0 and A_1, Pi_1
V = len(word2idx)  # Vocabulary or number of states

A_0 = np.ones((V, V))  # Matrix with add-one smoothering
Pi_0 = np.ones(V)  # Vector with add-one smoothering

A_1 = np.ones((V, V))  # Matrix with add-one smoothering
Pi_1 = np.ones(V)  # Vector with add-one smoothering

In [29]:
def compute_counts(
    *, vectorize_doc: list[int], A: np.ndarray, Pi: np.ndarray
) -> tuple[np.ndarray, np.ndarray]:
    """This is used to populate A and Pi, normalize the values and
    calculate log probabilities. i.e count the number of occurrences
    of each state and divide by the total number of posible occurrences
    and find the log probability to prevent overflow error.

    Params:
        vectorize_doc: The vectorized doc. i.e List of int.
        A: The state transition matrix.
        Pi: The initial state distibution.

    Returns:
        (log_A, log_Pi): Tuple containing the log prob of A and Pi.
    """
    for tokenized_doc in vectorize_doc:
        prev_idx = None  # previous state/first word
        for idx in tokenized_doc:
            # If there's no prev state/word. i.e it's the first word.
            if prev_idx is None:
                Pi[idx] += 1
            else:
                # A prev word exists, count a transition
                A[prev_idx, idx] += 1

            # Update the prev idx with the current idx
            prev_idx = idx

    return (A, Pi)

In [30]:
# 1st model (edgar_allan_poe)
A_0, Pi_0 = compute_counts(
    vectorize_doc=[_data for _data, y in zip(X_train_vec, y_train) if y == 0],
    A=A_0,
    Pi=Pi_0,
)

# 2nd model (robert_frost)
A_1, Pi_1 = compute_counts(
    vectorize_doc=[_data for _data, y in zip(X_train_vec, y_train) if y == 1],
    A=A_1,
    Pi=Pi_1,
)

# Normalize i.e calculate probabilities for 1st model
A_0 /= A_0.sum(axis=1, keepdims=True)  # Returns in 2-D array
Pi_0 /= Pi_0.sum()  # Returns in 1-D array

A_0[:5]

array([[3.60750361e-04, 3.60750361e-04, 3.60750361e-04, ...,
        3.60750361e-04, 3.60750361e-04, 3.60750361e-04],
       [3.37812730e-06, 3.37812730e-06, 3.49639554e-01, ...,
        3.37812730e-06, 3.37812730e-06, 3.37812730e-06],
       [1.13941156e-06, 1.13941156e-06, 1.13941156e-06, ...,
        1.13941156e-06, 1.13941156e-06, 1.13941156e-06],
       [1.20163422e-04, 1.20163422e-04, 1.20163422e-04, ...,
        1.20163422e-04, 1.20163422e-04, 1.20163422e-04],
       [2.01263938e-06, 2.01263938e-06, 1.59819668e-01, ...,
        2.01263938e-06, 2.01263938e-06, 2.01263938e-06]])

In [17]:
# Normalize i.e calculate probabilities for 2nd model
A_1 /= A_1.sum(axis=1, keepdims=True)  # Returns in 2-D array
Pi_1 /= Pi_1.sum()  # Returns in 1-D array

A_1[:5]

array([[0.00036075, 0.00036075, 0.00036075, ..., 0.00036075, 0.00036075,
        0.00036075],
       [0.00034165, 0.00034165, 0.01469081, ..., 0.00034165, 0.00034165,
        0.00068329],
       [0.00030628, 0.00030628, 0.00030628, ..., 0.00030628, 0.00030628,
        0.00030628],
       [0.00036075, 0.00036075, 0.00036075, ..., 0.00036075, 0.00036075,
        0.00036075],
       [0.00033852, 0.00033852, 0.00947867, ..., 0.00033852, 0.00033852,
        0.00033852]])

In [18]:
# Find log A and pi since we don't need the actual probs
log_A_0 = np.log(A_0)
log_Pi_0 = np.log(Pi_0)

log_A_1 = np.log(A_1)
log_Pi_1 = np.log(Pi_1)

In [19]:
log_A_0[:5]

array([[-7.92732436, -7.92732436, -7.92732436, ..., -7.92732436,
        -7.92732436, -7.92732436],
       [-7.96346007, -7.96346007, -4.35254215, ..., -7.96346007,
        -7.96346007, -7.96346007],
       [-8.02551639, -8.02551639, -8.02551639, ..., -8.02551639,
        -8.02551639, -8.02551639],
       [-7.9280456 , -7.9280456 , -7.9280456 , ..., -7.9280456 ,
        -7.9280456 , -7.9280456 ],
       [-7.98616486, -7.98616486, -4.65396035, ..., -7.98616486,
        -7.98616486, -7.98616486]])

In [20]:
# Compute priors
count_0 = sum(y == 0 for y in y_train)
count_1 = sum(y == 1 for y in y_train)
total = len(y_train)
p_0 = count_0 / total
p_1 = count_1 / total
print(f"Priors: \np_0: {p_0}, p_1: {p_1}\n\n")

log_p_0 = np.log(p_0)
log_p_1 = np.log(p_1)

Priors: 
p_0: 0.3347107438016529, p_1: 0.6652892561983471




### Comment

* Since the priors are are not uniformly distributed. i.e **prob** of **`p_0`** and **`p_1`** are not equal, we can't use the Maximum Likelihood equation which is:

$$
log(p(class = k| input)) = k^*
$$


$$
k^* = argmax_{k} (log(p(input | class = k))
$$

* **Maximum a Posteriori (MAP)** which includes the priors will be used instead. This is the `argmax` of the probability of class = `k`, given the `input` over all classes `k`
* This means that if we're trying to predict the class (label) `k`, the probability with the highest value belongs to class `k` since each model was trained using only inputs corresponding to the desired class.

$$
k^* = argmax_{k} (log(p(input | class = k)) + log(p(class = k)))
$$

#### Note:
>`k` is the ***class*** (or label). In this example, `k=0` or `k=1`


### Putting It All Together

* Build a `Text Classifier` that has a similar API to sklearn. i.e it has a fit (for model training) and predict for making predictions.

In [21]:
class MarkovClassifier:
    """This classifier uses MArkov model for classifiication.
    It's trained using two models. i.e for 2 classes (labels)."""

    def __init__(self, log_A_s: list, log_Pi_s: list, log_priors: list) -> None:
        self.log_A_s = log_A_s
        self.log_Pi_s = log_Pi_s
        self.log_priors = log_priors
        self.K = len(log_priors)

    def _compute_log_likelihood(self, input_: list[int], class_: int) -> None:
        """This returns the log of the probabilities."""
        # Extract the log of A and Pi for the given class (label)
        log_A = self.log_A_s[class_]
        log_Pi = self.log_Pi_s[class_]

        # Initialize variables
        prev_idx = None
        log_prob = 0

        for idx in input_:
            # If it's the first token, replace
            # it with the probability from log_Pi.
            if prev_idx is None:
                log_prob += log_Pi[idx]

            # replace it with the probability from log_A.
            else:
                log_prob += log_A[prev_idx, idx]

            # Update the value (for the next iteration)
            prev_idx = idx
        return log_prob

    def predict(self, inputs: list[list[int]]) -> list[int]:
        """This is used to make predictions using the trained Markov model."""
        predictions = np.zeros(len(inputs))  # Initialize
        # Make predictions for every sentence in inputs
        for idx, input_ in enumerate(inputs):
            posteriors = [
                self._compute_log_likelihood(input_=input_, class_=class_)
                + self.log_priors[class_]
                for class_ in range(self.K)
            ]
            pred = np.argmax(posteriors)
            predictions[idx] = pred
        return predictions

In [22]:
log_A_s = [log_A_0, log_A_1]
log_Pi_s = [log_Pi_0, log_Pi_1]
log_priors = [log_p_0, log_p_1]

clf = MarkovClassifier(log_A_s=log_A_s, log_Pi_s=log_Pi_s, log_priors=log_priors)

In [23]:
y_pred_train = clf.predict(inputs=X_train_vec)
y_pred_train

array([0., 1., 1., ..., 0., 1., 1.])

In [24]:
# Calculate the accuracy
np.mean(y_pred_train == y_train)

0.9963842975206612

In [25]:
y_pred_valid = clf.predict(inputs=X_valid_vec)

# Calculate the accuracy
np.mean(y_pred_valid == y_valid)

0.8564814814814815

### Comment

* The classes are imbalanced (p0 = 0.34) and (p1 = 0.66) so using **`accuracy`** might not be the best evaluation metric.
* **`ROCAUC`** and **`F1 score`** are better metrics

In [26]:
from sklearn import metrics

# Training Set
confusion_matrix_train = metrics.confusion_matrix(y_true=y_train, y_pred=y_pred_train)
confusion_matrix_train

array([[ 641,    7],
       [   0, 1288]])

In [27]:
# Validation Set
confusion_matrix_valid = metrics.confusion_matrix(y_true=y_valid, y_pred=y_pred_valid)
confusion_matrix_valid

array([[ 44,  26],
       [  5, 141]])

In [28]:
f1_score_train = metrics.f1_score(y_true=y_train, y_pred=y_pred_train)
f1_score_valid = metrics.f1_score(y_true=y_valid, y_pred=y_pred_valid)

f1_score_train, f1_score_valid

(0.997289972899729, 0.9009584664536742)

In [86]:
import numpy as np


class MarkovClassifier:
    """This classifier uses MArkov model for classifiication.\n
    It's trained using two models. i.e for 2 classes (labels).

    Params:
        word2idx (dict): A dictionary containing the vocabulary. i.e Bag of Words.

    Returns:
        None
    """

    def __init__(self, word2idx: dict) -> None:
        self.log_A_s = None
        self.log_Pi_s = None
        self.log_priors = None
        self.K = None
        self.word2idx = word2idx

    def __repr__(self) -> str:
        return f"{__class__.__name__}()"

    def _initialize_A_n_Pi(self) -> tuple[np.ndarray, np.ndarray]:
        """This is used to initialise the matrices."""
        # Since we have 2 classes (edgar_allan_poe, robert_frost),
        # we need to build 2 models. i.e A_0, Pi_0 and A_1, Pi_1
        V = len(self.word2idx)  # Vocabulary or number of states

        A = np.ones((V, V))  # Matrix with add-one smoothering
        Pi = np.ones(V)  # Vector with add-one smoothering
        return (A, Pi)

    def _compute_counts(
        self, X: list[int], A: np.ndarray, Pi: np.ndarray
    ) -> tuple[np.ndarray, np.ndarray]:
        """This is used to populate A and Pi, normalize the values and
        calculate log probabilities. i.e count the number of occurrences
        of each state and divide by the total number of posible occurrences
        and find the log probability to prevent overflow error.

        Params:
            X: The vectorized doc. i.e List of int.
            A: The state transition matrix.
            Pi: The initial state distibution.

        Returns:
            (log_A, log_Pi): Tuple containing the log prob of A and Pi.
        """
        for tokenized_doc in X:
            prev_idx = None  # previous state/first word
            for idx in tokenized_doc:
                # If there's no prev state/word. i.e it's the first word.
                if prev_idx is None:
                    Pi[idx] += 1
                else:
                    # A prev word exists, count a transition
                    A[prev_idx, idx] += 1

                # Update the prev idx with the current idx
                prev_idx = idx
        return (A, Pi)

    def _calculate_probabilities(
        self, A: np.ndarray, Pi: np.ndarray
    ) -> tuple[np.ndarray, np.ndarray]:
        """This is used to normalize the arrays. i.e calculate
        log probabilities."""
        A /= A.sum(axis=1, keepdims=True)  # Returns in 2-D array
        Pi /= Pi.sum()  # Returns in 1-D array
        return (np.log(A), np.log(Pi))

    def _compute_log_priors(self, y: np.ndarray) -> tuple[float, float]:
        count_0 = sum(_y == 0 for _y in y)
        count_1 = sum(_y == 1 for _y in y)
        total = len(y)
        p_0 = count_0 / total
        p_1 = count_1 / total
        log_p_0, log_p_1 = np.log(p_0), np.log(p_1)
        return (log_p_0, log_p_1)

    def _calculate_log_probs_n_priors(self, X: np.ndarray, y: np.ndarray) -> None:
        """This calculates the log probabilities for the two (2) models (classes) and log priors.
        It returns (log_A_0, log_A_1), (log_Pi_0, and log_Pi_1) and (log_p_0, log_p_1)."""
        # 1st model
        A_0, Pi_0 = self._initialize_A_n_Pi()
        A_0, Pi_0 = self._compute_counts(
            X=[_data for _data, _y in zip(X, y) if _y == 0],
            A=A_0,
            Pi=Pi_0,
        )
        log_A_0, log_Pi_0 = self._calculate_probabilities(A_0, Pi_0)

        # 2nd model
        A_1, Pi_1 = self._initialize_A_n_Pi()
        A_1, Pi_1 = self._compute_counts(
            X=[_data for _data, _y in zip(X, y) if _y == 1],
            A=A_1,
            Pi=Pi_1,
        )
        log_A_1, log_Pi_1 = self._calculate_probabilities(A_1, Pi_1)
        log_p_0, log_p_1 = self._compute_log_priors(y)
        self.log_A_s = [log_A_0, log_A_1]
        self.log_Pi_s = [log_Pi_0, log_Pi_1]
        self.log_priors = [log_p_0, log_p_1]
        self.K = len(self.log_priors)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        # log_A_s, log_Pi_s, log_priors = self._calculate_log_probs_n_priors(X, y)
        self._calculate_log_probs_n_priors(X, y)
        return self

    def _compute_log_likelihood(self, input_: list[int], class_: int) -> None:
        """This returns the log of the probabilities."""
        # Extract the log of A and Pi for the given class (label)
        log_A = self.log_A_s[class_]
        log_Pi = self.log_Pi_s[class_]

        # Initialize variables
        prev_idx = None
        log_prob = 0

        for idx in input_:
            # If it's the first token, replace
            # it with the probability from log_Pi.
            if prev_idx is None:
                log_prob += log_Pi[idx]

            # replace it with the probability from log_A.
            else:
                log_prob += log_A[prev_idx, idx]

            # Update the value (for the next iteration)
            prev_idx = idx
        return log_prob

    def predict(self, X: list[list[int]]) -> np.ndarray:
        """This is used to make predictions using the trained Markov model."""
        # Initialize
        predictions = np.zeros(len(X))
        # Make predictions for every sentence in X
        for idx, input_ in enumerate(X):
            posteriors = [
                self._compute_log_likelihood(input_=input_, class_=class_)
                + self.log_priors[class_]
                for class_ in range(self.K)
            ]
            pred = np.argmax(posteriors)
            predictions[idx] = pred
        return predictions

In [87]:
clf = MarkovClassifier(word2idx=word2idx)
clf.fit(X=X_train_vec, y=y_train)

MarkovClassifier()

In [88]:
y_pred_train = clf.predict(X=X_train_vec)
# Calculate the accuracy
np.mean(y_pred_train == y_train)

0.9963842975206612