# Text Classification Using Probalistic Models

* Markok Models

In [1]:
# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd

import spacy

# Sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin


# Custom imports
from src.text_summarizer import Tokenizer


# Built-in library
import itertools
import re
import json
from typing import Union, Optional, Any
from pathlib import Path
import logging
import warnings

warnings.filterwarnings("ignore")

# pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

### Download The Data.

In [2]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt

File ‘edgar_allan_poe.txt’ already there; not retrieving.



In [3]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

File ‘robert_frost.txt’ already there; not retrieving.



In [4]:
def extract_text_data(*, filepath: Path, label: int) -> pd.DataFrame:
    """This loads the text data, assigns a label and returns a DF."""
    with open(filepath, "r") as file:
        # Remove empty lines: using len(line) > 5
        data = [(line.strip(), label) for line in file.readlines() if len(line) > 5]
    # Convert to DF
    df = pd.DataFrame(data=data, columns=["text", "label"])
    return df


def remove_punctuations(text: str) -> str:
    import string

    """This returns the text without punctuations"""
    cleaned_text = re.compile(pattern=f"[{re.escape(string.punctuation)}]").sub(
        repl="", string=str(text)
    )
    return cleaned_text

In [5]:
fp = "edgar_allan_poe.txt"
fp1 = "robert_frost.txt"
label_0, label_1 = 0, 1

edgar_allan_poe = extract_text_data(filepath=fp, label=label_0)
robert_frost = extract_text_data(filepath=fp1, label=label_1)

edgar_allan_poe.head()

Unnamed: 0,text,label
0,LO! Death hath rear'd himself a throne,0
1,"In a strange city, all alone,",0
2,Far down within the dim west,0
3,"Where the good, and the bad, and the worst, and the best,",0
4,Have gone to their eternal rest.,0


In [6]:
robert_frost.head()

Unnamed: 0,text,label
0,"Two roads diverged in a yellow wood,",1
1,And sorry I could not travel both,1
2,"And be one traveler, long I stood",1
3,And looked down one as far as I could,1
4,To where it bent in the undergrowth;,1


In [7]:
RANDOM_STATE = 2

data = pd.concat([edgar_allan_poe, robert_frost], axis="rows").reset_index(drop=True)
data.sample(n=10, random_state=RANDOM_STATE)

Unnamed: 0,text,label
1703,Not bluebells gracing a tunnel mouth-,1
1275,Of door and headboard. Where it wants to get,1
294,Hath ever told- or is it of a thought,0
551,To the Lethean peace of the skies-,0
426,"Then desolately fall,",0
662,"That from new fountains overflow,",0
2055,'What's this?',1
611,Of its own fervour - what had o'er it power.,0
1303,"The way a man with one leg and a crutch,",1
1641,'How shall we?',1


In [8]:
# Remove punctuations
data = data.assign(text=data["text"].apply(remove_punctuations))

data.sample(n=10, random_state=RANDOM_STATE)

Unnamed: 0,text,label
1703,Not bluebells gracing a tunnel mouth,1
1275,Of door and headboard Where it wants to get,1
294,Hath ever told or is it of a thought,0
551,To the Lethean peace of the skies,0
426,Then desolately fall,0
662,That from new fountains overflow,0
2055,Whats this,1
611,Of its own fervour what had oer it power,0
1303,The way a man with one leg and a crutch,1
1641,How shall we,1


### Split The Data Into Train And Validation Sets

In [9]:
X = data["text"]
y = data["label"]

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.33, random_state=RANDOM_STATE
)

X_train.shape, X_valid.shape

((1441,), (711,))

### Create Bag of Words (word2idx)

In [14]:
# Add an index for unknown words. This will be used if the word
# is present in the validation set but NOT in the train set.
word2idx = {"<unk>": 0}
idx = 1  # Initialize the index

# Populate word2idx using the train set. Tokenize each sentence,
# if the token isn't in word2idx, add it and increment the idx.
for txt_ in X_train:
    tokenizer = Tokenizer()
    tokenized_text = tokenizer(doc=txt_)
    for token in tokenized_text:
        if token not in word2idx:
            word2idx[token] = idx
            idx += 1

In [15]:
word2idx

{'<unk>': 0,
 'there': 1,
 'is': 2,
 'our': 3,
 'wildest': 4,
 'mount': 5,
 'a': 6,
 'headless': 7,
 'horse': 8,
 'i': 9,
 'guess': 10,
 'he': 11,
 'found': 12,
 'got': 13,
 'more': 14,
 'out': 15,
 'of': 16,
 'me': 17,
 'it': 18,
 'lifts': 19,
 'gaunt': 20,
 'luxuriating': 21,
 'beast': 22,
 'by': 23,
 'the': 24,
 'door': 25,
 'legended': 26,
 'tomb': 27,
 'thus': 28,
 'pacified': 29,
 'psyche': 30,
 'and': 31,
 'kissed': 32,
 'her': 33,
 'happy': 34,
 'flowers': 35,
 'repining': 36,
 'trees': 37,
 'since': 38,
 'was': 39,
 'no': 40,
 'other': 41,
 'way': 42,
 'to': 43,
 'look': 44,
 'thrown': 45,
 'away': 46,
 'both': 47,
 'united': 48,
 'strengths': 49,
 'do': 50,
 'stand': 51,
 'together': 52,
 'on': 53,
 'craters': 54,
 'verge': 55,
 'with': 56,
 'whose': 57,
 'vast': 58,
 'wheels': 59,
 'think': 60,
 'know': 61,
 'country': 62,
 'now': 63,
 'mother': 64,
 'yes': 65,
 'we': 66,
 'could': 67,
 'too': 68,
 'son': 69,
 'tell': 70,
 'truth': 71,
 'for': 72,
 'once': 73,
 'receipted': 

### Vectorize The Train And Validation Set

In [19]:
def vectorize_text(*, word_2_idx: dict, data: pd.Series) -> list[list[int]]:
    """This is used to convert text to vectors."""
    vectorized_doc = []
    data = data.copy()

    for txt_ in data:
        tokenizer = Tokenizer()
        tokenized_text = tokenizer(doc=txt_)
        # Extract the index from the dict. i.e convert each token to int
        sent_idx = [word_2_idx.get(token, 0) for token in tokenized_text]
        vectorized_doc.append(sent_idx)
    return vectorized_doc

In [23]:
#  Vectorize the data
X_train_vec = vectorize_text(word_2_idx=word2idx, data=X_train)
X_valid_vec = vectorize_text(word_2_idx=word2idx, data=X_valid)

X_train_vec[:4]

[[1, 2, 3, 4, 5, 6, 7, 8],
 [9, 10, 11, 12, 11, 13, 14, 15, 16, 17],
 [18, 19, 6, 20, 21, 22],
 [23, 24, 25, 16, 6, 26, 27]]

### Initialize State Transition Matrix (A) and Initial State Distribution (Pi)

1. **A**: This is a matrix (2-D array) that's used to store the probability that a state at time ***`t`*** is ***`j`***, given that the state at time ***`t-1`*** was ***`i`***. i.e the probability of a **state** given the **previous state**.

$$
A_{ij} = p(s_{t} = j | s_{t-1} = i) 
$$

>Estimated **`A`** is the number of times we transition from state ***`i`*** to state ***`j`*** divided by the total number of times we were in state ***`i`***.

$$
\hat{A}_{ij} = \frac{count(i \rightarrow j)}{count(i)}
$$

2. **Pi**: This is a vector that's used to store the probability of the **inital state** in a **sequence**.

$$
Pi_{i} = p(s_{1} = i) 
$$

>Estimated **`Pi`** is the number of times the sequence started at state ***`i`*** divided by the number of sequences in the dataset, ***`N`***.

$$
\hat{Pi}_{i} = \frac{count(s_{1} = i)}{N}
$$

[[239, 0, 0, 6, 0, 0],
 [16, 25, 31, 941, 94, 18, 0, 43, 106],
 [483, 615, 1916, 223, 2, 18, 16, 6, 583],
 [43, 24, 2049, 2050, 16, 24, 394],
 [301, 0, 2037],
 [103, 78, 788, 0, 0],
 [350, 175, 86],
 [16, 226, 284, 0, 413, 350, 127, 1126, 18, 1678],
 [24, 42, 6, 245, 56, 218, 0, 31, 6, 0],
 [190, 1047, 66],
 [31, 94, 175, 170],
 [11, 0, 39, 939, 0, 46],
 [94, 24, 1627, 1930, 1377, 24, 2052],
 [353, 89, 1552, 354, 76, 24, 355],
 [94, 524, 896, 104, 6, 181],
 [24, 1198, 478, 1869, 24, 1089, 16, 0],
 [329, 151, 191, 50, 128, 151, 66, 516, 761, 16, 112],
 [31, 227, 1120, 1477, 35, 424, 463],
 [23, 415, 11, 0, 31, 0],
 [198, 9, 61, 63, 86, 1374, 1375, 16, 1232],
 [478, 2036, 246, 2037, 78, 412, 31, 435],
 [478, 2116, 6, 0, 23],
 [309, 6, 313, 0, 16, 744],
 [94, 40, 1607, 0, 43, 1938, 1367, 53, 0],
 [1014, 889, 239, 912, 0, 78, 33, 0],
 [31, 9, 873, 413, 18, 39, 928, 929],
 [151, 83, 17, 272, 18, 53, 2333],
 [1455, 9, 1622, 24, 135, 72, 1023, 531],
 [345, 253, 309, 24, 2017, 16, 24, 919],
 [