## Train FastText Model (Use this approach for large datasets)

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,torch,lightning --conda

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

numpy    : 1.26.4
pandas   : 2.2.1
polars   : 0.20.18
torch    : 2.2.2
lightning: 2.2.1

conda environment: torch_p11



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import (
    Any,
    Iterable,
    Iterator,
    Literal,
    Optional,
    TypedDict,
    Union,
)
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
from rich.console import Console
from rich.panel import Panel
from rich.text import Text
from rich.table import Table
from rich import box
from rich.theme import Theme

custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)

console = Console(theme=custom_theme)

In [4]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor directory at the front of ``sys.path`` and print it.

    Params:
    -------
    go_up: int, default=1
        How many directory levels to climb; one ``"../"`` segment is used
        per level.

    Returns:
    --------
    None
    """
    import os
    import sys

    # Build the relative hop string, e.g. "../../" for go_up=2.
    relative_hops: str = "../" * go_up

    # dirname() of a module name without a path separator is "", so the hops
    # end up being resolved by abspath() against the working directory.
    target_dir = os.path.abspath(
        os.path.join(os.path.dirname(__name__), relative_hops)
    )

    # Highest import priority, then echo the resolved location.
    sys.path.insert(0, target_dir)
    print(target_dir)

In [5]:
go_up_from_current_directory(go_up=1)

from helper_func.utilities import (
    create_wordcloud,
    fancy_print,
    infer_stopwords,
    tokenize_by_special_chars,
)
from helper_func.custom_pipelines import (
    LsiTransformer,
    TokenizeTransformer,
    DictionaryTransformer,
    TfidfTransformer,
)


# Example usage
console.print("hello, world!")
fancy_print({"message": "Good morning!", "name": "John"})

/Users/neidu/Desktop/Projects/Indicina/Ad_hoc_projects/notebook/Decide-modelling


### Load Data

In [6]:
fp: str = "../data/all_dataset.parquet"
threshold: int = 10

# Read the raw table and report its shape before any filtering.
data: pl.DataFrame = pl.read_parquet(fp)
fancy_print(data.shape)

# Add the description length, then keep only rows whose description has at
# least `threshold` characters.
data = data.with_columns(
    char_length=pl.col("description").str.len_chars()
).filter(pl.col("char_length") >= threshold)


data.head()

id,date,description,amount,label,tag,char_length
str,date,str,f64,list[str],str,u32
"""uba_2024_407438997900""",2020-04-27,"""N-10062410751/APR 2020 Salary - 241429 from Bauchi""",37771.49,"[""Formal-Income""]","""Formal-Income""",50
"""uba_2024_799102515044""",2021-05-04,"""DAYI SAL PYMT/OMEH SUNDAY O.""",54400.0,"[""Formal-Income""]","""Formal-Income""",28
"""uba_2024_841763787128""",2020-08-21,"""UPAY/DSS AUG 20 SAL""",43831.61,"[""Formal-Income""]","""Formal-Income""",19
"""uba_2024_321090584398""",2020-07-28,"""COP08H1XU314384:NIG ARMY JULY 2020 SAL A""",60341.84,"[""Formal-Income""]","""Formal-Income""",40
"""uba_2024_235875690847""",2020-08-26,"""TNF-NAF INVESTMENTS LTD/NAFIL SG CIV SAL 833""",25000.0,"[""Formal-Income""]","""Formal-Income""",44


In [7]:
data.describe()

statistic,id,date,description,amount,label,tag,char_length
str,str,str,str,f64,f64,str,f64
"""count""","""653273""","""653273""","""653273""",653273.0,653273.0,"""653273""",653273.0
"""null_count""","""0""","""0""","""0""",0.0,0.0,"""0""",0.0
"""mean""",,"""2022-01-28""",,50905.579721,,,49.875204
"""std""",,,,1096000.0,,,19.628963
"""min""","""0020ff5f7266a36c17f3c79001bde9dd785f2eaa90e10e00faf8b3988da0e9e233e45b4565c2a7c3166c4b61346195d27b8c084f74b2fc6c0c341696f518604c""","""1971-05-20""",""" AUG 2021 SAL BO NULGE BYS None""",0.0,,"""Formal-Income""",10.0
"""25%""",,"""2020-12-29""",,2000.0,,,39.0
"""50%""",,"""2021-07-23""",,6000.0,,,49.0
"""75%""",,"""2023-06-26""",,25000.0,,,53.0
"""max""","""uba_2024_999806993212""","""2029-01-23""","""| ADE MICHEAL LALA""",722500000.0,,"""No-Income""",229.0


In [8]:
corpus: list[str] = data["description"].to_list()
corpus[:2]

['N-10062410751/APR 2020 Salary - 241429 from Bauchi',
 'DAYI SAL PYMT/OMEH SUNDAY O.']

In [9]:
from typing import TypedDict
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline

<br>

## FastText

- `FastText` is a popular word embedding technique that offers two training methods: 
  - Skip-gram
  - Continuous Bag-of-Words (CBOW).

- Both methods aim to learn vector representations for words that capture semantic and syntactic relationships.

### Key Differences Between Skip-Gram and CBOW

**Prediction Direction**:
- `CBOW`: This model predicts the `target word` based on its surrounding context words. For example, given the context words, CBOW tries to guess the central word.
- `Skip-Gram`: In contrast, Skip-Gram predicts the `context words` based on a given target word. For example, given a specific word, Skip-Gram tries to predict the words that are likely to appear around it.

**Training Speed**:
- `CBOW` is generally faster to train than Skip-Gram because it has a simpler prediction task. It averages the context word vectors to predict the target word.
- `Skip-Gram` can be slower since it predicts multiple context words for each target word, which increases computational complexity.

**Handling Rare Words**:
- `Skip-Gram` performs better with `rare words`. It is less sensitive to overfitting frequent words, allowing it to capture the semantics of less common terms effectively.
- `CBOW` tends to perform better with `frequent words`, as it averages the context, which can smooth out the representation of rare words.

**Use Cases**:
- `Skip-Gram` is often preferred for tasks where `understanding the context of less frequent words is crucial`, such as in semantic similarity tasks, sentiment analysis, and text generation.
- `CBOW` is suitable for `applications where speed is essential, and the focus is on more common words`, such as in real-time applications.

In [10]:
from dataclasses import dataclass, field
from gensim import utils


@dataclass
class MyIter:
    """
    Re-iterable stream of tokenized lines read from a corpus file.

    The file is opened inside ``__iter__``, so every new iteration over the
    object reads the corpus again from the start.

    Attributes
    ----------
    corpus_file : str
        Location of the corpus file, supplied by the caller.
    path : str
        Copy of ``corpus_file`` set in ``__post_init__``; not accepted by the
        constructor.

    Methods
    -------
    __post_init__()
        Copy ``corpus_file`` into ``path``.
    __iter__()
        Yield one token list per line of the corpus file.
    """

    corpus_file: str
    path: str = field(default_factory=str, init=False)

    def __post_init__(self) -> None:
        """
        Copy ``corpus_file`` into ``path`` once the dataclass is built.
        """
        self.path = self.corpus_file

    def __iter__(self) -> Iterator[list[str]]:
        """
        Open the corpus file and tokenize it line by line.

        Yields
        ------
        list[str]
            The result of ``tokenize_by_special_chars(line, drop_digits=True)``
            for each line of the file.
        """
        with utils.open(self.path, "r", encoding="utf-8") as handle:
            yield from (
                tokenize_by_special_chars(raw_line, drop_digits=True)
                for raw_line in handle
            )

In [11]:
# NOTE(review): this import rebinds the name `MyIter`, so the object created
# below comes from helper_func.utilities and not from the dataclass defined in
# the previous cell. Confirm both implementations are the same, or keep one.
from helper_func.utilities import MyIter


# Wrap the corpus file in an iterable; nothing is read at construction time
# in the notebook-local dataclass (presumably the same holds for the imported
# class -- verify).
processed_corpus: MyIter = MyIter("corpus.txt")

processed_corpus

MyIter(corpus_file='corpus.txt', path='corpus.txt')

### Model Training

#### For Small Corpus

```python
# Train FastText model (Use this approach for small datasets)
model = FastText(**model_config)

```
<hr>

#### For Large Corpus (Recommended)

- The model needs the `total_words` parameter in order to manage the training rate (alpha) correctly, and to give accurate progress estimates.
- The example below relies on an implementation detail: the `build_vocab()` method sets the `corpus_total_words` (and also `corpus_count`) model attributes.

```python
from gensim.test.utils import datapath


# corpus_file = datapath('lee_background.cor')  # absolute path to corpus
model_1 = FastText(vector_size=4, window=3, min_count=1)
model_1.build_vocab(corpus_file=corpus_file)  # scan over corpus to build the vocabulary
total_words = model_1.corpus_total_words  # number of words in the corpus
model_1.train(corpus_file=corpus_file, total_words=total_words, epochs=5)


# OR Using your custom corpus iterator
from gensim.utils import tokenize
from gensim import utils


class MyIter:
    def __iter__(self):
        path = datapath('crime-and-punishment.txt')
        with utils.open(path, 'r', encoding='utf-8') as fin:
            for line in fin:
                yield list(tokenize(line))


model_2 = FastText(vector_size=4, window=3, min_count=1)
model_2.build_vocab(corpus_iterable=MyIter())
total_examples = model_2.corpus_count
model_2.train(corpus_iterable=MyIter(), total_examples=total_examples, epochs=5)
```

<hr>

#### Save And Load Model

```python
from gensim.test.utils import get_tmpfile

fname = get_tmpfile("fasttext.model")

# Save model
model.save(fname)

# Load model
model = FastText.load(fname)
```

#### Update The Model

- Once loaded, such models behave identically to those created from scratch. For example, you can continue training the loaded model.
- It's **IMPORTANT** to call the `build_vocab()` method with `update=True` before the `train()` method when continuing training. Without this call, previously unseen terms will not be added to the vocabulary.

```python
import numpy as np

'computation' in model.wv.key_to_index  # New word, currently out of vocab
# Output: False

old_vector = np.copy(model.wv['computation'])  # Grab the existing vector
new_sentences = [
    ['computer', 'aided', 'design'],
    ['computer', 'science'],
    ['computational', 'complexity'],
    ['military', 'supercomputer'],
    ['central', 'processing', 'unit'],
    ['onboard', 'car', 'computer'],
]

model.build_vocab(new_sentences, update=True)  # Update the vocabulary
model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)

new_vector = model.wv['computation']
np.allclose(old_vector, new_vector, atol=1e-4)  # Vector has changed, model has learnt something
# Output: False

'computation' in model.wv.key_to_index  # Word is still out of vocab
# Output: False

```

In [12]:
class FastTextConfig(TypedDict):
    sentences: list[list[str]] | Iterable[list[str]] | Iterator[list[str]] | None
    vector_size: int  # Dimensionality of the feature vectors.
    min_count: int  # Ignores all words with total frequency lower than this.
    workers: int  # Number of threads to run in parallel.
    sg: int  # Training algorithm: skip-gram if sg=1, otherwise sg=0 for CBOW.


model_config: FastTextConfig = {
    "sentences": None,
    "vector_size": 200,
    "min_count": 5,
    "workers": 4,
    "sg": 1,
}
model_config

{'sentences': None, 'vector_size': 200, 'min_count': 5, 'workers': 4, 'sg': 1}

#### Train Model

In [13]:
import time


# Train FastText model (Use this approach for large datasets)
# perf_counter() is a monotonic clock, so the measured duration is not
# affected by system clock adjustments during a long training run.
start_time: float = time.perf_counter()

# `sentences` is None in model_config, so the constructor only stores the
# hyper-parameters; vocabulary building and training are the explicit steps
# below, each streaming over the corpus iterable.
model = FastText(**model_config)
model.build_vocab(corpus_iterable=processed_corpus)
total_examples = model.corpus_count
# Reuse the epoch count held by the model instead of a separate literal, so
# train() stays in step with the constructor configuration.
model.train(
    corpus_iterable=processed_corpus,
    total_examples=total_examples,
    epochs=model.epochs,
)
end_time: float = time.perf_counter()

exec_time: float = (end_time - start_time) / 60
fancy_print(f"Training completed in {exec_time:.2f} minutes.")

### Save And Load Model

In [14]:
from gensim.test.utils import get_tmpfile


fname: str = "../models/fasttext/transactions_emb.model"

# Create the destination folder first: saving does not create missing parent
# directories, so this cell would otherwise fail on a fresh checkout.
Path(fname).parent.mkdir(parents=True, exist_ok=True)

# Save model
model.save(fname)
fancy_print("Model successfully saved!")

# Load model (round-trip through disk to confirm the saved file is usable)
model = FastText.load(fname)
fancy_print("Model successfully loaded!")

### [Save Model As KeyedVectors](https://radimrehurek.com/gensim/models/keyedvectors.html#why-use-keyedvectors-instead-of-a-full-model)

In [15]:
kv_path: str = "../models/fasttext/keyVector/vectors.kv"

# Make sure the nested output folder exists before writing to it.
Path(kv_path).parent.mkdir(parents=True, exist_ok=True)

# Persist only the vectors (`model.wv`), not the full trainable model.
word_vectors = model.wv
word_vectors.save(kv_path)
fancy_print("Model successfully saved!")

<br>

### Test The Model

In [16]:
# Get embeddings for in-vocabulary words
word_in_vocab: str = "salary"
# Indexing `model.wv` with a word returns its embedding vector (an array),
# not a string.
embedding_in_vocab: np.ndarray = model.wv[word_in_vocab]

print(f"{word_in_vocab!r} in vocabulary: {word_in_vocab in model.wv.key_to_index}")
print(f"Embedding for '{word_in_vocab}': {embedding_in_vocab[:2]} ...")
print("=====" * 20)

# Get embeddings for out-of-vocabulary words
# The lookup below still yields a vector even though the word is absent from
# `key_to_index` (presumably composed from character n-grams -- see the
# gensim FastText documentation).
word_out_of_vocab: str = "unSeenWord"
embedding_out_of_vocab: np.ndarray = model.wv[word_out_of_vocab]
print(
    f"\n{word_out_of_vocab!r} in vocabulary: {word_out_of_vocab in model.wv.key_to_index}"
)
print(f"Embedding for '{word_out_of_vocab}': {embedding_out_of_vocab[:2]} ...")

'salary' in vocabulary: True
Embedding for 'salary': [-0.5412  0.2769] ...

'unSeenWord' in vocabulary: False
Embedding for 'unSeenWord': [0.0387 0.0574] ...


In [17]:
cosine_similarity([embedding_in_vocab, embedding_out_of_vocab])

array([[1.    , 0.3782],
       [0.3782, 1.    ]], dtype=float32)

In [18]:
from helper_func.custom_pipelines import FastTextTransformer


pipe: Pipeline = Pipeline(
    [
        ("tokenizer", TokenizeTransformer(drop_digits=True)),
        ("embeddings", FastTextTransformer(model=model)),
    ]
)

pipe

In [19]:
text_1: str = "DECEMBER SAL | MTN NG"
text_2: str = "JANUARY SALARY FROM YETS LTD"
text_3: str = "JULY SALARY FROM YETS LTD"
text_4: str = "LAFARGE AFRICA PLC/9200107390230520241451447"


input_corpus: list[str] = [text_1, text_2, text_3, text_4]
result_array: np.ndarray = pipe.fit_transform(input_corpus)

cosine_similarity(result_array)

array([[1.    , 0.7614, 0.7624, 0.6316],
       [0.7614, 1.    , 0.9722, 0.6359],
       [0.7624, 0.9722, 1.    , 0.6268],
       [0.6316, 0.6359, 0.6268, 1.    ]], dtype=float32)

In [20]:
%%time

input_corpus: list[str] = [
    "MAT-ICE MARCH 2020 SAL PYMT",
    "MARCH SALARY/ETUK MICHAEL JAME",
    "AfeMarch/March2024 SalyYY/6491690/  LANG",
    "AfeMayHo/May2024 SalyYY/6539959/    LANG",
    "TRF/May salary : Hodewu /TO OLUWASEGUN HEZEKIAH HODEWU FROM TOP ST JOHNS BRIDGE SCHOOLS ",
    "TRF/Akindele Stephen A/FRM TOP ST JOHNS BRIDGE SCHOOLS TO ADAM ADEGBINDIN- C03 ",
]
result_array: np.ndarray = pipe.fit_transform(input_corpus)

cosine_similarity(result_array)

CPU times: user 3.47 ms, sys: 3.08 ms, total: 6.55 ms
Wall time: 13.6 ms


array([[1.    , 0.7161, 0.8028, 0.7841, 0.7299, 0.5857],
       [0.7161, 1.    , 0.7685, 0.7178, 0.8173, 0.679 ],
       [0.8028, 0.7685, 1.    , 0.9451, 0.7611, 0.6249],
       [0.7841, 0.7178, 0.9451, 1.    , 0.767 , 0.639 ],
       [0.7299, 0.8173, 0.7611, 0.767 , 1.    , 0.9019],
       [0.5857, 0.679 , 0.6249, 0.639 , 0.9019, 1.    ]], dtype=float32)

## Compress the FastText model

In [21]:
# Vocabulary size
vocabulary_size = len(model.wv.key_to_index)

print(f"Vocabulary size: {vocabulary_size:,}")
new_vocab_size: int = int(0.8 * vocabulary_size)
print(f"80% of the vocabulary is: {new_vocab_size:,}")

Vocabulary size: 32,463
80% of the vocabulary is: 25,970


In [22]:
from gensim.models.fasttext import FastTextKeyedVectors
import compress_fasttext

In [23]:
# Load keyed vectors model
big_model = FastTextKeyedVectors.load(kv_path)
N: int = 1  # Scale down the vector size by a factor of N

# Read the dimensionality from the vectors that are actually being compressed,
# so this cell works from the saved KeyedVectors file alone.
prev_vector_size: int = big_model.vector_size

# Reject factors that would make the divisions below fail with an opaque
# ZeroDivisionError (N == 0, or N larger than the vector size).
if not 1 <= N <= prev_vector_size:
    raise ValueError(f"N must be between 1 and {prev_vector_size}, got {N}")

# NOTE(review): this value is passed as `qdim` to prune_ft_freq in the next
# cell -- confirm whether `qdim` is the output dimensionality or the number of
# product-quantization sub-vectors.
new_vector_size: int = prev_vector_size // N
if prev_vector_size % new_vector_size != 0:
    raise ValueError("Cannot scale down vector size by a factor of N")

print(f"{prev_vector_size = }")
print(f"{new_vector_size = }")

prev_vector_size = 200
new_vector_size = 200


In [24]:
quant_model_path: str = "../models/quant_fasttext/transactions_emb.model"

# Prune the vocabulary to `new_vocab_size` entries and apply product
# quantization (`pq=True`) with `qdim=new_vector_size`.
small_model = compress_fasttext.prune_ft_freq(
    big_model,
    new_vocab_size=new_vocab_size,
    pq=True,
    qdim=new_vector_size,
)

# Create the output folder first so the save does not fail when it is missing.
Path(quant_model_path).parent.mkdir(parents=True, exist_ok=True)
small_model.save(quant_model_path)
fancy_print("Model successfully saved!")

In [25]:
# Compare similarity of vectors generated by the two models
cosine_similarity(
    small_model["salary"].reshape(1, -1), model.wv["salary"].reshape(1, -1)
)

array([[0.999]], dtype=float32)

In [26]:
type(small_model)

compress_fasttext.compress.CompressedFastTextKeyedVectors

In [27]:
from compress_fasttext.compress import CompressedFastTextKeyedVectors
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Any, List


class QuantizedFastTextTransformer(BaseEstimator, TransformerMixin):
    """
    A transformer that uses compressed FastText embeddings to convert
    tokenized documents into fixed-size vector representations.

    Each document is represented by the mean of its word vectors. A document
    with no tokens is represented by a zero vector, so the output always has
    one row of length ``model.vector_size`` per input document.

    Parameters
    ----------
    model : CompressedFastTextKeyedVectors
        The pre-trained compressed FastText model.

    Attributes
    ----------
    model : CompressedFastTextKeyedVectors
        The loaded compressed FastText model.
    """

    def __init__(self, model: CompressedFastTextKeyedVectors) -> None:
        self.model: CompressedFastTextKeyedVectors = model

    def get_embeddings(self, document: List[str]) -> np.ndarray:
        """
        Compute the average embedding for a given document.

        Parameters
        ----------
        document : List[str]
            A list of words representing the document.

        Returns
        -------
        np.ndarray
            The average embedding vector for the document, or a zero vector
            when the document contains no words.
            Shape: (embedding_size,)
        """
        word_embeddings: List[np.ndarray] = [self.model[word] for word in document]
        if not word_embeddings:
            # np.mean([]) would return a scalar NaN, which breaks the 2-D
            # shape of the array built in transform(). An empty token list is
            # possible when the tokenizer drops every token of a document.
            return np.zeros(self.model.vector_size, dtype=np.float32)
        document_embedding: np.ndarray = np.mean(word_embeddings, axis=0)
        return document_embedding

    def fit(
        self, X: List[List[str]], y: Any | None = None
    ) -> "QuantizedFastTextTransformer":
        """
        Fit the transformer to the data (no-op; the model is pre-trained).

        Parameters
        ----------
        X : List[List[str]]
            The input samples, each a list of words; same input as
            ``transform``. Not inspected.
        y : Any | None, optional
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        QuantizedFastTextTransformer
            The transformer itself, unchanged.
        """
        return self

    def transform(self, X: List[List[str]]) -> np.ndarray:
        """
        Transform the input documents into their FastText embeddings.

        Parameters
        ----------
        X : List[List[str]]
            The input samples, each a list of strings representing words in a document.

        Returns
        -------
        np.ndarray
            The FastText embeddings for each input document.
            Shape: (n_samples, embedding_size)
        """
        embeddings: np.ndarray = np.array([self.get_embeddings(doc) for doc in X])
        return embeddings

In [28]:
compr_pipe: Pipeline = Pipeline(
    [
        ("tokenizer", TokenizeTransformer(drop_digits=True)),
        ("embeddings", QuantizedFastTextTransformer(model=small_model)),
    ]
)

compr_pipe

In [29]:
%%time

input_corpus: list[str] = [
    "MAT-ICE MARCH 2020 SAL PYMT",
    "MARCH SALARY/ETUK MICHAEL JAME",
    "AfeMarch/March2024 SalyYY/6491690/  LANG",
    "AfeMayHo/May2024 SalyYY/6539959/    LANG",
    "TRF/May salary : Hodewu /TO OLUWASEGUN HEZEKIAH HODEWU FROM TOP ST JOHNS BRIDGE SCHOOLS ",
    "TRF/Akindele Stephen A/FRM TOP ST JOHNS BRIDGE SCHOOLS TO ADAM ADEGBINDIN- C03 ",
]
result_array: np.ndarray = compr_pipe.fit_transform(input_corpus)

cosine_similarity(result_array)

CPU times: user 4.35 ms, sys: 2.96 ms, total: 7.31 ms
Wall time: 7.56 ms


array([[1.    , 0.722 , 0.8031, 0.7855, 0.7331, 0.5908],
       [0.722 , 1.    , 0.7678, 0.7196, 0.82  , 0.684 ],
       [0.8031, 0.7678, 1.    , 0.9437, 0.7635, 0.6332],
       [0.7855, 0.7196, 0.9437, 1.    , 0.7696, 0.6473],
       [0.7331, 0.82  , 0.7635, 0.7696, 1.    , 0.9032],
       [0.5908, 0.684 , 0.6332, 0.6473, 0.9032, 1.    ]], dtype=float32)