<a href="https://colab.research.google.com/github/dishitasood/google-deepmind-ai-research/blob/main/Preparing_a_Datset_for_Training_an_SLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install "git+https://github.com/google-deepmind/ai-foundations.git@main"

import re # Used for splitting strings on spaces.

# Packages used.
import pandas as pd # For reading the dataset.
import textwrap # For adding linebreaks to paragraphs.

# For providing feedback.
from ai_foundations.feedback.course_1 import slm

### Loading and Tokenization the dataset

In [2]:
africa_galore = pd.read_json(
    "https://storage.googleapis.com/dm-educational/assets/ai_foundations/africa_galore.json"
)

dataset = africa_galore['description']
print(f"Loaded Africa Galore dataset: {len(dataset)} paragraphs")
print(f"\nFirst Paragraph:")
print(textwrap.fill(dataset[0]))

Loaded Africa Galore dataset: 232 paragraphs

First Paragraph:
The Lagos air was thick with humidity, but the energy in the club was
electric. The band launched into a hypnotic Afrobeat groove, the drums
pounding out a complex polyrhythm, the horns blaring a soaring melody,
and the bass laying down a deep, funky foundation. A woman named Imani
moved effortlessly to the music, her body swaying in time with the
rhythm. The music seemed to flow through her, a powerful current of
energy and joy. All around her, people were dancing, singing, and
clapping, caught up in the infectious rhythm. The music was more than
just entertainment; it was a celebration of life, a connection to
their shared heritage, a vibrant expression of the soul of Lagos.


In [3]:
def space_tokenize(text: str) -> list[str]:
    """Splits a string into a list of tokens.

    Splits text on space.

    Args:
        text: The input text.

    Returns:
        A list of tokens. Returns empty list if text is empty or all spaces.
    """
    # Use `re` package so that splitting on multiple spaces also works.
    tokens = re.split(r" +", text)
    return tokens


# Tokenize an example text with the `space_tokenize` function.
space_tokenize("Kanga, a colorful printed cloth is more than just a fabric.")

['Kanga,',
 'a',
 'colorful',
 'printed',
 'cloth',
 'is',
 'more',
 'than',
 'just',
 'a',
 'fabric.']

### Build a list of tokens in the dataset

In [4]:
tokens = []
for paragraph in dataset:
  for token in space_tokenize(paragraph):
    tokens.append(token)
print("The total number of tokens are: ", len(tokens))

The total number of tokens are:  19065


### Build the vocabulary

In [9]:
def build_vocabulary(tokens: list[str]) -> list[str]:
  vocabulary = list(set(tokens)) # there are no duplicate values in a set
  return vocabulary

In [6]:
slm.test_build_vocabulary(build_vocabulary)

✅ Nice! Your answer looks correct.


In [7]:
vocabulary = build_vocabulary(tokens)

vocabulary_size = len(vocabulary)

print(
    "Total number of unique tokens in the Africa Galore dataset:"
    f" {vocabulary_size:,}"
)


Total number of unique tokens in the Africa Galore dataset: 5,260


In [8]:
vocabulary[:30]

['',
 'Algeria,',
 'zesty',
 "'king",
 'values',
 'Older',
 'curries',
 'fats.',
 'juicy',
 'congregate',
 'As',
 'Laila',
 'men,',
 'When',
 'our',
 'smell',
 'mild,',
 'news,',
 'Senegalese',
 'driest',
 'side,',
 'distant',
 'liquid.',
 'footsteps',
 'furrowed',
 'Its',
 '(79',
 'properly,',
 'rice,',
 'multiple']

### Build token_to_index

In [12]:
token_to_index = {}

for index, token in enumerate(vocabulary):
  token_to_index[token] = index

In [13]:
index_to_token = {}
for token, index in token_to_index.items():
    index_to_token[index] = token

In [14]:
slm.test_index_to_token(index_to_token, vocabulary)

✅ Nice! Your answer looks correct.


In [15]:
print("token_to_index:\n")

count = 0
first_ten_indices = []
for token, token_id in token_to_index.items():
    print(f"'{token}': {token_id}")
    first_ten_indices.append(token_id)
    count += 1
    if count == 10:
        break

print("\n\n")
print("index_to_token:\n")
for token_id in first_ten_indices:
    print(f"{token_id}: '{index_to_token[token_id]}'")

token_to_index:

'': 0
'Algeria,': 1
'zesty': 2
''king': 3
'values': 4
'Older': 5
'curries': 6
'fats.': 7
'juicy': 8
'congregate': 9



index_to_token:

0: ''
1: 'Algeria,'
2: 'zesty'
3: ''king'
4: 'values'
5: 'Older'
6: 'curries'
7: 'fats.'
8: 'juicy'
9: 'congregate'


### Encode and Decode Functions

In [16]:

def encode(text: str) -> list[int]:
    """Encodes a text sequence into a list of indices based on the vocabulary.

    Args:
        text: The input text to be encoded.

    Returns:
        A list of indices corresponding to the tokens in the input text.
    """

    # Convert tokens into indices.
    indices = []
    for token in space_tokenize(text):
        token_index = token_to_index.get(token)
        indices.append(token_index)

    return indices


def decode(indices: int | list[int]) -> list[str]:
    """Decodes a list (or single index) of integers back into tokens.

    Args:
        indices: A single index or a list of indices to be decoded into tokens.

    Returns:
        str: A string of decoded tokens corresponding to the input indices.
    """

    # If a single integer is passed, convert it into a list.
    if isinstance(indices, int):
        indices = [indices]

    # Map indices to tokens.
    tokens = []
    for index in indices:
        token = index_to_token.get(index)
        tokens.append(token)

    # Join the decoded tokens into a single string.
    return " ".join(tokens)

In [17]:
text = dataset[0]
print(text)

The Lagos air was thick with humidity, but the energy in the club was electric. The band launched into a hypnotic Afrobeat groove, the drums pounding out a complex polyrhythm, the horns blaring a soaring melody, and the bass laying down a deep, funky foundation. A woman named Imani moved effortlessly to the music, her body swaying in time with the rhythm. The music seemed to flow through her, a powerful current of energy and joy. All around her, people were dancing, singing, and clapping, caught up in the infectious rhythm. The music was more than just entertainment; it was a celebration of life, a connection to their shared heritage, a vibrant expression of the soul of Lagos.


In [18]:
encode(text)[:10]

[2289, 2873, 1198, 2852, 84, 4303, 183, 2199, 2415, 3629]

In [19]:
decode(encode(text)[:10])

'The Lagos air was thick with humidity, but the energy'

In [20]:
class SimpleWordTokenizer:
    """A simple word tokenizer that can be initialized with a corpus of texts
       or using a provided vocabulary list.

    The tokenizer splits the text sequence based on spaces,
    using the `encode` method to convert the text into a sequence of indices
    and the `decode` method to convert indices back into text.

    Typical usage example:

        corpus = "Hello there!"
        tokenizer = SimpleWordTokenizer(corpus)
        print(tokenizer.encode('Hello'))

    """

    def __init__(self, corpus: list[str], vocabulary: list[str] | None = None):
        """Initializes the tokenizer with texts in corpus or with a vocabulary.

        Args:
            corpus: Input text dataset.
            vocabulary: A pre-defined vocabulary. If None,
                the vocabulary is automatically inferred from the texts.
        """

        if vocabulary is None:
            # Build the vocabulary from scratch.
            if isinstance(corpus, str):
                corpus = [corpus]

            # Convert text sequence to tokens.
            tokens = []
            for text in corpus:
                for token in self.space_tokenize(text):
                    tokens.append(token)

            # Create a vocabulary comprising of unique tokens.
            self.vocabulary = self.build_vocabulary(tokens)

        else:
            self.vocabulary = vocabulary

        # Size of vocabulary.
        self.vocabulary_size = len(self.vocabulary)

        # Create token-to-index and index-to-token mappings.
        self.token_to_index = {}
        self.index_to_token = {}
        # Loop through all tokens in the vocabulary. enumerate automatically
        # assigns a unique index to each token.
        for index, token in enumerate(self.vocabulary):
            self.token_to_index[token] = index
            self.index_to_token[index] = token

    def space_tokenize(self, text: str) -> list[str]:
        """Splits a given text on space into tokens.

        Args:
            text: Text to split on space.

        Returns:
            List of tokens after splitting `text`.
        """

        # Use re.split such that multiple spaces are treated as a single
        # separator.
        return re.split(" +", text)

    def join_text(self, text_list: list[str]) -> str:
        """Combines a list of tokens into a single string, with tokens separated
           by spaces.

        Args:
            text_list: List of tokens to be joined.

        Returns:
            String with all tokens joined with a space.

        """
        return " ".join(text_list)

    def build_vocabulary(self, tokens: list[str]) -> list[str]:
        """Create a vocabulary list from the list of tokens.

        Args:
            tokens: The list of tokens in the dataset.

        Returns:
            List of unique tokens (vocabulary) in the dataset.
        """
        return sorted(list(set(tokens)))

    def encode(self, text: str) -> list[int]:
        """Encodes a text sequence into a list of indices.

        Args:
            text: The input text to be encoded.

        Returns:
            A list of indices corresponding to the tokens in the input text.
        """

        # Convert tokens into indices.
        indices = []
        for token in self.space_tokenize(text):
            token_index = self.token_to_index.get(token)
            indices.append(token_index)

        return indices

    def decode(self, indices: int | list[int]) -> str:
        """Decodes a list (or single index) of integers back into tokens.

        Args:
            indices: A single index or a list of indices to be decoded into
                tokens.

        Returns:
            str: A string of decoded tokens corresponding to the input indices.
        """

        # If a single integer is passed, convert it into a list.
        if isinstance(indices, int):
            indices = [indices]

        # Map indices to tokens.
        tokens = []
        for index in indices:
            token = self.index_to_token.get(index)
            tokens.append(token)

        # Join the decoded tokens into a single string.
        return self.join_text(tokens)

In [21]:
tokenizer = SimpleWordTokenizer(dataset)
slm.test_simple_word_tokenizer(tokenizer, vocabulary, dataset)

✅ Nice! The tokenizer seems to be working correctly.
