In [9]:
from itertools import islice
from collections import defaultdict

import pandas as pd
import numpy as np

## Corpus

In [2]:
import kagglehub

# Download the latest version of the Kaggle dataset; `path` is used below as
# the directory from which `movies_dataset.csv` is read.
path = kagglehub.dataset_download("manashjyotiborah/top-10000-movies-hosted-on-tmdb")

Downloading from https://www.kaggle.com/api/v1/datasets/download/manashjyotiborah/top-10000-movies-hosted-on-tmdb?dataset_version_number=2...


100%|██████████| 16.9M/16.9M [00:00<00:00, 91.8MB/s]

Extracting files...





In [3]:
# Read the movie table indexed by its `id` column and discard incomplete rows.
# NOTE(review): dropna(axis=0) removes a row when ANY column is missing, not
# only `overview` -- confirm that discarding those movies is intended.
overview_df = pd.read_csv(f'{path}/movies_dataset.csv', index_col='id').dropna(axis=0)

# One overview string per remaining row, in the order the rows appear.
corpus = overview_df['overview'].to_list()
corpus[:3]

["Over many missions and against impossible odds, Dom Toretto and his family have outsmarted, out-nerved and outdriven every foe in their path. Now, they confront the most lethal opponent they've ever faced: A terrifying threat emerging from the shadows of the past who's fueled by blood revenge, and who is determined to shatter this family and destroy everything—and everyone—that Dom loves, forever.",
 "Tasked with extracting a family who is at the mercy of a Georgian gangster, Tyler Rake infiltrates one of the world's deadliest prisons in order to save them. But when the extraction gets hot, and the gangster dies in the heat of battle, his equally ruthless brother tracks down Rake and his team to Sydney, in order to get revenge.",
 'With the price on his head ever increasing, John Wick uncovers a path to defeating The High Table. But before he can earn his freedom, Wick must face off against a new enemy with powerful alliances across the globe and forces that turn old friends into foe

## Tokenization

In [4]:
# Marker symbol: appended as the last element of every word's symbol tuple and
# as the final entry of `vocab` in the cells below.
end_of_word = '/w'

In [5]:
# Collect every distinct character that occurs anywhere in the corpus.
unique_chars = set()
for overview in corpus:
    # Iterating a string yields its characters, so update() adds them all.
    unique_chars.update(overview)

# Base vocabulary: the characters in sorted order, followed by the
# end-of-word marker as the last entry.
vocab = sorted(unique_chars)
vocab.append(end_of_word)

vocab[:10], len(vocab)

(['\r', ' ', '!', '"', '#', '$', '%', '&', "'", '('], 129)

In [6]:
# Frequency table keyed by (word, end_of_word) tuples.
# NOTE(review): each word is stored here as ONE symbol (it is not broken into
# characters); a later cell rebuilds `word_splits` with per-character tuples.
# Confirm whether this word-level table is still needed.
word_splits = defaultdict(int)
for overview in corpus:
    # str.split() with no separator splits on any whitespace run and never
    # yields empty strings, so no emptiness check is required.
    for word in overview.split():
        word_splits[(word, end_of_word)] += 1


len(word_splits)

49896

In [10]:
# Show the first ten keys of `word_splits` (iterating a dict yields its keys
# in insertion order).
list(islice(word_splits, 10))

[('Over', '/w'),
 ('many', '/w'),
 ('missions', '/w'),
 ('and', '/w'),
 ('against', '/w'),
 ('impossible', '/w'),
 ('odds,', '/w'),
 ('Dom', '/w'),
 ('Toretto', '/w'),
 ('his', '/w')]

In [11]:
# Rebuild the frequency table at character level: each key is the tuple of a
# word's characters followed by the end-of-word marker, each value is how many
# times that word occurs in the corpus.
word_splits = {}
for doc in corpus:
    # Splitting on a single literal space can produce empty strings (e.g. for
    # consecutive spaces); those are skipped. Other whitespace characters are
    # not separators here and stay inside the token.
    for word in doc.split(' '):
        if not word:
            continue
        word_tuple = (*word, end_of_word)
        word_splits[word_tuple] = word_splits.get(word_tuple, 0) + 1

list(islice(word_splits, 10))

[('O', 'v', 'e', 'r', '/w'),
 ('m', 'a', 'n', 'y', '/w'),
 ('m', 'i', 's', 's', 'i', 'o', 'n', 's', '/w'),
 ('a', 'n', 'd', '/w'),
 ('a', 'g', 'a', 'i', 'n', 's', 't', '/w'),
 ('i', 'm', 'p', 'o', 's', 's', 'i', 'b', 'l', 'e', '/w'),
 ('o', 'd', 'd', 's', ',', '/w'),
 ('D', 'o', 'm', '/w'),
 ('T', 'o', 'r', 'e', 't', 't', 'o', '/w'),
 ('h', 'i', 's', '/w')]

In [12]:
def get_pair_stats(splits):
    """Count adjacent symbol pairs across all words, weighted by frequency.

    Args:
        splits: Mapping from a word's symbol sequence (e.g. a tuple of
            strings) to the number of times that word occurs.

    Returns:
        A ``defaultdict(int)`` mapping each ``(left, right)`` pair of
        neighbouring symbols to the summed frequency of the words that
        contain it. A pair that occurs several times inside one word is
        counted once per occurrence.
    """
    pair_counts = defaultdict(int)
    for word_symbols, word_freq in splits.items():
        seq = tuple(word_symbols)
        # Zipping the sequence with itself shifted by one yields every
        # neighbouring pair; sequences shorter than two symbols yield none.
        for adjacent in zip(seq, seq[1:]):
            pair_counts[adjacent] += word_freq
    return pair_counts

# Adjacent-pair frequencies over the current `word_splits` table.
pair_stats = get_pair_stats(word_splits)

# Show the first ten pairs in insertion order (the order they were first seen).
list(islice(pair_stats, 10))

[('O', 'v'),
 ('v', 'e'),
 ('e', 'r'),
 ('r', '/w'),
 ('m', 'a'),
 ('a', 'n'),
 ('n', 'y'),
 ('y', '/w'),
 ('m', 'i'),
 ('i', 's')]