# Intro

Learning a mapping X -> Y is what supervised machine learning is about. We tried it with random numbers. We tried it with images: pixels are numbers. Everything is numbers.

We can treat text the same way.

In [None]:
from datasets import load_dataset
from collections import Counter
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from pathlib import Path

In [None]:
# Get the dataset
# Load the dataset registered under the name "imdb" via `datasets.load_dataset`.
imdb = load_dataset("imdb")

# Notebook display: number of records under the 'train' key and the first one.
len(imdb['train']), imdb['train'][0]

# One Hot Vectors

![One Hot](https://miro.medium.com/max/828/1*9ZuDXoc2ek-GfHE2esty5A.webp)
src - https://medium.com/intelligentmachines/word-embedding-and-one-hot-encoding-ad17b4bbe111

In [None]:
# Toy "document": seven words, lower-cased up front so later cells can use
# the tokens as-is.
document = str.lower("Rome Paris Italy France Potato Kartoffeln Patate")
# Split on single spaces: one token per word.
tokens = document.split(sep=' ')

tokens  # notebook display

In [None]:
# Build a token -> integer id vocabulary the explicit way: the first time a
# token is seen it receives the next free id (the current vocabulary size).
vocab = {}
for token in tokens:
    # No-op for the toy document (it was lower-cased when it was created),
    # but keeps the cell correct if `tokens` ever arrives un-normalised.
    token = token.lower()
    if token not in vocab:
        vocab[token] = len(vocab)

vocab  # notebook display

In [None]:
# Build the token -> id vocabulary with dict.setdefault: the value is stored
# only when the token is not already a key, so repeated tokens keep their id.
vocab = {}
for token in tokens:
    # len(vocab) is evaluated before the insert, so new ids run 0, 1, 2, ...
    vocab.setdefault(token, len(vocab))

# Notebook display of the resulting mapping.
vocab

In [None]:
# One-hot encode the toy document.
# Rows index the vocabulary, columns index token positions in the document;
# each column holds a single 1 in the row of the word found at that position.
one_hots = np.zeros((len(vocab), len(tokens)))
for word_nr, token in enumerate(tokens):
    word_id = vocab[token]
    one_hots[word_id, word_nr] = 1

one_hots  # notebook display

## Multi-Hot Bag of Words

# IMDB

In [None]:
# Let's just work with 1000 documents for now
# TODO(exercise): replace both placeholders with the texts and the labels of
# the 1000-document training subset. `...` keeps the cell syntactically valid
# until then, like the other placeholder cells in this notebook.
train_text = ...
train_labels = ...

# Preprocess (estimate: 30-40 minutes).

This is the most difficult part ^^

## 1. Tokenize Text

Document is one long string of text -> One unit (pixel) can be a word.

In [None]:
def tokenize(document):
    """Split ``document`` into tokens (exercise stub, body not yet written).

    NOTE(review): presumably takes one document as a single string and
    returns its word-level units; confirm once implemented.
    """
    ...

In [None]:
# Test our basic tokenizer
...

### Spacy Tokenizer
This one is actually useful.

### To Install It

`! pip install spacy`

`! python -m spacy download en_core_web_sm`

within jupyter cells

In [None]:
# Actually useful tokenizer
import spacy
# Pipeline components to leave out when loading the model. This section only
# uses the model for tokenization.
# NOTE(review): confirm no later cell relies on any of the excluded components.
exclude = ["parser", "tagger", "ner", "textcat", "attribute_ruler", "lemmatizer", "tok2vec"]
# Load the "en_core_web_sm" model without the components listed above.
nlp = spacy.load("en_core_web_sm", exclude=exclude)

In [None]:
# Test Spacy Tokenizer on one doc
...

In [None]:
# This takes 2-5 minutes. We'll talk till then ^^'

train_docs = ... # use NLP pipe
tokenized_train_text = ...

## 1.1 Exploring the data 

- Length Distribution


In [None]:
# Distribution of document lengths (number of tokens per document).
lens = [len(doc) for doc in tokenized_train_text]

# 50 equal-width bins spanning [0, max(lens)] inclusive.
# The upper edge must be the maximum itself: values beyond the last bin edge
# are silently left out of the histogram, so stopping short of the maximum
# would drop the longest documents. linspace also avoids the zero step that
# an integer `max(lens) // 50` produces when the longest document has fewer
# than 50 tokens.
n_bins = 50
bin_ranges = np.linspace(0, max(lens), n_bins + 1)

print(f"Over {len(lens)} documents, the mean is {np.mean(lens):.2f} ± {np.std(lens):.2f}")
plt.figure(figsize=(14, 8))
plt.hist(lens, bins=bin_ranges, edgecolor='black')
plt.show()

## 2. Create Vocabulary

In [None]:
# The same setdefault stuff we did above
vocab = {}
...
len(vocab)

### That's way too many words. 121064?

Let's make sure we have only 10000 words. First 10000 words?
NO! The most common 10000 words

How?
- count the frequency of all the tokens
- sort it and choose top 10,000
- turn text to IDs based on this. For the rejected words, turn them into something like 'UNKNOWN'.

In [None]:
# Understanding Counters
counter = Counter()

counter.update(['the', 'red', 'pill'])
print(counter)
counter.update(['the', 'blue', 'gill'])
print(counter)

In [None]:
# Run a counter over our tokenized dataset
counter = ...

In [None]:
# Let's see what turned out (checkout most_common)
...

In [None]:
# Plot the word frequencies to help decide on a vocabulary limit
# most_common() with no argument yields (word, count) pairs ordered from the
# most to the least frequent, so word_counts[0] is the highest count.
word_counts = [count for word, count in counter.most_common()]
total_words = len(word_counts)

# Summary statistics of the frequency distribution
print(f"Vocabulary size: {total_words} unique words")
print(f"Most common word appears {word_counts[0]} times")
print(f"Mean word frequency: {np.mean(word_counts):.2f} ± {np.std(word_counts):.2f}")

# One figure holding two stacked panels
plt.figure(figsize=(12, 10))

# Top panel: frequency histogram on a linear scale (50 equal-width bins)
plt.subplot(211)
plt.hist(word_counts, bins=50, color='steelblue', edgecolor='black')
plt.title('Word Frequency Distribution (Linear Scale)')
plt.xlabel('Word Frequency')
plt.ylabel('Number of Words')

# Bottom panel: the same counts with logarithmically spaced bin edges
# (from 1 up to the highest count) and a log x-axis, to better visualize the
# long tail
plt.subplot(212)
plt.hist(word_counts, bins=np.logspace(0, np.log10(max(word_counts)), 50), 
         color='steelblue', edgecolor='black')
plt.xscale('log')
plt.title('Word Frequency Distribution (Log Scale)')
plt.xlabel('Word Frequency (log scale)')
plt.ylabel('Number of Words')

# Adjust subplot spacing, then render the figure
plt.tight_layout()
plt.show()

In [None]:
# Vocabulary limit: how many of the most common words to keep.
n_words = 10_000

In [None]:
# Let's create the actual vocab now.
# We need one special word for 'UNKNOWN': those words that are 'out of vocabulary' for us
# and one for 'PADDING': when a sequence is shorter than the sequence length we decided on
vocab = ...

In [None]:
# NOTE: this updates n_words in place, so re-running this cell adds another 2
# each time; re-run the cell that sets n_words first if you need to repeat it.
n_words = n_words + 2 # for the special tokens (UNKNOWN and PADDING)

!! **Good idea to go through the vocabulary, spot the fishy ones and re-adapt your preprocessing to take care of them.**


## 3. Converting tokens to word IDs


In [None]:
wordid_train_text = ...
bow_train_text = ...

In [None]:
# Finally, turn them into vectors and dump to disk (keep them at float32; reshape the Y)

X = ...
Y = ...

X.shape, Y.shape, X.dtype, Y.dtype

In [None]:
# Persist the vocabulary (JSON) and the word-id encoded data (pickle) to disk.
# json and pickle are imported here because no other cell imports them.
import json
import pickle


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, overwriting any existing file."""
    with path.open('wb') as f:
        pickle.dump(obj, f)


dump_dir = Path('../resources/datasets/imdb/wordid_vocab')
dump_dir.mkdir(parents=True, exist_ok=True)

with (dump_dir / 'vocab.json').open('w') as f:
    json.dump(vocab, f)

_dump_pickle(wordid_train_text, dump_dir / 'wordids_train.pkl')
_dump_pickle(train_labels, dump_dir / 'train_labels.pkl')

# NOTE(review): `wordid_test_text` and `test_labels` are not assigned in any
# cell of this notebook as it stands; they have to be created (the test data
# run through the same preprocessing) before the two lines below can succeed.
_dump_pickle(wordid_test_text, dump_dir / 'wordids_test.pkl')
_dump_pickle(test_labels, dump_dir / 'test_labels.pkl')


In [None]:
# Dump this to disk
# The arrays go into a 'bow_onehot' directory next to the current dump_dir.
dump_dir = dump_dir.parent / 'bow_onehot'
dump_dir.mkdir(parents=True, exist_ok=True)

# Write X first, then Y, each through an explicitly opened binary file handle.
for filename, array in (('X_train.np', X), ('Y_train.np', Y)):
    with (dump_dir / filename).open('wb+') as f:
        np.save(f, array)

In [None]:
# Repeat for all the documents
# RIP your RAM