# Intro

Supervised machine learning means learning a mapping X -> Y. We tried it with random numbers. We tried it with images. Pixels are numbers. Everything is numbers.

We can treat text the same way.

In [None]:
# Shell escape: show system memory usage in human-readable units
! free -h

In [2]:
from datasets import load_dataset
from collections import Counter
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import pickle
import json
from pathlib import Path
import torch 
import torch.nn as nn

In [None]:
# Load the IMDB dataset through `datasets.load_dataset`
imdb = load_dataset("imdb")

# Show the size of the training split together with its first example
(len(imdb['train']), imdb['train'][0])

# One Hot Vectors

![One Hot](https://miro.medium.com/max/828/1*9ZuDXoc2ek-GfHE2esty5A.webp)
src - https://medium.com/intelligentmachines/word-embedding-and-one-hot-encoding-ad17b4bbe111

In [None]:
# Toy sentence, lowercased so that "A" and "a" end up as the same token
document = "A girl called Siyana had a little lamb".lower()
# Naive tokenization: cut the string at every single space
tokens = document.split(sep=' ')

tokens

In [None]:
# Map every distinct (lowercased) token to an integer ID.
# IDs are handed out in order of first appearance: 0, 1, 2, ...
vocab = {}
for token in tokens:
    token = token.lower()
    if token in vocab:
        # Already has an ID; nothing to do
        continue
    vocab[token] = len(vocab)

vocab

In [None]:
# Same token -> ID mapping, but `dict.setdefault` performs the
# "insert only if the key is missing" check for us.
vocab = dict()
for token in tokens:
    # len(vocab) is evaluated before the insert, so a new token gets the next unused ID
    vocab.setdefault(token, len(vocab))

vocab

In [None]:
# One row per vocabulary entry, one column per token position:
# column `word_nr` holds a single 1, in the row given by that token's ID.
one_hots = np.zeros((len(vocab), len(tokens)))
for word_nr, token in enumerate(tokens):
    word_id = vocab[token]
    one_hots[word_id, word_nr] = 1

one_hots

## Multi-Hot Bag of Words

# IMDB

In [None]:
# Pull the raw review strings and their labels out of the training split.
# The whole split is used; slice both lists (e.g. `[:1000]`) for a quicker run.
train_text = [example['text'] for example in imdb['train']]
train_labels = [example['label'] for example in imdb['train']]

# Preprocess (estimate: 30-40 minutes).

This is the most difficult part ^^

## 1. Tokenize Text

Document is one long string of text -> One unit (pixel) can be a word.

In [None]:
def tokenize(document):
    """
    Split a document into lowercase, whitespace-separated tokens.

    Steps:
        1. lowercase everything
        2. put a space in front of every "." and "!" so they become tokens of their own
        3. split on runs of whitespace

    Args:
        document (str): the raw text.

    Returns:
        list[str]: the tokens in order of appearance (empty list for empty input).
    """
    document = document.lower()
    document = document.replace(".", " .").replace("!", " !")
    return document.split()

In [None]:
# Try the basic tokenizer on the first training review;
# joining with '|' makes the token boundaries visible
first_review = imdb['train'][0]['text']
'|'.join(tokenize(first_review))

### Spacy Tokenizer
This one is actually useful.

### To Install It

`! pip install spacy`

`! python -m spacy download en_core_web_sm`

within jupyter cells

In [None]:
# Actually useful tokenizer
import spacy

# Pipeline components to leave out when loading the model;
# this notebook only reads the text of each token.
exclude = ["parser", "tagger", "ner", "textcat", "attribute_ruler", "lemmatizer"]
nlp = spacy.load("en_core_web_sm", exclude=exclude)


def get_spacy_tokens(text):
    """Run `text` through the spaCy pipeline and return the text of each token, in order."""
    spacy_doc = nlp(text)
    return [tok.text for tok in spacy_doc]

In [None]:
# Run the spaCy tokenizer on the first training review
tokens = get_spacy_tokens(train_text[0])
# '|' separators make the token boundaries easy to spot
"|".join(tokens)

In [None]:
# Shell escape: show system memory usage in human-readable units
! free -h

In [None]:
# This takes 2-5 minutes. We'll talk till then ^^'

# Run every training review through the spaCy pipeline and keep the resulting docs
train_docs = list(nlp.pipe(train_text))
# Reduce each doc to the plain strings of its tokens
tokenized_train_text = [
    [token.text for token in spacy_doc]
    for spacy_doc in train_docs
]

## 2. Create Vocabulary

In [None]:
# Build a vocabulary over the whole tokenized training corpus, as in the toy
# example: every previously unseen token receives the next free ID
vocab = {}
for document in tqdm(tokenized_train_text):
    for token in document:
        if token not in vocab:
            vocab[token] = len(vocab)

len(vocab)

### That's way too many words. 121064?

Let's make sure we have only 10000 words. First 10000 words?
NO! The most common 10000 words

How?
- count the frequency of all the tokens
- sort it and choose top 10,000
- turn text to IDs based on this. For the rejected words, turn them into something like 'UNKNOWN'.

In [None]:
# Understanding Counters: every update() adds the occurrences found in a new
# iterable on top of the counts collected so far
counter = Counter()

for batch in (['the', 'red', 'pill'], ['the', 'blue', 'gill']):
    counter.update(batch)
    print(counter)

In [None]:
# Count how often each token occurs across the whole training corpus
# (the progress bar advances once per document)
counter = Counter(
    token
    for document in tqdm(tokenized_train_text)
    for token in document
)

In [None]:
# Number of distinct tokens, followed by the ten most frequent ones with their counts
(len(counter), counter.most_common(10))

In [None]:
# How many of the most frequent tokens to keep in the vocabulary
n_words = 10_000

In [None]:
# Let's create the actual vocab now.
# Two special entries come first:
#   '--UNK--' for 'UNKNOWN': words that are 'out of vocabulary' for us
#   '--PAD--' for 'PADDING': when a sequence is shorter than the sequence length we decided on
vocab = {'--UNK--': 0, '--PAD--': 1}

# most_common() yields (word, count) pairs, most frequent first; only the word is needed here
for word, _count in counter.most_common(n_words):
    vocab.setdefault(word, len(vocab))

In [None]:
# Total vocabulary size, special tokens included. Reading it off the dict
# (instead of adding 2 to the previous value) keeps this cell safe to re-run
# and stays correct when the corpus has fewer distinct tokens than requested.
n_words = len(vocab)

!! **Good idea to go through the vocabulary, spot the fishy ones and re-adapt your preprocessing to take care of them.**


## 3. Converting tokens to word IDs


In [None]:
# Replace every token by its vocabulary ID; tokens that did not make it
# into the vocabulary all collapse onto the '--UNK--' ID
unk_id = vocab['--UNK--']
wordid_train_text = [
    [vocab.get(token, unk_id) for token in tokenized_doc]
    for tokenized_doc in tokenized_train_text
]

In [None]:
# Shell escape: show system memory usage in human-readable units
! free -h

### 3.2 Do the same for test text


In [None]:
# Same steps as for the training split: raw text -> spaCy docs -> token strings -> word IDs.
# The whole test split is used; slice the first two lists (e.g. `[:1000]`) for a quicker run.
test_text = [example['text'] for example in imdb['test']]
test_labels = [example['label'] for example in imdb['test']]
test_docs = list(nlp.pipe(test_text))
tokenized_test_text = [
    [token.text for token in spacy_doc]
    for spacy_doc in test_docs
]
# The vocabulary comes from the training split only; unseen tokens map to '--UNK--'
unk_id = vocab['--UNK--']
wordid_test_text = [
    [vocab.get(token, unk_id) for token in tokenized_doc]
    for tokenized_doc in tokenized_test_text
]


## 4. Dump this stuff to disk

The next step is a transformation where we lose information (i.e. we can't get the sequence back from a Bag of Words), so we save the word-ID sequences first.

In [None]:
# Dump the WordID and vocab to disk
dump_dir = Path('../resources/datasets/imdb/wordid_vocab')
dump_dir.mkdir(parents=True, exist_ok=True)

# The vocabulary is a plain str -> int dict, so it is stored as JSON
with (dump_dir/'vocab.json').open('w+') as f:
    json.dump(vocab, f)

# Everything else is a Python list and gets pickled, one file per object
pickled_objects = {
    'wordids_train.pkl': wordid_train_text,
    'train_labels.pkl': train_labels,
    'wordids_test.pkl': wordid_test_text,
    'test_labels.pkl': test_labels,
}
for filename, payload in pickled_objects.items():
    with (dump_dir/filename).open('wb+') as f:
        pickle.dump(payload, f)

In [7]:
# Try loading from disk
dump_dir = Path('../resources/datasets/imdb/wordid_vocab')


def _read_pickle(directory, filename):
    """Unpickle and return the object stored in `directory / filename`."""
    # Only use this on files written by this notebook: unpickling can execute arbitrary code
    with (directory/filename).open('rb') as f:
        return pickle.load(f)


with (dump_dir/'vocab.json').open('r') as f:
    vocab = json.load(f)

wordid_train_text = _read_pickle(dump_dir, 'wordids_train.pkl')
train_labels = _read_pickle(dump_dir, 'train_labels.pkl')
wordid_test_text = _read_pickle(dump_dir, 'wordids_test.pkl')
test_labels = _read_pickle(dump_dir, 'test_labels.pkl')

# Vocabulary size, special tokens included
n_words = len(vocab)

## 5. Bag of Words

We don't need sequences. 

In [8]:
# Case 1: Only one-hot representation

In [9]:
# Binary bag of words: X[i, j] is 1 if word ID j occurs anywhere in document i, else 0
X = np.zeros((len(wordid_train_text), n_words), dtype=np.float32)
# Labels as a float32 column vector, one row per document
Y = np.asarray(train_labels, dtype=np.float32).reshape(-1, 1)

for i, wordid_document in enumerate(wordid_train_text):
    # Switch on all of this document's word-ID columns in a single indexed
    # assignment; a repeated ID simply writes the same 1 again
    X[i, wordid_document] = 1


X.max(), X.mean()

(1.0, 0.013863911)

In [10]:
# Dump this to disk, in a 'bow_onehot' directory next to the current dump directory
dump_dir = dump_dir.parent / 'bow_onehot'
dump_dir.mkdir(parents=True, exist_ok=True)

for filename, array in (('X_train.np', X), ('Y_train.np', Y)):
    # np.save is given an open file object, so the file name is used exactly as written
    with (dump_dir / filename).open('wb+') as f:
        np.save(f, array)

In [11]:
# Do the same for test stuff
# Overwriting variable names to conserve RAM
# Binary bag of words: X[i, j] is 1 if word ID j occurs anywhere in test document i, else 0
X = np.zeros((len(wordid_test_text), n_words), dtype=np.float32)
# Labels as a float32 column vector, one row per document
Y = np.asarray(test_labels, dtype=np.float32).reshape(-1, 1)

for i, wordid_document in enumerate(wordid_test_text):
    # One indexed assignment per document; a repeated ID simply writes the same 1 again
    X[i, wordid_document] = 1


X.max(), X.mean()

(1.0, 0.013560608)

In [12]:
# Dump this to disk, in the 'bow_onehot' directory next to the current dump directory
dump_dir = dump_dir.parent / 'bow_onehot'
dump_dir.mkdir(parents=True, exist_ok=True)

for filename, array in (('X_test.np', X), ('Y_test.np', Y)):
    # np.save is given an open file object, so the file name is used exactly as written
    with (dump_dir / filename).open('wb+') as f:
        np.save(f, array)

In [13]:
# Case 2: MultiHot Representations (with frequencies)

In [14]:
# Bag of words with frequencies: X[i, j] counts how often word ID j occurs in document i
X = np.zeros((len(wordid_train_text), n_words), dtype=np.float32)
# Labels as a float32 column vector, one row per document
Y = np.asarray(train_labels, dtype=np.float32).reshape(-1, 1)

for i, wordid_document in enumerate(wordid_train_text):
    # np.add.at accumulates unbuffered, so an ID that appears k times adds k to its column
    # (a plain `X[i, ids] += 1` would count each repeated ID only once)
    np.add.at(X[i], wordid_document, 1)

X.max(), X.mean()

(454.0, 0.02724074)

In [15]:
# Dump this to disk, in a 'bow_multihot' directory next to the current dump directory
dump_dir = dump_dir.parent / 'bow_multihot'
dump_dir.mkdir(parents=True, exist_ok=True)

for filename, array in (('X_train.np', X), ('Y_train.np', Y)):
    # np.save is given an open file object, so the file name is used exactly as written
    with (dump_dir / filename).open('wb+') as f:
        np.save(f, array)

In [16]:
# Do the same for test stuff
# Overwriting variable names to conserve RAM
# Bag of words with frequencies: X[i, j] counts how often word ID j occurs in test document i
X = np.zeros((len(wordid_test_text), n_words), dtype=np.float32)
# Labels as a float32 column vector, one row per document
Y = np.asarray(test_labels, dtype=np.float32).reshape(-1, 1)

for i, wordid_document in enumerate(wordid_test_text):
    # np.add.at accumulates unbuffered, so an ID that appears k times adds k to its column
    np.add.at(X[i], wordid_document, 1)


X.max(), X.mean()

(294.0, 0.02662703)

In [17]:
# Dump this to disk, in the 'bow_multihot' directory next to the current dump directory
dump_dir = dump_dir.parent / 'bow_multihot'
dump_dir.mkdir(parents=True, exist_ok=True)

for filename, array in (('X_test.np', X), ('Y_test.np', Y)):
    # np.save is given an open file object, so the file name is used exactly as written
    with (dump_dir / filename).open('wb+') as f:
        np.save(f, array)