# Intro

Supervised machine learning means learning a mapping X -> Y. We tried it with random numbers. We tried it with images. Pixels are numbers. Everything is numbers.

We can treat text the same way.

In [99]:
from datasets import load_dataset
from collections import Counter
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from pathlib import Path
import torch 
import torch.nn as nn

In [2]:
# Get the dataset; the individual splits are accessed by name below (imdb['train'])
imdb = load_dataset("imdb")

# Display the number of training examples together with the first training example
len(imdb['train']), imdb['train'][0]

Reusing dataset imdb (/home/priyansh/Dev/perm/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


(25000,
 {'label': 1,
  'text': 'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'})

# One Hot Vectors

![One Hot](https://miro.medium.com/max/828/1*9ZuDXoc2ek-GfHE2esty5A.webp)
src - https://medium.com/intelligentmachines/word-embedding-and-one-hot-encoding-ad17b4bbe111

In [3]:
# Toy example: lowercase the sentence so that "A" and "a" end up as the same token
document = "A girl called Siyana had a little lamb".lower()
# Naive tokenization: split on single spaces
tokens = document.split(' ')

tokens

['a', 'girl', 'called', 'siyana', 'had', 'a', 'little', 'lamb']

In [4]:
# Build a token -> integer id mapping: every token that has not been seen yet
# gets the next free id (which is simply the current size of the vocabulary).
vocab = {}
for token in tokens:
    token = token.lower()
    if token not in vocab:
        vocab[token] = len(vocab)

vocab

{'a': 0, 'girl': 1, 'called': 2, 'siyana': 3, 'had': 4, 'little': 5, 'lamb': 6}

In [5]:
# Same idea as the explicit membership check, written more compactly:
# setdefault only stores len(vocab) as the id when the token has no entry yet.
vocab = {}
for token in tokens:
    vocab.setdefault(token, len(vocab))
        
vocab

{'a': 0, 'girl': 1, 'called': 2, 'siyana': 3, 'had': 4, 'little': 5, 'lamb': 6}

In [6]:
# One row per vocabulary entry, one column per token position:
# one_hots[word_id, word_nr] == 1 means the token at position word_nr has id word_id.
one_hots = np.zeros((len(vocab), len(tokens)))
for word_nr, token in enumerate(tokens):
    word_id = vocab[token]
    one_hots[word_id, word_nr] = 1

one_hots

array([[1., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]])

## Multi-Hot Bag of Words

# IMDB

In [7]:
# Let's just work with 1000 documents for now.
# Slicing the split returns the first 1000 rows as a dict of columns, so we no longer
# iterate over the entire training split twice only to throw most of it away.
# NOTE(review): if the split is stored grouped by label, the first 1000 rows may all
# share one label -- confirm, and shuffle before slicing if that is the case.

train_text = imdb['train'][:1000]['text']
train_labels = imdb['train'][:1000]['label']

# Preprocess (estimate: 30-40 minutes).

This is the most difficult part ^^

## 1. Tokenize Text

A document is one long string of text -> one unit (the equivalent of a pixel) can be a word.

In [8]:
def tokenize(document):
    """
        Basic whitespace tokenizer.

        Puts a space in front of every "." and "!" so that they become
        separate tokens, then splits the string on whitespace.
        Casing is left untouched: nothing is lowercased here.

        Returns a list of token strings (empty list for an empty document).
    """
    # Detach the two punctuation marks from the word they are glued to
    for mark in (".", "!"):
        document = document.replace(mark, " " + mark)
    return document.split()

In [9]:
# Test our basic tokenizer
'|'.join(tokenize(imdb['train'][0]['text']))

'Bromwell|High|is|a|cartoon|comedy|.|It|ran|at|the|same|time|as|some|other|programs|about|school|life,|such|as|"Teachers"|.|My|35|years|in|the|teaching|profession|lead|me|to|believe|that|Bromwell|High\'s|satire|is|much|closer|to|reality|than|is|"Teachers"|.|The|scramble|to|survive|financially,|the|insightful|students|who|can|see|right|through|their|pathetic|teachers\'|pomp,|the|pettiness|of|the|whole|situation,|all|remind|me|of|the|schools|I|knew|and|their|students|.|When|I|saw|the|episode|in|which|a|student|repeatedly|tried|to|burn|down|the|school,|I|immediately|recalled|.|.|.|.|.|.|.|.|.|at|.|.|.|.|.|.|.|.|.|.|High|.|A|classic|line:|INSPECTOR:|I\'m|here|to|sack|one|of|your|teachers|.|STUDENT:|Welcome|to|Bromwell|High|.|I|expect|that|many|adults|of|my|age|think|that|Bromwell|High|is|far|fetched|.|What|a|pity|that|it|isn\'t|!'

### Spacy Tokenizer
This one is actually useful.

### To Install It

`! pip install spacy`

`! python -m spacy download en_core_web_sm`

Run both commands from within Jupyter cells.

In [10]:
# Actually useful tokenizer
import spacy
# We only need token boundaries, so every trained pipeline component is left out.
# "tok2vec" is excluded as well: otherwise it would still run on every document
# even though all the components that consume its output are gone.
# Names that are not part of the loaded pipeline (e.g. "textcat") are simply ignored.
exclude = ["tok2vec", "parser", "tagger", "ner", "textcat", "attribute_ruler", "lemmatizer"]
nlp = spacy.load("en_core_web_sm", exclude=exclude)

def get_spacy_tokens(text):
    """Run `text` through the module-level `nlp` pipeline and return the token strings as a list."""
    doc = nlp(text)
    return [tok.text for tok in doc]

In [11]:
# Test Spacy Tokenizer
tokens = get_spacy_tokens(train_text[0])
'|'.join(tokens)

'Bromwell|High|is|a|cartoon|comedy|.|It|ran|at|the|same|time|as|some|other|programs|about|school|life|,|such|as|"|Teachers|"|.|My|35|years|in|the|teaching|profession|lead|me|to|believe|that|Bromwell|High|\'s|satire|is|much|closer|to|reality|than|is|"|Teachers|"|.|The|scramble|to|survive|financially|,|the|insightful|students|who|can|see|right|through|their|pathetic|teachers|\'|pomp|,|the|pettiness|of|the|whole|situation|,|all|remind|me|of|the|schools|I|knew|and|their|students|.|When|I|saw|the|episode|in|which|a|student|repeatedly|tried|to|burn|down|the|school|,|I|immediately|recalled|.........|at|..........|High|.|A|classic|line|:|INSPECTOR|:|I|\'m|here|to|sack|one|of|your|teachers|.|STUDENT|:|Welcome|to|Bromwell|High|.|I|expect|that|many|adults|of|my|age|think|that|Bromwell|High|is|far|fetched|.|What|a|pity|that|it|is|n\'t|!'

In [12]:
# This takes 2-5 minutes. We'll talk till then ^^'

# nlp.pipe processes the texts as a stream; list() forces every document to be processed now
train_docs = list(nlp.pipe(train_text))
# Keep only the surface string of every token: one list of strings per document
tokenized_train_text = [[tok.text for tok in doc] for doc in train_docs]

## 2. Create Vocabulary

In [41]:
# The same setdefault stuff we did above
# Every distinct token in the tokenized corpus gets the next free integer id.
vocab = {}
for document in tqdm(tokenized_train_text):
    for token in document:
        vocab.setdefault(token, len(vocab))
    
# Number of distinct tokens
len(vocab)

  0%|          | 0/25000 [00:00<?, ?it/s]

121064

### That's way too many words. 121064?

Let's make sure we have only 10000 words. First 10000 words?
NO! The most common 10000 words

How?
- count the frequency of all the tokens
- sort it and choose top 10,000
- turn text to IDs based on this. For the rejected words, turn them into something like 'UNKNOWN'.

In [42]:
# Understanding Counters: update() adds one occurrence for every item it is given,
# so a word that shows up in both lists ends up with a count of 2.
counter = Counter()

for words in (['the', 'red', 'pill'], ['the', 'blue', 'gill']):
    counter.update(words)
    print(counter)

Counter({'the': 1, 'red': 1, 'pill': 1})
Counter({'the': 2, 'red': 1, 'pill': 1, 'blue': 1, 'gill': 1})


In [43]:
# Count how often each token occurs across the whole tokenized corpus
counter = Counter()
for document in tqdm(tokenized_train_text):
    counter.update(document)

  0%|          | 0/25000 [00:00<?, ?it/s]

In [44]:
# Number of distinct tokens, and the 10 most frequent tokens with their counts
len(counter), counter.most_common(10)

(121064,
 [('the', 289838),
  (',', 275296),
  ('.', 236702),
  ('and', 156484),
  ('a', 156282),
  ('of', 144056),
  ('to', 133886),
  ('is', 109095),
  ('in', 87676),
  ('I', 77546)])

In [55]:
# Iterating over a Counter yields its keys (the tokens), not the counts.
# Print just the first key to see what a single iteration step gives us.
for token in counter:
    print(token)
    break

8

In [76]:
# How many of the most frequent tokens to keep in the vocabulary
n_words = 10000

In [50]:
# Let's create the actual vocab now.
# We need one special word for 'UNKNOWN': those words that are 'out of vocabulary' for us
# and one for 'PADDING': when a sequence is shorter than the sequence length we decided on
vocab = {'--UNK--': 0, '--PAD--': 1}

# most_common() yields (token, frequency) pairs, most frequent first; only the token is needed.
# setdefault keeps the reserved ids intact should a special word also occur in the corpus.
for token, _freq in counter.most_common(n_words):
    vocab.setdefault(token, len(vocab))

In [86]:
# Total number of ids, i.e. the kept words plus the special tokens.
# Derived from the vocabulary itself so that re-running this cell does not keep adding 2.
n_words = len(vocab)

!! **Good idea to go through the vocabulary, spot the fishy ones and re-adapt your preprocessing to take care of them.**


## 3. Converting tokens to word IDs


In [73]:
# Look the UNKNOWN id up once instead of once per token.
unk_id = vocab['--UNK--']
# Replace every token by its vocabulary id; out-of-vocabulary tokens all map to the UNKNOWN id.
wordid_train_text = [[vocab.get(tok, unk_id) for tok in doc] for doc in tokenized_train_text]
# Bag of words: keep each id at most once per document (order and counts are discarded).
bow_train_text = [list(set(doc)) for doc in wordid_train_text]

In [96]:
# Show the machine's memory usage in human-readable units (shell command)
! free -h

              total        used        free      shared  buff/cache   available
Mem:            15G         11G        1,9G        986M        1,8G        2,5G
Swap:          979M        676M        303M


In [97]:
# Labels as an array, one entry per document
Y = np.asarray(train_labels)

# Multi-hot bag-of-words matrix: one row per document, one column per word id.
# X[i, j] is 1 when word id j occurs in document i, and 0 otherwise.
# NOTE(review): np.zeros defaults to float64, which makes this dense matrix large -- confirm
# the dtype that downstream code expects before switching to a smaller one.
X = np.zeros((len(bow_train_text), n_words))
for i, doc in enumerate(bow_train_text):
    X[i, doc] = 1

X.shape, Y.shape

((25000, 10002), (25000,))

In [107]:
# Show the absolute path of the current working directory (the relative paths below are resolved against it)
Path().absolute()

PosixPath('/home/priyansh/Dev/projects/dl-workinggroup/notebooks')

In [109]:
# Persist the feature matrix and the labels for later use.
resources_dir = Path('../resources')
# Create the target directory if needed so the save does not fail with FileNotFoundError.
resources_dir.mkdir(parents=True, exist_ok=True)

# np.save writes to an already opened file object as-is, so the file names stay exactly
# as given here (no '.npy' suffix is appended).
for filename, array in (('6.1.X.np', X), ('6.1.Y.np', Y)):
    with (resources_dir / filename).open('wb') as f:
        np.save(f, array)
      