<a href="https://colab.research.google.com/github/eriksali/DNN_2023_NLP/blob/main/NLP13_15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Class Lecture 13-15 Code Examples

Oakland University, W 23, Prof. Wilson

## Simple NN with PyTorch

In [None]:
import torch
import torch.nn as nn

In [None]:
# Tiny network: two bias-free linear layers (2 -> 2 -> 1) followed by a sigmoid.
model = nn.Sequential(
    nn.Linear(2, 2, bias=False),
    nn.Linear(2, 1, bias=False),
    nn.Sigmoid(),
)

In [None]:
# Replace the random initial weights with fixed values so the numbers printed
# further down can be reproduced by hand. no_grad keeps these in-place writes
# out of the autograd graph.
with torch.no_grad():
    model[0].weight.copy_(torch.tensor([[0.3, 0.1],
                                        [-0.1, 0.2]]))
    model[1].weight.copy_(torch.tensor([[0.1, -0.05]]))

In [None]:
# Binary cross-entropy on the probability produced by the final Sigmoid layer.
loss_fn = nn.BCELoss()
# Plain SGD over every weight of `model`, with a small learning rate.
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [None]:
# Full forward pass on the example input [2, 1]. The input is built inline
# because `x` is only defined in the next cell (using it here raises a
# NameError on a fresh kernel), and the module is called directly instead of
# through `.forward` so that any registered hooks run.
model(torch.Tensor([2, 1]))

In [None]:
# Run the forward pass one layer at a time so every intermediate value can be
# inspected: h = first linear layer, z = second linear layer, o = sigmoid(z).
x = torch.Tensor([2, 1])
hidden_layer, output_layer, squash = model
h = hidden_layer(x)
z = output_layer(h)
o = squash(z)
out = o
for shown in (x, hidden_layer.weight, h, z, o):
    print(shown)

tensor([2., 1.])
Parameter containing:
tensor([[ 0.3000,  0.1000],
        [-0.1000,  0.2000]], requires_grad=True)
tensor([0.7000, 0.0000], grad_fn=<SqueezeBackward3>)
tensor([0.0700], grad_fn=<SqueezeBackward3>)
tensor([0.5175], grad_fn=<SigmoidBackward0>)


In [None]:
# BCE loss of the network output against a target label of 1.0.
loss = loss_fn(out, torch.Tensor([1.0]))
# Bare last expression so the notebook displays the loss tensor.
loss

tensor(0.6588, grad_fn=<BinaryCrossEntropyBackward0>)

In [None]:
# Backpropagate the loss; this fills in the .grad of the layer weights that
# are printed in the next cell.
loss.backward()

In [None]:
# Gradients of the loss with respect to each layer's weight matrix.
print('layer 2 gradient:',model[1].weight.grad)
print('layer 1 gradient:',model[0].weight.grad)

layer 2 gradient: tensor([[-0.3378,  0.0000]])
layer 1 gradient: tensor([[-0.0965, -0.0483],
        [ 0.0483,  0.0241]])


In [None]:
# Apply one SGD update (weight -= lr * grad) and show the updated weights.
optimizer.step()
print('new layer 1:',model[0].weight)
print("new layer 2:",model[1].weight) 

new layer 1: Parameter containing:
tensor([[ 0.3001,  0.1000],
        [-0.1000,  0.2000]], requires_grad=True)
new layer 2: Parameter containing:
tensor([[ 0.1003, -0.0500]], requires_grad=True)


### Scratch space

In [None]:
import numpy as np
def logistic(z, derivative=False):
    """Logistic (sigmoid) function, or its derivative.

    Args:
        z: scalar or numpy array of pre-activation values.
        derivative: when True, return sigma(z) * (1 - sigma(z)) instead of
            sigma(z).

    Returns:
        The value(s) of the logistic function or of its derivative at z.
    """
    sigma = 1 / (1 + np.exp(-z))
    if derivative:
        return sigma * (1 - sigma)
    return sigma
logistic(-.3153,True)

0.2438881376106578

In [None]:
# Scratch check of a hand calculation.
t3 = torch.Tensor([0.6,0])
# softmax over the two entries of t3 is approximately [.6457, .3543]
sm = torch.softmax(t3,dim=0)
# those two values weighted by .1 and -0.05 (the layer-2 weights used earlier);
# evaluates to 0.046855
.6457*.1 + .3543*-0.05
# sigmoid of that weighted sum; only this last expression is displayed
torch.sigmoid(torch.Tensor([.046855]))

In [None]:
def getBack(var_grad_fn):
    """Recursively print the autograd graph that starts at `var_grad_fn`.

    Every visited node is printed. For each entry in `next_functions`, a node
    that exposes a `variable` attribute is reported together with that tensor
    and its current `.grad`; any other node is descended into recursively.

    Args:
        var_grad_fn: a `grad_fn` node, e.g. `loss.grad_fn`.
    """
    print(var_grad_fn)
    for entry in var_grad_fn.next_functions:
        node = entry[0]
        if not node:
            # no upstream function in this slot
            continue
        try:
            leaf = node.variable
            print(node)
            print('Tensor with grad found:', leaf)
            print(' - gradient:', leaf.grad)
            print()
        except AttributeError:
            # no `variable` attribute: an intermediate op, keep walking
            getBack(node)

# Walk the autograd graph behind the loss tensor computed earlier.
getBack(loss.grad_fn)

## Word embeddings

In [None]:
# Make sure an up-to-date gensim is installed in this runtime.
! pip install --upgrade gensim

In [None]:
import gensim
# Display the installed version so the results below can be reproduced.
gensim.__version__

'4.3.0'

In [None]:
# Download (on first use) and load the pretrained 'word2vec-google-news-300'
# embeddings through gensim's downloader; `wv` is the resulting word -> vector
# lookup used by the cells below.

import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [None]:
# Peek at the first ten entries of the vocabulary.
for index, word in enumerate(wv.index_to_key[:10]):
    print(f"word #{index}/{len(wv.index_to_key)} is {word}")

word #0/3000000 is </s>
word #1/3000000 is in
word #2/3000000 is for
word #3/3000000 is that
word #4/3000000 is is
word #5/3000000 is on
word #6/3000000 is ##
word #7/3000000 is The
word #8/3000000 is with
word #9/3000000 is said


In [None]:
# Look up the embedding vector stored for the word 'king' and print it.
vec_king = wv['king']
print(vec_king)

[ 1.25976562e-01  2.97851562e-02  8.60595703e-03  1.39648438e-01
 -2.56347656e-02 -3.61328125e-02  1.11816406e-01 -1.98242188e-01
  5.12695312e-02  3.63281250e-01 -2.42187500e-01 -3.02734375e-01
 -1.77734375e-01 -2.49023438e-02 -1.67968750e-01 -1.69921875e-01
  3.46679688e-02  5.21850586e-03  4.63867188e-02  1.28906250e-01
  1.36718750e-01  1.12792969e-01  5.95703125e-02  1.36718750e-01
  1.01074219e-01 -1.76757812e-01 -2.51953125e-01  5.98144531e-02
  3.41796875e-01 -3.11279297e-02  1.04492188e-01  6.17675781e-02
  1.24511719e-01  4.00390625e-01 -3.22265625e-01  8.39843750e-02
  3.90625000e-02  5.85937500e-03  7.03125000e-02  1.72851562e-01
  1.38671875e-01 -2.31445312e-01  2.83203125e-01  1.42578125e-01
  3.41796875e-01 -2.39257812e-02 -1.09863281e-01  3.32031250e-02
 -5.46875000e-02  1.53198242e-02 -1.62109375e-01  1.58203125e-01
 -2.59765625e-01  2.01416016e-02 -1.63085938e-01  1.35803223e-03
 -1.44531250e-01 -5.68847656e-02  4.29687500e-02 -2.46582031e-02
  1.85546875e-01  4.47265

In [None]:
# Compare 'car' with words that are progressively less related to it.
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
# One row per pair: both words and their similarity score to two decimals.
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


In [None]:
# The 5 vocabulary entries most similar to 'car' and 'minivan' taken together.
print(wv.most_similar(positive=['car', 'minivan'], topn=5))

[('SUV', 0.8532192707061768), ('vehicle', 0.8175783753395081), ('pickup_truck', 0.7763688564300537), ('Jeep', 0.7567334175109863), ('Ford_Explorer', 0.7565720081329346)]


In [None]:
# Ask which word in the list fits least well with the others.
print(wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'car']))

car


In [None]:
# Classic analogy query: king - man + woman -> ?
print(wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=5))

[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581)]


In [None]:
# Similarity between two whole sets of words (each sentence split on spaces).
print(wv.n_similarity( "I was at the store".split(), "You did some shopping".split()))
print(wv.n_similarity( "I was at the store".split(), "She ate an apple".split()))

0.61323637
0.46933332


## Building a Neural Language Model

In [None]:
# If pretrained embeddings are not available, the embedding layer can simply be
# randomly initialised and trained together with the language model:
# embedding_layer = nn.Embedding(vocab_size, emb_dim)

import torch
import torch.nn as nn

class my_LM(torch.nn.Module):
    """Feed-forward neural language model over a fixed-size context window.

    The ids of the `context_size` previous tokens are embedded, the embeddings
    are concatenated, and the result is passed through one sigmoid hidden
    layer and a linear output layer with one unit per vocabulary entry.

    Args:
        vocab_size: number of distinct tokens (size of the output layer).
        emb_dim: dimensionality of each token embedding.
        hidden_size: number of units in the hidden layer.
        context_size: number of previous tokens fed to the model.
        embs: optional 2-D tensor of pretrained embeddings. When given it is
            loaded with `nn.Embedding.from_pretrained` (which freezes the
            weights by default) instead of a randomly initialised layer.
    """

    def __init__(self, vocab_size, emb_dim, hidden_size, context_size=3, embs=None):
        super(my_LM, self).__init__()
        # Compare against None explicitly: the truth value of a tensor with
        # more than one element is ambiguous and raises a RuntimeError.
        if embs is not None:
            self.embedding_layer = nn.Embedding.from_pretrained(embs)
        else:
            self.embedding_layer = nn.Embedding(vocab_size, emb_dim)
        self.linear1 = nn.Linear(emb_dim * context_size, hidden_size)
        self.sigmoid = nn.Sigmoid()
        self.linear2 = nn.Linear(hidden_size, vocab_size)
        # Normalise over the vocabulary (last) axis so that single contexts
        # and batches are both handled; dim=0 would normalise across the
        # batch axis for batched input.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, return_logits=False):
        """Score the next token for one context or a batch of contexts.

        Args:
            x: LongTensor of token ids, shape (context_size,) or
                (batch, context_size).
            return_logits: when True, return the unnormalised scores of the
                output layer instead of probabilities. Losses that apply
                their own softmax (such as nn.CrossEntropyLoss) expect these.

        Returns:
            Tensor of shape (vocab_size,) or (batch, vocab_size); by default
            each row is a probability distribution over the vocabulary.
        """
        # flatten the last two axes, i.e. concatenate the context embeddings
        x = torch.flatten(self.embedding_layer(x), start_dim=-2)
        x = self.linear1(x)
        x = self.sigmoid(x)
        x = self.linear2(x)
        if return_logits:
            return x
        return self.softmax(x)

# quick test: an untrained model applied to a single 3-token context
# (vocabulary of 10000, 300-d embeddings, 100 hidden units)
test_LM = my_LM(10000, 300, 100)
test_LM.forward(torch.LongTensor([1,2,3]))

tensor([1.0642e-04, 1.1192e-04, 1.0251e-04,  ..., 1.2204e-04, 1.5753e-04,
        9.9835e-05], grad_fn=<SoftmaxBackward0>)

In [None]:
# simple tokenization/normalization 

import nltk
nltk.download('punkt')
import string
import re

def tokenize(document, context_size = 3):
    """Turn a document into a flat list of lowercase tokens with padding.

    The document (a string of one or more sentences) is split into sentences
    and then words with nltk. Tokens are lowercased; stopwords, numbers and
    punctuation are all kept. Every sentence is wrapped in `context_size`
    '<s>' markers before it and `context_size` '</s>' markers after it.

    Returns:
        A list with all tokens of the document, in order.
    """
    start_pad = ['<s>'] * context_size
    end_pad = ['</s>'] * context_size
    doc_tokens = []
    for sentence in nltk.sent_tokenize(document):
        # lowercase every word token, skipping empty strings
        lowered = [tok.lower() for tok in nltk.word_tokenize(sentence) if tok]
        doc_tokens.extend(start_pad)
        doc_tokens.extend(lowered)
        doc_tokens.extend(end_pad)
    return doc_tokens

tokenize("This is part of some article. This might be on wikipedia! He said 'wow'!")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['<s>',
 '<s>',
 '<s>',
 'this',
 'is',
 'part',
 'of',
 'some',
 'article',
 '.',
 '</s>',
 '</s>',
 '</s>',
 '<s>',
 '<s>',
 '<s>',
 'this',
 'might',
 'be',
 'on',
 'wikipedia',
 '!',
 '</s>',
 '</s>',
 '</s>',
 '<s>',
 '<s>',
 '<s>',
 'he',
 'said',
 "'wow",
 "'",
 '!',
 '</s>',
 '</s>',
 '</s>']

In [None]:
# load a corpus and make the training examples
!pip install datasets
!pip install apache_beam
from datasets import load_dataset
wiki_data = load_dataset("wikipedia", "20220301.simple")
# bidirectional dictionary to save us from creating 2
! pip install bidict
from bidict import bidict
# each will be context_size tokens followed by the next token
context_size = 3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/11

Downloading builder script:   0%|          | 0.00/35.9k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/30.4k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading and preparing dataset wikipedia/20220301.simple to /root/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559...


Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235M [00:00<?, ?B/s]

Dataset wikipedia downloaded and prepared to /root/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bidict
  Downloading bidict-0.22.1-py3-none-any.whl (35 kB)
Installing collected packages: bidict
Successfully installed bidict-0.22.1


In [None]:
from tqdm import tqdm
# Vocabulary (token <-> id) and training pairs built from the corpus:
# X holds lists of context_size token ids, Y the id of the token that follows.
token2id = bidict()
next_avail_token_id = 0
X = []
Y = []
subset = 5000  # NOTE(review): not used below; the loop slices a hard-coded 1000 articles
for text in tqdm(wiki_data['train'][:1000]['text']):
    tokens = tokenize(text)
    # Slide a window of context_size + 1 tokens over the article: the first
    # context_size tokens are the input, the last one is the word to predict.
    # NOTE(review): the `+ 2` in the bound skips the last two full windows of
    # each article - TODO confirm this is intended.
    for start in range(len(tokens) - (context_size + 2)):
        window = tokens[start: start + context_size + 1]
        # give the next free id to every token that has not been seen before
        for token in window:
            if token not in token2id:
                token2id[token] = next_avail_token_id
                next_avail_token_id += 1
        window_ids = [token2id[t] for t in window]
        X.append(window_ids[:-1])
        Y.append(window_ids[-1])
print(X[0])
print(Y[0])

100%|██████████| 1000/1000 [00:12<00:00, 82.72it/s]

[0, 0, 0]
1





In [None]:
# show what the training data looks like: the first ten examples, mapped
# back from ids to tokens via the inverse side of the bidict
for i in range(10):
    print(' '.join([token2id.inverse[tid] for tid in X[i]] + ['-->'] + [token2id.inverse[Y[i]]]))

<s> <s> <s> --> april
<s> <s> april --> is
<s> april is --> the
april is the --> fourth
is the fourth --> month
the fourth month --> of
fourth month of --> the
month of the --> year
of the year --> in
the year in --> the


In [None]:
# number of distinct tokens that received an id
print(len(token2id))

38088


In [None]:
# Model hyperparameters: one output unit per known token, 300-d embeddings
# and 200 hidden units.
vocab_size = len(token2id) 
emb_dim = 300
hidden_size = 200

# Language model with a randomly initialised embedding layer (no `embs` given).
LM = my_LM(vocab_size, emb_dim, hidden_size, context_size)

In [None]:
# NOTE(review): nn.CrossEntropyLoss expects unnormalised logits, while
# my_LM.forward ends with a softmax - TODO confirm this pairing is intended.
loss_fn = nn.CrossEntropyLoss()
# Plain SGD over all parameters of the language model.
optimizer = torch.optim.SGD(LM.parameters(), lr=0.01)

In [None]:
# train the model, one example at a time
# how many times to pass through the entire training set
import random
epochs = 1
# indices into X / Y, reshuffled at the start of every epoch
training_order = list(range(len(X)))
# train with SGD
for e in range(epochs):
    print("Epoch:",e)
    random.shuffle(training_order)
    for i in tqdm(training_order):
        # forward pass on a single context; the target is a 0-d tensor
        # holding the id of the next token
        output = LM.forward(torch.LongTensor(X[i]))
        target = torch.tensor(Y[i])
        loss = loss_fn(output, target)
        # clear old gradients, backpropagate, then take one optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch: 0


  0%|          | 330/948738 [00:25<20:21:34, 12.94it/s]


KeyboardInterrupt: ignored

In [None]:
# Way too slow!!
# Time to improve this a bit: let's use mini-batches!
# Instead of training on one example at a time, train on B of them:
# the loss (and so the gradient) is computed over the whole batch and the
# weights are updated once at the end of the batch.
# Batches are just tensors, so with a GPU this can be much, much faster.

# Pair every context in X with its target in Y, then serve them in shuffled
# batches of 64.
dataset = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(Y))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True,
                                         pin_memory=True, num_workers=2) 

In [None]:
# let's use a GPU: the first CUDA device when one is available, else the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('using',device)

using cuda:0


In [None]:
# train the model

# fresh model, moved to the selected device
LM = my_LM(vocab_size, emb_dim, hidden_size, context_size)
LM.to(device)
# NOTE(review): nn.CrossEntropyLoss expects unnormalised logits, while
# my_LM.forward ends with a softmax - TODO feed raw logits to the loss.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(LM.parameters(), lr=0.1)
# how many times to pass through the entire training set
epochs = 50
# size requested from the DataLoader; the actual size of each batch is read
# from the tensors below because the last batch of an epoch can be smaller
batch_size = 64
# train with minibatch GD
for e in range(epochs):
    print("Epoch:",e)
    # here x and y will be (b x context_size) and (b,) tensors
    # b is the batch size
    total_epoch_loss = 0.0
    total_items = 0
    for x,y in tqdm(dataloader):
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        # output will be (b x vocab_size)
        output = LM(x)
        loss = loss_fn(output, y)
        loss.backward()
        optimizer.step()
        # loss is already the MEAN over the batch, so weight it by the number
        # of items to obtain a true per-example average for the epoch
        n_items = y.size(0)
        total_epoch_loss += loss.item() * n_items
        total_items += n_items
    print('loss:',total_epoch_loss/total_items)

Epoch: 0


100%|██████████| 14825/14825 [01:22<00:00, 180.33it/s]


loss: 0.16438287356953193
Epoch: 1


100%|██████████| 14825/14825 [01:22<00:00, 180.38it/s]


loss: 0.16422512037657566
Epoch: 2


100%|██████████| 14825/14825 [01:20<00:00, 184.06it/s]


loss: 0.16408362952794434
Epoch: 3


100%|██████████| 14825/14825 [01:21<00:00, 180.88it/s]


loss: 0.1639801053403803
Epoch: 4


100%|██████████| 14825/14825 [01:21<00:00, 182.40it/s]


loss: 0.16388805079620966
Epoch: 5


100%|██████████| 14825/14825 [01:21<00:00, 181.07it/s]


loss: 0.1638145394520253
Epoch: 6


100%|██████████| 14825/14825 [01:20<00:00, 184.66it/s]


loss: 0.1637465245948875
Epoch: 7


100%|██████████| 14825/14825 [01:22<00:00, 180.16it/s]


loss: 0.16367896585902716
Epoch: 8


100%|██████████| 14825/14825 [01:21<00:00, 182.67it/s]


loss: 0.16362880201001995
Epoch: 9


100%|██████████| 14825/14825 [01:21<00:00, 181.73it/s]


loss: 0.1635847229629906
Epoch: 10


100%|██████████| 14825/14825 [01:21<00:00, 180.82it/s]


loss: 0.16354874293176963
Epoch: 11


100%|██████████| 14825/14825 [01:23<00:00, 178.60it/s]


loss: 0.16351305979299305
Epoch: 12


100%|██████████| 14825/14825 [01:21<00:00, 182.77it/s]


loss: 0.16347752703382032
Epoch: 13


100%|██████████| 14825/14825 [01:22<00:00, 178.65it/s]


loss: 0.16344152895334801
Epoch: 14


100%|██████████| 14825/14825 [01:22<00:00, 180.55it/s]


loss: 0.16340975826948528
Epoch: 15


100%|██████████| 14825/14825 [01:22<00:00, 178.92it/s]


loss: 0.1633772334360956
Epoch: 16


100%|██████████| 14825/14825 [01:21<00:00, 181.43it/s]


loss: 0.16334449943764415
Epoch: 17


100%|██████████| 14825/14825 [01:21<00:00, 181.27it/s]


loss: 0.16331685352888317
Epoch: 18


100%|██████████| 14825/14825 [01:20<00:00, 184.04it/s]


loss: 0.16328809170960173
Epoch: 19


100%|██████████| 14825/14825 [01:22<00:00, 179.93it/s]


loss: 0.1632651032997424
Epoch: 20


100%|██████████| 14825/14825 [01:20<00:00, 184.29it/s]


loss: 0.16324048440331754
Epoch: 21


100%|██████████| 14825/14825 [01:21<00:00, 181.95it/s]


loss: 0.16321821248852986
Epoch: 22


100%|██████████| 14825/14825 [01:20<00:00, 184.54it/s]


loss: 0.16319604603978877
Epoch: 23


100%|██████████| 14825/14825 [01:20<00:00, 183.59it/s]


loss: 0.16317514239435052
Epoch: 24


100%|██████████| 14825/14825 [01:20<00:00, 184.24it/s]


loss: 0.1631574596542532
Epoch: 25


100%|██████████| 14825/14825 [01:20<00:00, 183.12it/s]


loss: 0.16313594387514394
Epoch: 26


100%|██████████| 14825/14825 [01:20<00:00, 183.80it/s]


loss: 0.16311909845466357
Epoch: 27


100%|██████████| 14825/14825 [01:20<00:00, 183.19it/s]


loss: 0.16310012255209494
Epoch: 28


100%|██████████| 14825/14825 [01:20<00:00, 184.66it/s]


loss: 0.16307965326470025
Epoch: 29


100%|██████████| 14825/14825 [01:23<00:00, 177.55it/s]


loss: 0.1630621127063066
Epoch: 30


100%|██████████| 14825/14825 [01:22<00:00, 180.52it/s]


loss: 0.1630456108226744
Epoch: 31


100%|██████████| 14825/14825 [01:22<00:00, 179.02it/s]


loss: 0.1630241171002991
Epoch: 32


100%|██████████| 14825/14825 [01:21<00:00, 181.08it/s]


loss: 0.1630092257170814
Epoch: 33


100%|██████████| 14825/14825 [01:22<00:00, 178.96it/s]


loss: 0.16299211635657948
Epoch: 34


100%|██████████| 14825/14825 [01:21<00:00, 181.42it/s]


loss: 0.16298006628958814
Epoch: 35


100%|██████████| 14825/14825 [01:22<00:00, 178.66it/s]


loss: 0.1629598034312713
Epoch: 36


100%|██████████| 14825/14825 [01:21<00:00, 181.55it/s]


loss: 0.1629494664801715
Epoch: 37


100%|██████████| 14825/14825 [01:21<00:00, 182.61it/s]


loss: 0.16293057858642399
Epoch: 38


100%|██████████| 14825/14825 [01:21<00:00, 181.71it/s]


loss: 0.16291683950975008
Epoch: 39


100%|██████████| 14825/14825 [01:22<00:00, 179.79it/s]


loss: 0.1629022315672236
Epoch: 40


100%|██████████| 14825/14825 [01:20<00:00, 183.62it/s]


loss: 0.16288282545782864
Epoch: 41


100%|██████████| 14825/14825 [01:20<00:00, 183.60it/s]


loss: 0.16286687061718302
Epoch: 42


100%|██████████| 14825/14825 [01:19<00:00, 187.38it/s]


loss: 0.16285131148704993
Epoch: 43


100%|██████████| 14825/14825 [01:20<00:00, 183.66it/s]


loss: 0.16283786772274447
Epoch: 44


100%|██████████| 14825/14825 [01:21<00:00, 181.25it/s]


loss: 0.16282341225553323
Epoch: 45


100%|██████████| 14825/14825 [01:20<00:00, 183.81it/s]


loss: 0.1628077636732018
Epoch: 46


100%|██████████| 14825/14825 [01:20<00:00, 184.69it/s]


loss: 0.16279435192955685
Epoch: 47


100%|██████████| 14825/14825 [01:19<00:00, 185.84it/s]


loss: 0.16278039678353282
Epoch: 48


100%|██████████| 14825/14825 [01:21<00:00, 181.51it/s]


loss: 0.1627701145681925
Epoch: 49


100%|██████████| 14825/14825 [01:20<00:00, 184.53it/s]

loss: 0.16275526301454735





In [None]:
# that's better! now let's write a function to predict the next word
# given 3 words before

def get_next_word(LM, input_tokens, token2id, greedy=False):
    """Predict the token that follows `input_tokens`.

    Args:
        LM: language model returning a probability for every vocabulary id.
        input_tokens: list of context tokens (as many as the model expects).
        token2id: bidirectional mapping; `token2id[token]` gives the id and
            `token2id.inverse[id]` gives the token.
        greedy: when True pick the most probable token, otherwise sample one
            token according to the predicted probabilities.

    Returns:
        The predicted next token (str).

    Raises:
        KeyError: if a token in `input_tokens` is not in `token2id`.
    """
    # Run on whatever device the model itself lives on instead of relying on
    # a module-level `device` variable; fall back to the CPU for a model
    # without parameters.
    first_param = next(LM.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')
    with torch.no_grad():
        x = torch.tensor([token2id[t] for t in input_tokens]).to(model_device)
        probs = LM(x)
        if greedy:
            y = torch.argmax(probs)
        else:
            y = torch.multinomial(probs, 1)
        y_int = y.to('cpu').item()
        return token2id.inverse[y_int]

# greedy prediction for a context that occurs in the training data
print(get_next_word(LM, ['april','is','the'], token2id, greedy=True))
# sampled prediction for the sentence-start context
print(get_next_word(LM, ['<s>','<s>','<s>'], token2id))

<s>
9-0


In [None]:
def generate_sentence(LM, token2id, greedy, max_len=50, context_size=3):
    """Generate one sentence word by word and print it.

    Starting from a context of `context_size` '<s>' markers, the next word is
    predicted repeatedly with `get_next_word`, and the context window is slid
    forward by one word each time.

    Args:
        LM: trained language model passed through to `get_next_word`.
        token2id: bidirectional token <-> id mapping.
        greedy: True to always take the most probable word, False to sample.
        max_len: maximum number of words to generate.
        context_size: number of previous words the model conditions on.
    """
    prev_words = ['<s>'] * context_size
    out = []
    for _ in range(max_len):
        next_word = get_next_word(LM, prev_words, token2id, greedy)
        if next_word == "</s>":
            # end-of-sentence marker: stop and leave it out of the output
            break
        # Record the word only after it has been generated, so the output has
        # no leading empty token and the final word is kept when max_len is hit.
        out.append(next_word)
        prev_words = prev_words[1:] + [next_word]
    print(" ".join(out))

# Generate one sentence, always taking the most probable next word.
generate_sentence(LM, token2id, greedy=True)

 it is a <s> of the biography of the .
