In [1]:
!wget "https://raw.githubusercontent.com/aladdinpersson/Machine-Learning-Collection/master/ML/Pytorch/more_advanced/torchtext/mydata/train.csv"
!wget "https://raw.githubusercontent.com/aladdinpersson/Machine-Learning-Collection/master/ML/Pytorch/more_advanced/torchtext/mydata/test.csv"

--2021-06-24 00:56:20--  https://raw.githubusercontent.com/aladdinpersson/Machine-Learning-Collection/master/ML/Pytorch/more_advanced/torchtext/mydata/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 239 [text/plain]
Saving to: ‘train.csv’


2021-06-24 00:56:21 (11.7 MB/s) - ‘train.csv’ saved [239/239]

--2021-06-24 00:56:21--  https://raw.githubusercontent.com/aladdinpersson/Machine-Learning-Collection/master/ML/Pytorch/more_advanced/torchtext/mydata/test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 

Taken from : wonderful tutorial by Alladin perrson: https://www.youtube.com/watch?v=KRgq4VnCr7I&list=PLhhyoLH6IjfxeoooqP9rhU3HJIAVAJ3Vz&index=34

In [2]:
import torch
import torch.optim as optim
import torch.nn as nn
import spacy
from torchtext.data import Field, TabularDataset, BucketIterator

In [3]:
# steps

# using preprocessing on the data > field
# load dataset -> TabularDataset
# iterator for batching and padding > BucketIterator

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

spacy_en = spacy.load("en")

In [4]:
def tokenize(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
quote = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)
score = Field(sequential=False, use_vocab=False)

fields = {"quote": ("q", quote),"score": ("s",score)}



In [6]:
train_data, test_data = TabularDataset.splits(path="./", train="train.csv", test="test.csv", format="csv", fields=fields)



In [7]:
for batch in train_data:
    print(batch.q)
    print(batch.s)

['you', 'must', 'own', 'everything', 'in', 'your', 'world', '.', 'there', 'is', 'no', 'one', 'else', 'to', 'blame', '.']
1
['do', 'not', 'pray', 'for', 'an', 'easy', 'life', ',', 'pray', 'for', 'the', 'strength', 'to', 'endure', 'a', 'difficult', 'one', '.']
1
['stand', 'tall', ',', 'and', 'rice', 'like', 'a', 'potato', '!']
0


In [8]:
quote.build_vocab(train_data, max_size=10000, min_freq=1, vectors="glove.6B.100d")

train_iterator, test_iterator = BucketIterator.splits((train_data, test_data), batch_size=2, device=device)

.vector_cache/glove.6B.zip: 862MB [02:42, 5.29MB/s]                           
100%|█████████▉| 399999/400000 [00:22<00:00, 18027.90it/s]


In [9]:
for batch in train_iterator:
    print(batch.q)
    print(batch.s)
    break

tensor([[14, 29],
        [25, 31],
        [ 7,  3],
        [ 5, 11],
        [10, 28],
        [15, 22],
        [21,  4],
        [ 3, 27],
        [ 7,  9],
        [ 5,  1],
        [32,  1],
        [30,  1],
        [ 8,  1],
        [17,  1],
        [ 4,  1],
        [13,  1],
        [ 6,  1],
        [ 2,  1]])
tensor([1, 0])




In [10]:
class simpleLSTM(nn.Module):
    def __init__(self,input_size,embed_size, hidden_size, num_layers):
        super(simpleLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(input_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
        self.fc1 = nn.Linear(hidden_size,1)
        
    def forward(self, x):
        
        h0 = torch.zeros(self.num_layers, x.size(1), self.hidden_size).to(device=device)
        c0 = torch.zeros(self.num_layers, x.size(1), self.hidden_size).to(device=device)
        
        out = self.embedding(x)
        out, _ = self.lstm(out, (h0, c0))
        out = self.fc1(out[-1,:,:])
        
        return out

In [11]:
num_epochs = 5
learning_rate = 0.005
input_size = len(quote.vocab)
hidden_size = 256
num_layers = 2
embed_size = 100

In [12]:
model = simpleLSTM(input_size,embed_size,hidden_size,num_layers).to(device=device)

In [13]:
pretrained_embedding = quote.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embedding)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 0.4918,  1.1164,  1.1424,  ..., -0.5088,  0.6256,  0.4392],
        [-0.4989,  0.7660,  0.8975,  ..., -0.4118,  0.4054,  0.7850],
        [-0.5718,  0.0463,  0.8673,  ..., -0.3566,  0.9293,  0.8995]])

In [14]:
optimizer = optim.Adam(model.parameters(),lr=learning_rate)
loss_criterion = nn.BCEWithLogitsLoss()

In [15]:
for epoch in range(num_epochs):
    
    for batch_idx, batch in enumerate(train_iterator):
        
        data = batch.q.to(device=device)
        target = batch.s.to(device=device)
        
        preds = model(data)
        loss = loss_criterion(preds.squeeze(1), target.type_as(preds))
        
        optimizer.zero_grad()
        
        loss.backward()
        
        optimizer.step()
        
    print(f"For epoch: {epoch}, loss : {loss}")

For epoch: 0, loss : 1.2036941051483154
For epoch: 1, loss : 0.9098322987556458
For epoch: 2, loss : 0.6328436732292175
For epoch: 3, loss : 0.5629074573516846
For epoch: 4, loss : 0.32545188069343567


In [16]:
for batch in train_iterator:
    print(f"For input {batch.q}")
    preds = model(batch.q.to(device=device))
    
    for p in preds.squeeze(0):
        print(quote.vocab[p])
        
    print(f"Output {preds}")

For input tensor([[29],
        [31],
        [ 3],
        [11],
        [28],
        [22],
        [ 4],
        [27],
        [ 9]])
0
Output tensor([[-0.1909]], grad_fn=<AddmmBackward>)
For input tensor([[14, 35],
        [25, 23],
        [ 7, 26],
        [ 5, 18],
        [10, 19],
        [15, 36],
        [21, 34],
        [ 3,  2],
        [ 7, 33],
        [ 5, 20],
        [32, 24],
        [30,  6],
        [ 8, 16],
        [17,  8],
        [ 4, 12],
        [13,  2],
        [ 6,  1],
        [ 2,  1]])
0
0
Output tensor([[2.0407],
        [1.5301]], grad_fn=<AddmmBackward>)
