<a href="https://colab.research.google.com/github/comchem/Pytorch_DeepLearning_Tutorials/blob/master/Sec7-NLP_ANN/2-Text_Classification_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torchtext.legacy.data as ttd
from torchtext.vocab import GloVe
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
!git clone https://github.com/comchem/Pytorch_DeepLearning_Tutorials.git

Cloning into 'Pytorch_DeepLearning_Tutorials'...
remote: Enumerating objects: 173, done.[K
remote: Counting objects: 100% (173/173), done.[K
remote: Compressing objects: 100% (169/169), done.[K
remote: Total 173 (delta 76), reused 3 (delta 0), pack-reused 0[K
Receiving objects: 100% (173/173), 2.66 MiB | 11.05 MiB/s, done.
Resolving deltas: 100% (76/76), done.


In [3]:
path = './Pytorch_DeepLearning_Tutorials/data/spam.csv'
df = pd.read_csv(path, encoding='ISO-8859-1')

In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# drop unnecessary columns
df = df.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)

In [7]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# rename columns to something better
df.columns = ['labels', 'data']

In [9]:
df.head()

Unnamed: 0,labels,data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# create binary labels
df['b_labels'] = df['labels'].map({'ham':0, 'spam':1})

In [11]:
df2 = df[['data', 'b_labels']]

In [12]:
df2.head()

Unnamed: 0,data,b_labels
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [13]:
df2.to_csv('spam2.csv', index=False)

In [14]:
TEXT = ttd.Field(
    sequential = True,
    batch_first = True,
    lower = True,
    # tokenize = 'spacy',
    pad_first = True
)
LABEL = ttd.Field(sequential = False, use_vocab = False, is_target = True)

# Note: if you don't specify use_vocab = False, then Pytorch will complain later when you try 
# to iterate over the dataset that the attribute 'vocab' doesn't exist.

# Note 2: if you don't specify is_target = True, then Pytorch will assume it's part of the input,
# so when you iterate over the dataset it will be like:
# for (inputs, targets), _ in iterator:
# where the 2nd element (_) should have been the target.

dataset = ttd.TabularDataset(
    path = 'spam2.csv',
    format = 'csv',
    skip_header = True,
    fields = [('data', TEXT), ('label', LABEL)]
)

In [16]:

train_dataset, test_dataset = dataset.split()  # default is 0.7 

In [17]:
TEXT.build_vocab(train_dataset)

In [18]:
vocab = TEXT.vocab
type(vocab)

torchtext.vocab.Vocab

In [19]:
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f5a9c3ea410>>,
            {'<unk>': 0,
             '<pad>': 1,
             'i': 2,
             'to': 3,
             'you': 4,
             'a': 5,
             'the': 6,
             'u': 7,
             'and': 8,
             'is': 9,
             'in': 10,
             'my': 11,
             'for': 12,
             'your': 13,
             'me': 14,
             'of': 15,
             'have': 16,
             'call': 17,
             'on': 18,
             'are': 19,
             'that': 20,
             'it': 21,
             '2': 22,
             'so': 23,
             'but': 24,
             'or': 25,
             'not': 26,
             'at': 27,
             'ur': 28,
             'do': 29,
             "i'm": 30,
             'if': 31,
             'be': 32,
             'get': 33,
             'can': 34,
             'will': 35,
             'just': 36,
             'with': 37,
     

In [20]:
vocab.itos

['<unk>',
 '<pad>',
 'i',
 'to',
 'you',
 'a',
 'the',
 'u',
 'and',
 'is',
 'in',
 'my',
 'for',
 'your',
 'me',
 'of',
 'have',
 'call',
 'on',
 'are',
 'that',
 'it',
 '2',
 'so',
 'but',
 'or',
 'not',
 'at',
 'ur',
 'do',
 "i'm",
 'if',
 'be',
 'get',
 'can',
 'will',
 'just',
 'with',
 'we',
 'this',
 'when',
 'all',
 'from',
 'up',
 'no',
 'what',
 'go',
 '&lt;#&gt;',
 '4',
 'got',
 'how',
 'now',
 'was',
 'like',
 'know',
 'am',
 'free',
 'out',
 'then',
 'come',
 'only',
 'its',
 'good',
 'he',
 'want',
 'send',
 '?',
 'as',
 'text',
 '.',
 'going',
 "i'll",
 'love',
 'time',
 'about',
 'by',
 'n',
 'one',
 'still',
 'see',
 'txt',
 'need',
 '...',
 'ok',
 'r',
 'tell',
 "don't",
 'our',
 'they',
 'new',
 'she',
 'there',
 'dont',
 'any',
 'been',
 'mobile',
 'day',
 'did',
 'please',
 'back',
 'some',
 'think',
 'an',
 'home',
 'has',
 'stop',
 'hi',
 'take',
 'hope',
 'reply',
 'claim',
 'where',
 'had',
 'her',
 'much',
 'happy',
 'more',
 'pls',
 'ì_',
 'give',
 "it's",
 '

In [21]:
# Set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [22]:
train_iter, test_iter = ttd.Iterator.splits(
    (train_dataset, test_dataset), sort_key = lambda x: len(x.data),
    batch_sizes = (2,2), device = device
)

In [23]:
for inputs, targets in train_iter:
  print("inputs:", inputs, "shape:", inputs.shape)
  print("targets:", targets, "shape:", targets.shape)
  break

inputs: tensor([[    1,  1568,   481,    13,    17,     8,    11,  8241,  5441,  2416,
           781,    41,   226, 10017,    45,     9,    13,  5070],
        [ 1701,    88,    19,    70,     3,     6,  1904,  1232,    27,   790,
            23,   533,    59,   136,    25,    91,  5040,  1063]],
       device='cuda:0') shape: torch.Size([2, 18])
targets: tensor([0, 0], device='cuda:0') shape: torch.Size([2])


In [24]:
for inputs, targets in test_iter:
  print("inputs:", inputs, "shape:", inputs.shape)
  print("targets:", targets, "shape:", targets.shape)
  break

inputs: tensor([[929],
        [ 83]], device='cuda:0') shape: torch.Size([2, 1])
targets: tensor([0, 0], device='cuda:0') shape: torch.Size([2])


In [25]:
len(vocab)

10838

In [27]:
### Define the model
class RNN(nn.Module):
    def __init__(self, n_vocab, embed_dim, n_hidden, n_rnnlayers, n_outputs):
        super(RNN, self).__init__()
        self.V = n_vocab
        self.D = embed_dim    # Pretrained vectors
        self.M = n_hidden
        self.K = n_outputs
        self.L = n_rnnlayers
        
        self.embed = nn.Embedding(self.V, self.D)
        self.rnn = nn.LSTM(
          input_size = self.D,
          hidden_size = self.M,
          num_layers = self.L,
          batch_first = True)
        self.fc = nn.Linear(self.M, self.K)
    
    def forward(self, X):
        # initial hidden states
        h0 = torch.zeros(self.L, X.size(0), self.M).to(device)
        c0 = torch.zeros(self.L, X.size(0), self.M).to(device)
        
        # embedding layer --> N x T --> N x T x D
        # turns word indexes into word vectors
        out = self.embed(X)

        # get RNN unit output --> N x T x D --> N x T x M
        out, _ = self.rnn(out, (h0, c0))

        # max pool --> N x T x M --> N x M
        out, _ = torch.max(out, 1)
        
        # we only want h(T) at the final time step
        # N x M --> N x K
        out = self.fc(out)
        return out

In [28]:
# Instantiate the model
model = RNN(n_vocab = len(vocab), embed_dim = 20, n_hidden=15, n_rnnlayers = 1, n_outputs = 1)
model.to(device)

RNN(
  (embed): Embedding(10838, 20)
  (rnn): LSTM(20, 15, batch_first=True)
  (fc): Linear(in_features=15, out_features=1, bias=True)
)