In [1]:
!pip install pytorch-nlp
from torchnlp.datasets import imdb_dataset
from tqdm import tqdm
import re
from nltk import tokenize
import nltk
import pickle
from google.colab import drive
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
import numpy as np
################################################################################
drive.mount('/content/drive', force_remount=True)
nltk.download('punkt')
train = imdb_dataset(train=True)
test = imdb_dataset(test=True)
################################################################################
class MyDataset(Dataset):
  def __init__(self, x, y):
    self.data = x
    self.labels = y

  def __len__(self):
    return len(self.labels)
  
  def __getitem__(self, index):
    return self.data[index], self.labels[index]
################################################################################
class Accuracy:
    """A class to keep track of the accuracy while training"""
    def __init__(self):
        self.correct = 0
        self.total = 0
        
    def reset(self):
        """Resets the internal state"""
        self.correct = 0
        self.total = 0
        
    def update(self, outputs, labels):
        """
        Updates the internal state to later compute the overall accuracy
        
        output: the output of the network for a batch
        labels: the target labels
        """
        _, predicted = torch.max(outputs.data, 1) # predicted now contains the predicted class index/label
        
        self.total += labels.size(0)
        self.correct += (predicted == labels).sum().item() # .item() gets the number, not the tensor
        #self.correct += ((outputs.data > 0.5) == labels).sum().item()

    def compute(self):
        return self.correct/self.total
################################################################################
def eval_accu():
  net.eval()
  eval_accuracy = Accuracy()
  eval_accuracy.reset()        
  with torch.no_grad():
      for eval_data in val_loader:
          # get the data points
          eval_inputs, eval_labels = eval_data
          eval_inputs, eval_labels = eval_inputs.to(device).long(), eval_labels.to(device)
          # forward the data through the network
          eval_outputs = net(eval_inputs)
          
          eval_accuracy.update(eval_outputs, eval_labels)
          
  print("\nTesting Accuracy: {:.2f}%".format(100 * eval_accuracy.compute()))
################################################################################

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-nlp
  Downloading pytorch_nlp-0.5.0-py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 9.1 MB/s 
Installing collected packages: pytorch-nlp
Successfully installed pytorch-nlp-0.5.0
Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
aclImdb_v1.tar.gz: 84.1MB [00:09, 8.51MB/s]                            


In [None]:
model = pickle.load(open('/content/drive/MyDrive/NLP/w2v_model_gensim.p','rb'))

In [2]:
model = pickle.load(open('/content/drive/MyDrive/NLP/glove-wiki-gigaword-300-2.p','rb'))

In [None]:
weights = torch.FloatTensor(model.wv.vectors)

  """Entry point for launching an IPython kernel.


In [None]:
embedding = nn.Embedding.from_pretrained(weights)
# embedding.requires_grad = False

In [None]:
torch.stack([embedding(torch.tensor(model.vocab['computer'].index)),embedding(torch.tensor(model.vocab['computer'].index))]).size()

In [None]:
embedding(torch.tensor(model.vocab['computer'].index)) == torch.tensor(model['computer'])

In [None]:
model.key_to_index[',']
#model.vocab['computer']

1

In [None]:
torch.tensor(model.key_to_index['computer'])

tensor(951)

In [3]:
#random.shuffle(train)
vector_size = 300
maxlen = 400
x = torch.zeros((25000,1,maxlen))
y = []
for i in tqdm(range(25000)):
  doc = tokenize.word_tokenize(train[i]['text'])
  fil_doc_w2index = []
  for word in doc:
    try:
      fil_doc_w2index.append(model.key_to_index[word])
    except Exception:
      pass
  if len(fil_doc_w2index)<=maxlen:
    x[i] = torch.cat((torch.LongTensor(fil_doc_w2index),torch.zeros(maxlen-len(fil_doc_w2index)))).unsqueeze(0)
  else:
    x[i] = torch.LongTensor(fil_doc_w2index[-maxlen-1:-1]).unsqueeze(0)
  y.append(torch.tensor(int(train[i]['sentiment']=='pos')))
train_dataset = MyDataset(x,y)


x = torch.zeros((25000,1,maxlen))
y = []
for i in tqdm(range(25000)):
  doc = tokenize.word_tokenize(test[i]['text'])
  fil_doc_w2index = []
  for word in doc:
    try:
      fil_doc_w2index.append(model.key_to_index[word])
    except Exception:
      pass
  if len(fil_doc_w2index)<=maxlen:
    x[i] = torch.cat((torch.LongTensor(fil_doc_w2index),torch.zeros(maxlen-len(fil_doc_w2index)))).unsqueeze(0)
  else:
    x[i] = torch.LongTensor(fil_doc_w2index[-maxlen-1:-1]).unsqueeze(0)
  y.append(torch.tensor(int(test[i]['sentiment']=='pos')))
val_dataset = MyDataset(x,y)

100%|██████████| 25000/25000 [00:47<00:00, 528.75it/s]
100%|██████████| 25000/25000 [00:42<00:00, 584.79it/s]


In [4]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=50, shuffle=True, num_workers=2)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=50, shuffle=True, num_workers=2)

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        weights = torch.FloatTensor(model.wv.vectors)
        self.embedding = nn.Embedding.from_pretrained(weights)
        self.conv1 = nn.Conv2d(in_channels = 1,
                                out_channels = 100,
                                kernel_size = (2,vector_size))
        self.conv2 = nn.Conv2d(in_channels = 1,
                                out_channels = 100,
                                kernel_size = (3,vector_size))
        self.conv3 = nn.Conv2d(in_channels = 1,
                                out_channels = 100,
                                kernel_size = (4,vector_size))
        self.conv4 = nn.Conv2d(in_channels = 1,
                                out_channels = 100,
                                kernel_size = (5,vector_size))
        self.fc1 = nn.Linear(400, 2)
        self.dropout = nn.Dropout(0.30)
        self.dropout1 = nn.Dropout(0.30)
        self.dropout2 = nn.Dropout(0.30)
        self.dropout3 = nn.Dropout(0.30)
        self.dropout4 = nn.Dropout(0.30)
        self.dropout5 = nn.Dropout(0.30)
    def forward(self, x):
        x = self.dropout(self.embedding(x))
        x1 = F.relu(self.conv1(x))
        x1 = self.dropout1(F.max_pool2d(x1, (x1.size()[-2],1)))
        x2 = F.relu(self.conv2(x))
        x2 = self.dropout2(F.max_pool2d(x2, (x2.size()[-2],1)))
        x3 = F.relu(self.conv3(x))
        x3 = self.dropout3(F.max_pool2d(x3, (x3.size()[-2],1)))
        x4 = F.relu(self.conv4(x))
        x4 = self.dropout4(F.max_pool2d(x4, (x4.size()[-2],1)))
        x = torch.flatten(torch.cat((x1,x2,x3,x4),-3),start_dim=-3)
        x = self.fc1(self.dropout5(x))
        return x

net = Net()
net = net.to(device)
criterion = nn.CrossEntropyLoss()

  


In [None]:
ind = torch.stack([torch.reshape(torch.stack([torch.tensor(model.key_to_index['computer'])]*200),(1,200))]*100)
embedding(ind).size()

torch.Size([100, 1, 200, 300])

In [None]:
ind.size()

torch.Size([100, 1, 200])

In [6]:
#optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.5)
optimizer = optim.Adam(net.parameters(), lr=0.0001)
net.embedding.weight.requires_grad = False
net = net.float()
net.train()
live_accuracy = Accuracy()

for epoch in range(100):  # loop over the dataset multiple times
    print("\nStarting epoch {}".format(epoch+1))

    live_accuracy.reset()
    total = 0
    running_loss = 0.0

    # to make a beautiful progress bar
    loader = tqdm(enumerate(train_loader), total=len(train_loader))
    for i, data in loader:
        # get the data points
        inputs, labels = data
        inputs, labels = inputs.to(device).long(), labels.to(device)
        # zero the parameter gradients (else, they are accumulated)
        optimizer.zero_grad()

        # forward the data through the network
        outputs = net(inputs)
        # calculate the loss given the output of the network and the target labels
        loss = criterion(outputs, labels)
        # calculate the gradients of the network w.r.t. its parameters
        loss.backward()
        # Let the optimiser take an optimization step using the calculated gradients
        optimizer.step()
        
        running_loss += loss
        total += outputs.size(0)

        live_accuracy.update(outputs, labels)
        loader.set_description("loss: {:.5f}|acc: {:.2f}%".format(running_loss/total,100 * live_accuracy.compute()))
    eval_accu()
    net.train()

print('Finished Training')


Starting epoch 1


loss: 0.01325|acc: 59.74%: 100%|██████████| 500/500 [00:21<00:00, 23.72it/s]



Testing Accuracy: 81.02%

Starting epoch 2


loss: 0.00980|acc: 77.65%: 100%|██████████| 500/500 [00:15<00:00, 33.24it/s]



Testing Accuracy: 83.52%

Starting epoch 3


loss: 0.00814|acc: 81.81%: 100%|██████████| 500/500 [00:15<00:00, 32.36it/s]



Testing Accuracy: 84.47%

Starting epoch 4


loss: 0.00750|acc: 83.40%: 100%|██████████| 500/500 [00:15<00:00, 31.78it/s]



Testing Accuracy: 85.74%

Starting epoch 5


loss: 0.00710|acc: 84.32%: 100%|██████████| 500/500 [00:15<00:00, 31.91it/s]



Testing Accuracy: 86.20%

Starting epoch 6


loss: 0.00675|acc: 85.41%: 100%|██████████| 500/500 [00:15<00:00, 32.43it/s]



Testing Accuracy: 86.60%

Starting epoch 7


loss: 0.00651|acc: 86.00%: 100%|██████████| 500/500 [00:15<00:00, 32.18it/s]



Testing Accuracy: 86.84%

Starting epoch 8


loss: 0.00630|acc: 86.48%: 100%|██████████| 500/500 [00:15<00:00, 31.56it/s]



Testing Accuracy: 87.10%

Starting epoch 9


loss: 0.00610|acc: 87.09%: 100%|██████████| 500/500 [00:15<00:00, 32.11it/s]



Testing Accuracy: 87.00%

Starting epoch 10


loss: 0.00595|acc: 87.64%: 100%|██████████| 500/500 [00:15<00:00, 32.09it/s]



Testing Accuracy: 87.68%

Starting epoch 11


loss: 0.00570|acc: 88.11%: 100%|██████████| 500/500 [00:15<00:00, 32.31it/s]



Testing Accuracy: 87.88%

Starting epoch 12


loss: 0.00552|acc: 88.46%: 100%|██████████| 500/500 [00:15<00:00, 32.11it/s]



Testing Accuracy: 88.19%

Starting epoch 13


loss: 0.00541|acc: 88.82%: 100%|██████████| 500/500 [00:15<00:00, 32.07it/s]



Testing Accuracy: 88.26%

Starting epoch 14


loss: 0.00522|acc: 89.30%: 100%|██████████| 500/500 [00:15<00:00, 32.04it/s]



Testing Accuracy: 88.43%

Starting epoch 15


loss: 0.00513|acc: 89.52%: 100%|██████████| 500/500 [00:16<00:00, 30.79it/s]



Testing Accuracy: 88.54%

Starting epoch 16


loss: 0.00506|acc: 89.59%: 100%|██████████| 500/500 [00:15<00:00, 31.83it/s]



Testing Accuracy: 87.99%

Starting epoch 17


loss: 0.00481|acc: 90.42%: 100%|██████████| 500/500 [00:15<00:00, 31.81it/s]



Testing Accuracy: 88.80%

Starting epoch 18


loss: 0.00467|acc: 90.44%:  26%|██▋       | 132/500 [00:04<00:11, 32.33it/s]


KeyboardInterrupt: ignored

In [7]:
pickle.dump(net,open('/content/drive/MyDrive/NLP/imdb_sentiment2.p','wb'))

In [None]:
net = pickle.load(open('/content/drive/MyDrive/NLP/imdb_sentiment2.p','rb'))

In [8]:
net.eval()
accuracy = Accuracy()
accuracy.reset()
# Gradients are calculated on the forward pass for every iteration.
# As we do not need gradients now, we can disable the calculation.
with torch.no_grad():
    for data in tqdm(train_loader):
        # get the data points
        inputs, labels = data
        inputs, labels = inputs.to(device).long(), labels.to(device)
        # forward the data through the network
        outputs = net(inputs)
        
        accuracy.update(outputs, labels)

print("Training Accuracy: {:.2f}%".format(100 * accuracy.compute()))

100%|██████████| 500/500 [00:08<00:00, 57.38it/s]

Training Accuracy: 94.01%





In [9]:
net.eval()
accuracy = Accuracy()
accuracy.reset()        
with torch.no_grad():
    for data in tqdm(val_loader):
        # get the data points
        inputs, labels = data
        inputs, labels = inputs.to(device).long(), labels.to(device)
        # forward the data through the network
        outputs = net(inputs)
        
        accuracy.update(outputs, labels)
        
print("\nTesting Accuracy: {:.2f}%".format(100 * accuracy.compute()))

100%|██████████| 500/500 [00:08<00:00, 57.28it/s]


Testing Accuracy: 88.77%



