<a href="https://colab.research.google.com/github/gmihaila/machine_learning_things/blob/master/learning_pytorch/pytorch_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### SImple NN

1 hiddent layer NN

#### Initialize NN

In [2]:
import torch

n_input, n_hidden, n_output = 5, 3, 1

## initialize tensor for inputs, and outputs 
x = torch.randn((1, n_input))
y = torch.rand((1,n_output)) 


print(x.size())
print(y.size())
print()

## initialize tensor variables for weights 
w1 = torch.rand((n_input, n_hidden))
w2 = torch.rand((n_hidden, n_output))

print(w1.size())
print(w2.size())
print()

## initialize tensor variables for bias terms 
b1 = torch.rand((1,n_hidden))
b2 = torch.rand((1,n_output))

print(b1.size())
print(b2.size())
print()



torch.Size([1, 5])
torch.Size([1, 1])

torch.Size([5, 3])
torch.Size([3, 1])

torch.Size([1, 3])
torch.Size([1, 1])



#### Forward Pass

1. Forward Propagation
2. Loss computation
3. Backpropagation
4. Updating the parameters

In [3]:
## sigmoid activation function using pytorch
def sigmoid_activation(z):
  return 1 / (1 + torch.exp(-z))

## activation of hidden layer 
z1 = torch.mm(X,w1) + b1
a1 = sigmoid_activation(z1)

print(z1)
print(a1)
print()

## activation (output) of final layer 
z2 = torch.mm(a1, w2) + b2
a2 = output = sigmoid_activation(z2)

print(z2)
print(output)
print()

loss = y - output

print(loss)

tensor([[ 0.2044, -0.7169, -0.5639]])
tensor([[0.5509, 0.3281, 0.3626]])

tensor([[1.1906]])
tensor([[0.7669]])

tensor([[0.0322]])


#### Backprop

* loss gets multiplied by weights to penalize more of the bad weights
* some weights contirbute more to the output. If the error is large, their loss will be more

In [4]:
## function to calculate the derivative of activation
def sigmoid_delta(x):
  return x * (1 - x)

## compute derivative of error terms
delta_output = sigmoid_delta(output)
delta_hidden = sigmoid_delta(a1)

print(delta_output)
print(delta_hidden)
print()


## backpass the changes to previous layers 
d_output = loss * delta_output
loss_h = torch.mm(d_output, w2.t())
d_hidden = loss_h * delta_hidden

print(d_output)
print(loss_h)
print(d_hidden)

tensor([[0.1788]])
tensor([[0.2474, 0.2204, 0.2311]])

tensor([[0.0058]])
tensor([[0.0041, 0.0046, 0.0009]])
tensor([[0.0010, 0.0010, 0.0002]])


#### Update Parameters

In [5]:
learning_rate = 0.1

w2 += torch.mm(a1.t(), d_output) * learning_rate
w1 += torch.mm(X.t(), d_hidden) * learning_rate


print(w2)
print(w1)
print()

b1 += d_output.sum() * learning_rate
b2 += d_hidden.sum() * learning_rate

print(b1)
print(b2)

tensor([[0.7070],
        [0.7974],
        [0.1501]])
tensor([[0.4808, 0.0862, 0.6435],
        [0.8699, 0.2561, 0.4573],
        [0.7798, 0.8161, 0.8716],
        [0.0207, 0.8421, 0.4006],
        [0.4842, 0.7205, 0.5377]])

tensor([[0.1041, 0.7739, 0.8514]])
tensor([[0.4856]])


### MNIST Data Loader

In [6]:
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np

# transform the raw dataset into tensors and normalize them in a fixed range
_tasks = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,), (0.5,))]) #pass mean 0.5 and std 0.5

## Load MNIST Dataset and apply transformations
mnist = MNIST("data", download=True, train=True, transform=_tasks)

## create training and validation split 
split = int(0.8 * len(mnist))
index_list = list(range(len(mnist)))
train_idx, valid_idx = index_list[:split], index_list[split:]

## create sampler objects using SubsetRandomSampler
tr_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(valid_idx)

## create iterator objects for train and valid datasets
trainloader = DataLoader(mnist, batch_size=256, sampler=tr_sampler)
validloader = DataLoader(mnist, batch_size=256, sampler=val_sampler)


for data, label in trainloader:
  print(np.shape(data))
  # Flatten MNIST images into a 784 long vector
  # data = data.view(data.shape[0], -1)
  # print(data.shape)

  data = torch.flatten(data, start_dim=1)
  print(data.shape)
  break

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw
Processing...
Done!
torch.Size([256, 1, 28, 28])
torch.Size([256, 784])


### MNIST - NN

In [7]:
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np

# transform the raw dataset into tensors and normalize them in a fixed range
_tasks = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,), (0.5,))])

## Load MNIST Dataset and apply transformations
mnist = MNIST("data", download=True, train=True, transform=_tasks)

## create training and validation split 
split = int(0.8 * len(mnist))
index_list = list(range(len(mnist)))
train_idx, valid_idx = index_list[:split], index_list[split:]

## create sampler objects using SubsetRandomSampler
tr_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(valid_idx)

## create iterator objects for train and valid datasets
trainloader = DataLoader(mnist, batch_size=256, sampler=tr_sampler)
validloader = DataLoader(mnist, batch_size=256, sampler=val_sampler)

## Build class of model
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(784, 128)
        self.output = nn.Linear(128, 10)

    def forward(self, x):
        x = self.hidden(x)
        x = torch.sigmoid(x)
        x = self.output(x)
        return x

model = Model()

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-6, momentum=0.9, nesterov=True)

for epoch in range(1,11):

  train_loss, valid_loss = [], []
  model.train() # activates training mod

  ## Training on 1 epoch
  for data, target in trainloader:

    data = torch.flatten(data, start_dim=1)

    optimizer.zero_grad() #clears gradients of all optimized classes

    ## forward pass
    output = model(data)

    ## loss calc
    loss = loss_function(output, target)

    ## backeard propagation
    loss.backward()

    ## weight optimization
    optimizer.step() #performs a single optimization step
    train_loss.append(loss.item())

  ### Evaluation on 1 epoch
  for data, target in validloader:

    data = torch.flatten(data, start_dim=1)

    output = model(data)
    loss = loss_function(output, target)
    valid_loss.append(loss.item())
  
  print("Epoch:", epoch, "Training Loss: ", np.mean(train_loss), "Valid Loss: ", np.mean(valid_loss))

Epoch: 1 Training Loss:  1.4182602132254458 Valid Loss:  0.699055218950231
Epoch: 2 Training Loss:  0.57324547938844 Valid Loss:  0.44476315315733567
Epoch: 3 Training Loss:  0.4311267040828441 Valid Loss:  0.371635469350409
Epoch: 4 Training Loss:  0.3762281567967953 Valid Loss:  0.33606003510191085
Epoch: 5 Training Loss:  0.3455755773218388 Valid Loss:  0.31383199799568096
Epoch: 6 Training Loss:  0.32484242605402114 Valid Loss:  0.29730967574931205
Epoch: 7 Training Loss:  0.3086457291340574 Valid Loss:  0.28762583656513946
Epoch: 8 Training Loss:  0.29576287418603897 Valid Loss:  0.2751726790311489
Epoch: 9 Training Loss:  0.2848933809139627 Valid Loss:  0.26707785687548047
Epoch: 10 Training Loss:  0.2744598998985392 Valid Loss:  0.2603118803272856


#### Evaluation

In [8]:
## dataloader for validation dataset 
dataiter = iter(validloader)
data, labels = dataiter.next()
data = torch.flatten(data, start_dim=1)
output = model(data)

print(output.shape)
print(output[0])

_, pred_tensor = torch.max(output, 1)

print(pred_tensor.shape)
print(pred_tensor[0])

preds = np.squeeze(pred_tensor.numpy())

print("Actual: ", labels[:10])
print("Predic: ", preds[:10])

torch.Size([256, 10])
tensor([-1.2229, -2.1489,  0.5885,  1.7478, -1.7719, -0.4553, -5.7354,  8.9517,
        -1.6335,  2.9513], grad_fn=<SelectBackward>)
torch.Size([256])
tensor(7)
Actual:  tensor([7, 1, 1, 3, 1, 8, 0, 0, 2, 7])
Predic:  [7 1 1 3 1 3 0 0 2 7]


### MNIST - NN [1 GPU]

In [9]:
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
from torch.backends import cudnn

cudnn.benchmark = True

# transform the raw dataset into tensors and normalize them in a fixed range
_tasks = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,), (0.5,))])

## Load MNIST Dataset and apply transformations
mnist = MNIST("data", download=True, train=True, transform=_tasks)

## create training and validation split 
split = int(0.8 * len(mnist))
index_list = list(range(len(mnist)))
train_idx, valid_idx = index_list[:split], index_list[split:]

## create sampler objects using SubsetRandomSampler
tr_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(valid_idx)

## create iterator objects for train and valid datasets
trainloader = DataLoader(mnist, batch_size=256, sampler=tr_sampler, num_workers=2)
validloader = DataLoader(mnist, batch_size=256, sampler=val_sampler, num_workers=2)

## GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Build class of model
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(784, 128)
        self.output = nn.Linear(128, 10)

    def forward(self, x):
        x = self.hidden(x)
        x = torch.sigmoid(x)
        x = self.output(x)
        return x

model = Model()

model.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-6, momentum=0.9, nesterov=True)

for epoch in range(1,11):

  train_loss, valid_loss = [], []
  model.train() # activates training mod

  ## Training on 1 epoch
  for data, target in trainloader:

    data = torch.flatten(data.to(device), start_dim=1)

    optimizer.zero_grad() #clears gradients of all optimized classes

    ## forward pass
    output = model(data.to(device))

    ## loss calc
    loss = loss_function(output.to(device), target.to(device))

    ## backeard propagation
    loss.backward()

    ## weight optimization
    optimizer.step() #performs a single optimization step
    train_loss.append(loss.item())

  ### Evaluation on 1 epoch
  for data, target in validloader:

    data = torch.flatten(data, start_dim=1)

    output = model(data.to(device))
    loss = loss_function(output.to(device), target.to(device))
    valid_loss.append(loss.item())
  
  print("Epoch:", epoch, "Training Loss: ", np.mean(train_loss), "Valid Loss: ", np.mean(valid_loss))

Epoch: 1 Training Loss:  1.4097135390372986 Valid Loss:  0.6944737827524226
Epoch: 2 Training Loss:  0.5700033706553439 Valid Loss:  0.4423278899902993
Epoch: 3 Training Loss:  0.4285301704038965 Valid Loss:  0.36927041284581447
Epoch: 4 Training Loss:  0.3743018922495081 Valid Loss:  0.3350196144682296
Epoch: 5 Training Loss:  0.34516107933001317 Valid Loss:  0.31221806527452267
Epoch: 6 Training Loss:  0.32385584664471606 Valid Loss:  0.29647792653834565
Epoch: 7 Training Loss:  0.307615090003039 Valid Loss:  0.28435816156103255
Epoch: 8 Training Loss:  0.29477079268148604 Valid Loss:  0.27344274330646434
Epoch: 9 Training Loss:  0.2839543028397763 Valid Loss:  0.2663304758198718
Epoch: 10 Training Loss:  0.2738433091088812 Valid Loss:  0.25762698815224017


#### Evaluation

### MNIST - NN [Multy GPU, Core]

Specify certain GPUs

In [10]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2" # number of gpu devices
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
from torch.backends import cudnn
import multiprocessing

cudnn.benchmark = True

n_cores = multiprocessing.cpu_count()

# transform the raw dataset into tensors and normalize them in a fixed range
_tasks = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,), (0.5,))])

## Load MNIST Dataset and apply transformations
mnist = MNIST("data", download=True, train=True, transform=_tasks)

## create training and validation split 
split = int(0.8 * len(mnist))
index_list = list(range(len(mnist)))
train_idx, valid_idx = index_list[:split], index_list[split:]

## create sampler objects using SubsetRandomSampler
tr_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(valid_idx)

## create iterator objects for train and valid datasets
trainloader = DataLoader(mnist, batch_size=256, sampler=tr_sampler, num_workers=n_cores)
validloader = DataLoader(mnist, batch_size=256, sampler=val_sampler, num_workers=n_cores)

## GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Build class of model
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(784, 128)
        self.output = nn.Linear(128, 10)

    def forward(self, x):
        x = self.hidden(x)
        x = torch.sigmoid(x)
        x = self.output(x)
        return x

model = Model()

## Multi GPU
if torch.cuda.device_count() > 1:
  print("We can use", torch.cuda.device_count(), "GPUs")
  model = nn.DataParallel(model, device_ids=[1]) # device_ids=[0,1,2] depending on the # of gpus

model.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-6, momentum=0.9, nesterov=True)

for epoch in range(1,11):

  train_loss, valid_loss = [], []
  model.train() # activates training mod

  ## Training on 1 epoch
  for data, target in trainloader:

    data = torch.flatten(data.to(device), start_dim=1)

    optimizer.zero_grad() #clears gradients of all optimized classes

    ## forward pass
    output = model(data.to(device))

    ## loss calc
    loss = loss_function(output.to(device), target.to(device))

    ## backeard propagation
    loss.backward()

    ## weight optimization
    optimizer.step() #performs a single optimization step
    train_loss.append(loss.item())

  ### Evaluation on 1 epoch
  for data, target in validloader:

    data = torch.flatten(data, start_dim=1)

    output = model(data.to(device))
    loss = loss_function(output.to(device), target.to(device))
    valid_loss.append(loss.item())
  
  print("Epoch:", epoch, "Training Loss: ", np.mean(train_loss), "Valid Loss: ", np.mean(valid_loss))

Epoch: 1 Training Loss:  1.4160236174121816 Valid Loss:  0.7035221787209206
Epoch: 2 Training Loss:  0.5761646563385395 Valid Loss:  0.4498079330363172
Epoch: 3 Training Loss:  0.4343532992804304 Valid Loss:  0.37484663344444114
Epoch: 4 Training Loss:  0.3792155794006713 Valid Loss:  0.3402580029152809
Epoch: 5 Training Loss:  0.3478186359589404 Valid Loss:  0.31523533323977854
Epoch: 6 Training Loss:  0.3260815628665559 Valid Loss:  0.300622931820281
Epoch: 7 Training Loss:  0.3104622558234854 Valid Loss:  0.28752252301003073
Epoch: 8 Training Loss:  0.2970064908583113 Valid Loss:  0.27725158853733795
Epoch: 9 Training Loss:  0.28619012426822743 Valid Loss:  0.2696801703026954
Epoch: 10 Training Loss:  0.2766001538393345 Valid Loss:  0.2605013834669235


#### Evaluation

### MNIST CNN [Multy GPU, Core]

In [11]:
import os
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torch.backends import cudnn
import numpy as np
import multiprocessing
from sklearn.metrics import accuracy_score

cudnn.benchmark = True

num_cores = multiprocessing.cpu_count()

# transform the raw dataset into tensors and normalize them in a fixed range
_tasks = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

## Load MNIST Dataset and apply transformations
mnist = MNIST("data", download=True, train=True, transform=_tasks)

## create training and validation split 
split_train = int(0.7 * len(mnist))
split_valid = split_train + int(0.1 * len(mnist))
index_list = list(range(len(mnist)))
train_idx, valid_idx, test_idx = index_list[:split_train], index_list[split_train:split_valid], index_list[split_valid:]

## create sampler objects using SubsetRandomSampler
tr_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(valid_idx)
tes_sampler = SubsetRandomSampler(test_idx)

## create iterator objects for train and valid datasets
trainloader = DataLoader(mnist, batch_size=256, sampler=tr_sampler, num_workers=num_cores)
validloader = DataLoader(mnist, batch_size=256, sampler=val_sampler, num_workers=num_cores)
testloader = DataLoader(mnist, batch_size=10, sampler=tes_sampler, num_workers=num_cores)

## GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Build class of model
class Model(nn.Module):
  def __init__(self):
    super(Model, self).__init__()
    
    ## define layers
    self.conv1 = nn.Conv2d(1, 16, 3, padding=1)
    self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
    self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    self.linear1 = nn.Linear(64*3*3, 512)
    self.linear2 = nn.Linear(512,10)
    
    return

  def forward(self, x):
    x = self.pool(F.relu(self.conv1(x)))
    x = self.pool(F.relu(self.conv2(x)))
    x = self.pool(F.relu(self.conv3(x)))
    x = x.view(-1,64*3*3) #torch.flatten(x, start_dim=1) ## reshaping
    x = F.relu(self.linear1(x))
    x = self.linear2(x)

    return x

## create model
model = Model()

## in case of multi gpu
if torch.cuda.device_count() > 1:
  print("Using", torch.cuda.device_count(), "GPUs")
  model = nn.DataParallel(model, device_ids=[1]) # [0,1,2,3]

## put model on gpu
model.to(device)

## loss fucntion
loss_function = nn.CrossEntropyLoss()
## optimizer
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-6, momentum=0.9, nesterov=True)
## run for n epochs
for epoch in range(1,11):
  train_loss , valid_loss = [], []

  ## train part
  model.train()
  for data, target in trainloader:
    ## gradients acumulate. need to clear them on each example
    optimizer.zero_grad()
    output = model(data.to(device))
    loss = loss_function(output.to(device), target.to(device))
    loss.backward()
    optimizer.step()
    train_loss.append(loss.item())

  ## evaluation part on validation
  model.eval() ##set model in evaluation mode
  for data, target in validloader:
    output = model(data.to(device))
    loss = loss_function(output.to(device), target.to(device))
    valid_loss.append(loss.item())

  print("Epoch:", epoch, "Training Loss: ", np.mean(train_loss), "Valid Loss: ", np.mean(valid_loss))

Epoch: 1 Training Loss:  1.4049706922336058 Valid Loss:  0.7877426892518997
Epoch: 2 Training Loss:  0.1534272958834966 Valid Loss:  0.1276818998157978
Epoch: 3 Training Loss:  0.08695878804181562 Valid Loss:  0.13764395285397768
Epoch: 4 Training Loss:  0.06591612339922875 Valid Loss:  0.0927470267439882
Epoch: 5 Training Loss:  0.05474590759611491 Valid Loss:  0.07778087941308816
Epoch: 6 Training Loss:  0.046875652998234284 Valid Loss:  0.07238113041967154
Epoch: 7 Training Loss:  0.04052448337051001 Valid Loss:  0.06849378650076687
Epoch: 8 Training Loss:  0.03585321864846981 Valid Loss:  0.0661202745201687
Epoch: 9 Training Loss:  0.03236920830201019 Valid Loss:  0.05829143252534171
Epoch: 10 Training Loss:  0.029536526610679698 Valid Loss:  0.05339367943815887


#### Evaluation

In [12]:
model.eval()

y_pred, y_true = [], []

for data, target in testloader:
  predicted = model(data.to(device))
  _, predicted = torch.max(predicted.cpu(), 1)
  y_pred += predicted.tolist()
  y_true += target.tolist()

print("Accuracy: ", accuracy_score(y_pred, y_true))

Accuracy:  0.9875833333333334


### Sentiment Classification

#### Download Data

In [13]:
from IPython.display import clear_output

!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
clear_output(wait=True)

!gunzip aclImdb_v1.tar.gz
clear_output(wait=True)

!tar -xvf aclImdb_v1.tar
clear_output(wait=True)

!ls

aclImdb  aclImdb_v1.tar  data  sample_data


#### Import

In [14]:
## Load TF 2.0
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
  
import os
from os import listdir
from os.path import isfile, join
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
from torch.backends import cudnn
import tensorflow as tf
import numpy as np
import multiprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

num_cores = multiprocessing.cpu_count()

TensorFlow 2.x selected.


Using TensorFlow backend.


#### Data Loader

In [0]:
train_pos_path = "/content/aclImdb/train/pos/"
train_neg_path = "/content/aclImdb/train/neg/"

test_pos_path = "/content/aclImdb/test/pos/"
test_neg_path = "/content/aclImdb/test/neg/"



In [0]:
class ImdbMovieDataset(Dataset):

  def __init__(self, pos_path, neg_path, maxlen=100, text_tokenizer=None):
    """
    Args:

    """
    self.pos_path = pos_path
    self.neg_path = neg_path
    self.pos_files = [file_name[:-4] for file_name in self.get_files(self.pos_path)]
    self.neg_files = [file_name[:-4] for file_name in self.get_files(self.neg_path)]
    self.n_pos_files = len(self.pos_files)
    self.n_neg_files = len(self.neg_files)
    self.maxlen = maxlen
    if text_tokenizer:
      self.text_tokenizer = text_tokenizer
    else:
      self.text_tokenizer = self.fit_tokenizer()
    return


  def __len__(self):
    return (self.n_pos_files + self.n_neg_files)


  def __getitem__(self, idx):
    if idx < self.n_pos_files:
      ## positive review
      path = self.pos_path + self.pos_files[idx]
      y = 1
    else:
      ## negative review
      path = self.neg_path + self.neg_files[idx - self.n_pos_files]
      y = 0
    review = self.read_file(path)
    X = self.text_tokenizer.texts_to_sequences([review])
    X = pad_sequences(sequences=X, 
                      maxlen=self.maxlen, 
                      padding='post', 
                      truncating='post')[0]

    return torch.tensor(X), torch.tensor(y)

  def get_files(self, path):
    return [f for f in listdir(path) if isfile(join(path, f))]

  def read_file(self, path):
    with open(path + '.txt', 'r') as raw_file:
      content_file = raw_file.read()
    return content_file

  def fit_tokenizer(self):
    tmp_tokenizer = Tokenizer(num_words=None,
                      lower=True,
                      oov_token='<UNK>')
    print("Positive file fit Tokenizer")
    for pos_file in self.pos_files[:50]:
      tmp_tokenizer.fit_on_texts([self.read_file(self.pos_path + pos_file)])

    print("Negative file fit Tokenizer")
    for neg_file in self.neg_files[:50]:
      tmp_tokenizer.fit_on_texts([self.read_file(self.neg_path + neg_file)])
    return tmp_tokenizer

In [17]:
## data generator parameters
data_generator_parameters = {'batch_size': 64,
                             'shuffle': True,
                             'num_workers': num_cores}

training_set = ImdbMovieDataset(pos_path=train_pos_path,
                              neg_path=train_neg_path)

Positive file fit Tokenizer
Negative file fit Tokenizer


In [18]:
training_generator = DataLoader(training_set, **data_generator_parameters)

for local_batch, local_label in training_generator:
  print(local_batch, local_label)
  print(np.shape(local_label))
  break

tensor([[ 458,    8,    4,  ...,   73,  277,    1],
        [  45,   26,   29,  ...,   78, 4510,   26],
        [  48,   22,   46,  ...,    6,   25,    1],
        ...,
        [ 245,    1,    1,  ...,    1,   23,    2],
        [  10,  304,   13,  ..., 1976,  959,   41],
        [   1,   10,   54,  ...,   70,    9, 1124]], dtype=torch.int32) tensor([1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
        1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
        0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1])
torch.Size([64])


In [19]:
torch.tensor(training_set.text_tokenizer.texts_to_sequences(["this is me, not you"]), dtype=torch.long)

tensor([[13,  8, 70, 22, 26]])