<a href="https://colab.research.google.com/github/gmihaila/machine_learning_things/blob/master/learning_pytorch/pytorch_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### SImple NN

1 hiddent layer NN

#### Initialize NN

In [1]:
import torch

n_input, n_hidden, n_output = 5, 3, 1

## initialize tensor for inputs, and outputs 
x = torch.randn((1, n_input))
y = torch.rand((1,n_output)) 


print(x.size())
print(y.size())
print()

## initialize tensor variables for weights 
w1 = torch.rand((n_input, n_hidden))
w2 = torch.rand((n_hidden, n_output))

print(w1.size())
print(w2.size())
print()

## initialize tensor variables for bias terms 
b1 = torch.rand((1,n_hidden))
b2 = torch.rand((1,n_output))

print(b1.size())
print(b2.size())
print()



torch.Size([1, 5])
torch.Size([1, 1])

torch.Size([5, 3])
torch.Size([3, 1])

torch.Size([1, 3])
torch.Size([1, 1])



#### Forward Pass

1. Forward Propagation
2. Loss computation
3. Backpropagation
4. Updating the parameters

In [2]:
## sigmoid activation function using pytorch
def sigmoid_activation(z):
  return 1 / (1 + torch.exp(-z))

## activation of hidden layer 
z1 = torch.mm(x,w1) + b1
a1 = sigmoid_activation(z1)

print(z1)
print(a1)
print()

## activation (output) of final layer 
z2 = torch.mm(a1, w2) + b2
a2 = output = sigmoid_activation(z2)

print(z2)
print(output)
print()

loss = y - output

print(loss)

tensor([[ 3.5044,  3.5523, -0.8083]])
tensor([[0.9708, 0.9721, 0.3082]])

tensor([[1.3401]])
tensor([[0.7925]])

tensor([[-0.1339]])


#### Backprop

* loss gets multiplied by weights to penalize more of the bad weights
* some weights contirbute more to the output. If the error is large, their loss will be more

In [3]:
## function to calculate the derivative of activation
def sigmoid_delta(x):
  return x * (1 - x)

## compute derivative of error terms
delta_output = sigmoid_delta(output)
delta_hidden = sigmoid_delta(a1)

print(delta_output)
print(delta_hidden)
print()


## backpass the changes to previous layers 
d_output = loss * delta_output
loss_h = torch.mm(d_output, w2.t())
d_hidden = loss_h * delta_hidden

print(d_output)
print(loss_h)
print(d_hidden)

tensor([[0.1644]])
tensor([[0.0283, 0.0271, 0.2132]])

tensor([[-0.0220]])
tensor([[-0.0052, -0.0168, -0.0069]])
tensor([[-0.0001, -0.0005, -0.0015]])


#### Update Parameters

In [4]:
learning_rate = 0.1

w2 += torch.mm(a1.t(), d_output) * learning_rate
w1 += torch.mm(x.t(), d_hidden) * learning_rate


print(w2)
print(w1)
print()

b1 += d_output.sum() * learning_rate
b2 += d_hidden.sum() * learning_rate

print(b1)
print(b2)

tensor([[0.2323],
        [0.7628],
        [0.3141]])
tensor([[0.7888, 0.4538, 0.7354],
        [0.3648, 0.4805, 0.0080],
        [0.1933, 0.0877, 0.9228],
        [0.7555, 0.7120, 0.0322],
        [0.8335, 0.7132, 0.1058]])

tensor([[0.8232, 0.9470, 0.4273]])
tensor([[0.2717]])


### MNIST Data Loader

In [5]:
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np

# transform the raw dataset into tensors and normalize them in a fixed range
_tasks = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,), (0.5,))]) #pass mean 0.5 and std 0.5

## Load MNIST Dataset and apply transformations
mnist = MNIST("data", download=True, train=True, transform=_tasks)

## create training and validation split 
split = int(0.8 * len(mnist))
index_list = list(range(len(mnist)))
train_idx, valid_idx = index_list[:split], index_list[split:]

## create sampler objects using SubsetRandomSampler
tr_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(valid_idx)

## create iterator objects for train and valid datasets
trainloader = DataLoader(mnist, batch_size=256, sampler=tr_sampler)
validloader = DataLoader(mnist, batch_size=256, sampler=val_sampler)


for data, label in trainloader:
  print(np.shape(data))
  # Flatten MNIST images into a 784 long vector
  # data = data.view(data.shape[0], -1)
  # print(data.shape)

  data = torch.flatten(data, start_dim=1)
  print(data.shape)
  break

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw
Processing...
Done!


torch.Size([256, 1, 28, 28])
torch.Size([256, 784])


### MNIST - NN

In [6]:
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np

# transform the raw dataset into tensors and normalize them in a fixed range
_tasks = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,), (0.5,))])

## Load MNIST Dataset and apply transformations
mnist = MNIST("data", download=True, train=True, transform=_tasks)

## create training and validation split 
split = int(0.8 * len(mnist))
index_list = list(range(len(mnist)))
train_idx, valid_idx = index_list[:split], index_list[split:]

## create sampler objects using SubsetRandomSampler
tr_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(valid_idx)

## create iterator objects for train and valid datasets
trainloader = DataLoader(mnist, batch_size=256, sampler=tr_sampler)
validloader = DataLoader(mnist, batch_size=256, sampler=val_sampler)

## Build class of model
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(784, 128)
        self.output = nn.Linear(128, 10)

    def forward(self, x):
        x = self.hidden(x)
        x = torch.sigmoid(x)
        x = self.output(x)
        return x

model = Model()

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-6, momentum=0.9, nesterov=True)

for epoch in range(1,11):

  train_loss, valid_loss = [], []
  model.train() # activates training mod

  ## Training on 1 epoch
  for data, target in trainloader:

    data = torch.flatten(data, start_dim=1)

    optimizer.zero_grad() #clears gradients of all optimized classes

    ## forward pass
    output = model(data)

    ## loss calc
    loss = loss_function(output, target)

    ## backeard propagation
    loss.backward()

    ## weight optimization
    optimizer.step() #performs a single optimization step
    train_loss.append(loss.item())

  ### Evaluation on 1 epoch
  for data, target in validloader:

    data = torch.flatten(data, start_dim=1)

    output = model(data)
    loss = loss_function(output, target)
    valid_loss.append(loss.item())
  
  print("Epoch:", epoch, "Training Loss: ", np.mean(train_loss), "Valid Loss: ", np.mean(valid_loss))



Epoch: 1 Training Loss:  1.4349780979942768 Valid Loss:  0.7062469999840919
Epoch: 2 Training Loss:  0.575546939163766 Valid Loss:  0.4442503724960571
Epoch: 3 Training Loss:  0.43013869321092646 Valid Loss:  0.3703078912927749
Epoch: 4 Training Loss:  0.37567793118192794 Valid Loss:  0.33534921578904414
Epoch: 5 Training Loss:  0.3451666477671329 Valid Loss:  0.3142616549070845
Epoch: 6 Training Loss:  0.3242472232656276 Valid Loss:  0.30002586036286455
Epoch: 7 Training Loss:  0.30901888607347267 Valid Loss:  0.2856987930358724
Epoch: 8 Training Loss:  0.2960701195642035 Valid Loss:  0.2761637606519334
Epoch: 9 Training Loss:  0.2850086708810735 Valid Loss:  0.26927014361036583
Epoch: 10 Training Loss:  0.27550541340036594 Valid Loss:  0.2590240886870851


#### Evaluation

In [7]:
## dataloader for validation dataset 
dataiter = iter(validloader)
data, labels = dataiter.next()
data = torch.flatten(data, start_dim=1)
output = model(data)

print(output.shape)
print(output[0])

_, pred_tensor = torch.max(output, 1)

print(pred_tensor.shape)
print(pred_tensor[0])

preds = np.squeeze(pred_tensor.numpy())

print("Actual: ", labels[:10])
print("Predic: ", preds[:10])

torch.Size([256, 10])
tensor([ 1.0437, -1.5937, -0.2724,  1.1894,  1.1126,  5.0138,  2.1918, -2.3319,
        -1.1587, -4.2133], grad_fn=<SelectBackward>)
torch.Size([256])
tensor(5)
Actual:  tensor([5, 1, 3, 1, 2, 6, 6, 7, 5, 6])
Predic:  [5 1 3 1 2 6 6 7 5 6]


### MNIST - NN [1 GPU]

In [8]:
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
from torch.backends import cudnn

cudnn.benchmark = True

# transform the raw dataset into tensors and normalize them in a fixed range
_tasks = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,), (0.5,))])

## Load MNIST Dataset and apply transformations
mnist = MNIST("data", download=True, train=True, transform=_tasks)

## create training and validation split 
split = int(0.8 * len(mnist))
index_list = list(range(len(mnist)))
train_idx, valid_idx = index_list[:split], index_list[split:]

## create sampler objects using SubsetRandomSampler
tr_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(valid_idx)

## create iterator objects for train and valid datasets
trainloader = DataLoader(mnist, batch_size=256, sampler=tr_sampler, num_workers=2)
validloader = DataLoader(mnist, batch_size=256, sampler=val_sampler, num_workers=2)

## GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Build class of model
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(784, 128)
        self.output = nn.Linear(128, 10)

    def forward(self, x):
        x = self.hidden(x)
        x = torch.sigmoid(x)
        x = self.output(x)
        return x

model = Model()

model.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-6, momentum=0.9, nesterov=True)

for epoch in range(1,11):

  train_loss, valid_loss = [], []
  model.train() # activates training mod

  ## Training on 1 epoch
  for data, target in trainloader:

    data = torch.flatten(data.to(device), start_dim=1)

    optimizer.zero_grad() #clears gradients of all optimized classes

    ## forward pass
    output = model(data.to(device))

    ## loss calc
    loss = loss_function(output.to(device), target.to(device))

    ## backeard propagation
    loss.backward()

    ## weight optimization
    optimizer.step() #performs a single optimization step
    train_loss.append(loss.item())

  ### Evaluation on 1 epoch
  for data, target in validloader:

    data = torch.flatten(data, start_dim=1)

    output = model(data.to(device))
    loss = loss_function(output.to(device), target.to(device))
    valid_loss.append(loss.item())
  
  print("Epoch:", epoch, "Training Loss: ", np.mean(train_loss), "Valid Loss: ", np.mean(valid_loss))

Epoch: 1 Training Loss:  1.4226015869607316 Valid Loss:  0.702143872037847
Epoch: 2 Training Loss:  0.5734938942688577 Valid Loss:  0.44547347756142314
Epoch: 3 Training Loss:  0.4304347908560266 Valid Loss:  0.3725986708985998
Epoch: 4 Training Loss:  0.3755895825142556 Valid Loss:  0.3367689743320993
Epoch: 5 Training Loss:  0.3452775955517241 Valid Loss:  0.3167601830147682
Epoch: 6 Training Loss:  0.3250410721657124 Valid Loss:  0.30036201534119056
Epoch: 7 Training Loss:  0.3101884127455823 Valid Loss:  0.2885698393938389
Epoch: 8 Training Loss:  0.29760462307232494 Valid Loss:  0.2776079479050129
Epoch: 9 Training Loss:  0.2871963589432392 Valid Loss:  0.269358513837165
Epoch: 10 Training Loss:  0.2774508301406465 Valid Loss:  0.26319165052251614


#### Evaluation

### MNIST - NN [Multy GPU, Core]

Specify certain GPUs

In [9]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2" # number of gpu devices
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
from torch.backends import cudnn
import multiprocessing

cudnn.benchmark = True

n_cores = multiprocessing.cpu_count()

# transform the raw dataset into tensors and normalize them in a fixed range
_tasks = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,), (0.5,))])

## Load MNIST Dataset and apply transformations
mnist = MNIST("data", download=True, train=True, transform=_tasks)

## create training and validation split 
split = int(0.8 * len(mnist))
index_list = list(range(len(mnist)))
train_idx, valid_idx = index_list[:split], index_list[split:]

## create sampler objects using SubsetRandomSampler
tr_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(valid_idx)

## create iterator objects for train and valid datasets
trainloader = DataLoader(mnist, batch_size=256, sampler=tr_sampler, num_workers=n_cores)
validloader = DataLoader(mnist, batch_size=256, sampler=val_sampler, num_workers=n_cores)

## GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Build class of model
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(784, 128)
        self.output = nn.Linear(128, 10)

    def forward(self, x):
        x = self.hidden(x)
        x = torch.sigmoid(x)
        x = self.output(x)
        return x

model = Model()

## Multi GPU
if torch.cuda.device_count() > 1:
  print("We can use", torch.cuda.device_count(), "GPUs")
  model = nn.DataParallel(model, device_ids=[1]) # device_ids=[0,1,2] depending on the # of gpus

model.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-6, momentum=0.9, nesterov=True)

for epoch in range(1,11):

  train_loss, valid_loss = [], []
  model.train() # activates training mod

  ## Training on 1 epoch
  for data, target in trainloader:

    data = torch.flatten(data.to(device), start_dim=1)

    optimizer.zero_grad() #clears gradients of all optimized classes

    ## forward pass
    output = model(data.to(device))

    ## loss calc
    loss = loss_function(output.to(device), target.to(device))

    ## backeard propagation
    loss.backward()

    ## weight optimization
    optimizer.step() #performs a single optimization step
    train_loss.append(loss.item())

  ### Evaluation on 1 epoch
  for data, target in validloader:

    data = torch.flatten(data, start_dim=1)

    output = model(data.to(device))
    loss = loss_function(output.to(device), target.to(device))
    valid_loss.append(loss.item())
  
  print("Epoch:", epoch, "Training Loss: ", np.mean(train_loss), "Valid Loss: ", np.mean(valid_loss))

Epoch: 1 Training Loss:  1.4196174848586955 Valid Loss:  0.7070348668605724
Epoch: 2 Training Loss:  0.5785721481797543 Valid Loss:  0.4495881415427999
Epoch: 3 Training Loss:  0.4331658460358356 Valid Loss:  0.37312279990378844
Epoch: 4 Training Loss:  0.37733632976070364 Valid Loss:  0.33723752802990853
Epoch: 5 Training Loss:  0.346332589521053 Valid Loss:  0.31461838109696166
Epoch: 6 Training Loss:  0.3250312273331145 Valid Loss:  0.297722431256416
Epoch: 7 Training Loss:  0.30921553867928525 Valid Loss:  0.28540977264972445
Epoch: 8 Training Loss:  0.29635619277015646 Valid Loss:  0.2772708123034619
Epoch: 9 Training Loss:  0.28546011772878627 Valid Loss:  0.2672363671850651
Epoch: 10 Training Loss:  0.2761305772719231 Valid Loss:  0.25905618927580243


#### Evaluation

### MNIST CNN [Multy GPU, Core]

In [10]:
import os
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torch.backends import cudnn
import numpy as np
import multiprocessing
from sklearn.metrics import accuracy_score

cudnn.benchmark = True

num_cores = multiprocessing.cpu_count()

# transform the raw dataset into tensors and normalize them in a fixed range
_tasks = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

## Load MNIST Dataset and apply transformations
mnist = MNIST("data", download=True, train=True, transform=_tasks)

## create training and validation split 
split_train = int(0.7 * len(mnist))
split_valid = split_train + int(0.1 * len(mnist))
index_list = list(range(len(mnist)))
train_idx, valid_idx, test_idx = index_list[:split_train], index_list[split_train:split_valid], index_list[split_valid:]

## create sampler objects using SubsetRandomSampler
tr_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(valid_idx)
tes_sampler = SubsetRandomSampler(test_idx)

## create iterator objects for train and valid datasets
trainloader = DataLoader(mnist, batch_size=256, sampler=tr_sampler, num_workers=num_cores)
validloader = DataLoader(mnist, batch_size=256, sampler=val_sampler, num_workers=num_cores)
testloader = DataLoader(mnist, batch_size=10, sampler=tes_sampler, num_workers=num_cores)

## GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Build class of model
class Model(nn.Module):
  def __init__(self):
    super(Model, self).__init__()
    
    ## define layers
    self.conv1 = nn.Conv2d(1, 16, 3, padding=1)
    self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
    self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    self.linear1 = nn.Linear(64*3*3, 512)
    self.linear2 = nn.Linear(512,10)
    
    return

  def forward(self, x):
    x = self.pool(F.relu(self.conv1(x)))
    x = self.pool(F.relu(self.conv2(x)))
    x = self.pool(F.relu(self.conv3(x)))
    x = x.view(-1,64*3*3) #torch.flatten(x, start_dim=1) ## reshaping
    x = F.relu(self.linear1(x))
    x = self.linear2(x)

    return x

## create model
model = Model()

## in case of multi gpu
if torch.cuda.device_count() > 1:
  print("Using", torch.cuda.device_count(), "GPUs")
  model = nn.DataParallel(model, device_ids=[1]) # [0,1,2,3]

## put model on gpu
model.to(device)

## loss fucntion
loss_function = nn.CrossEntropyLoss()
## optimizer
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-6, momentum=0.9, nesterov=True)
## run for n epochs
for epoch in range(1,11):
  train_loss , valid_loss = [], []

  ## train part
  model.train()
  for data, target in trainloader:
    ## gradients acumulate. need to clear them on each example
    optimizer.zero_grad()
    output = model(data.to(device))
    loss = loss_function(output.to(device), target.to(device))
    loss.backward()
    optimizer.step()
    train_loss.append(loss.item())

  ## evaluation part on validation
  model.eval() ##set model in evaluation mode
  for data, target in validloader:
    output = model(data.to(device))
    loss = loss_function(output.to(device), target.to(device))
    valid_loss.append(loss.item())

  print("Epoch:", epoch, "Training Loss: ", np.mean(train_loss), "Valid Loss: ", np.mean(valid_loss))

Epoch: 1 Training Loss:  1.43601565442302 Valid Loss:  0.596405832717816
Epoch: 2 Training Loss:  0.16294173417669355 Valid Loss:  0.138090871895353
Epoch: 3 Training Loss:  0.0923407681286335 Valid Loss:  0.11756174452602863
Epoch: 4 Training Loss:  0.0715204238778714 Valid Loss:  0.3352052476257086
Epoch: 5 Training Loss:  0.06112954479952653 Valid Loss:  0.07890327313604455
Epoch: 6 Training Loss:  0.0497398627075282 Valid Loss:  0.07062170887365937
Epoch: 7 Training Loss:  0.04241228329412865 Valid Loss:  0.06184114965920647
Epoch: 8 Training Loss:  0.03664553004006545 Valid Loss:  0.06988479352245729
Epoch: 9 Training Loss:  0.03334167885848067 Valid Loss:  0.4880245228608449
Epoch: 10 Training Loss:  0.035245017300952565 Valid Loss:  0.06090639275498688


#### Evaluation

In [11]:
model.eval()

y_pred, y_true = [], []

for data, target in testloader:
  predicted = model(data.to(device))
  _, predicted = torch.max(predicted.cpu(), 1)
  y_pred += predicted.tolist()
  y_true += target.tolist()

print("Accuracy: ", accuracy_score(y_pred, y_true))

Accuracy:  0.9851666666666666


### Sentiment Classification [NOT FINISHED]

#### Download Data

In [12]:
from IPython.display import clear_output

!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
clear_output(wait=True)

!gunzip aclImdb_v1.tar.gz
clear_output(wait=True)

!tar -xvf aclImdb_v1.tar
clear_output(wait=True)

!ls

aclImdb  aclImdb_v1.tar  data  sample_data


#### Import

In [13]:
## Load TF 2.0
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
  
import os
from os import listdir
from os.path import isfile, join
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
from torch.backends import cudnn
import tensorflow as tf
import numpy as np
import multiprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

num_cores = multiprocessing.cpu_count()

Using TensorFlow backend.


#### Data Loader

In [0]:
train_pos_path = "/content/aclImdb/train/pos/"
train_neg_path = "/content/aclImdb/train/neg/"

test_pos_path = "/content/aclImdb/test/pos/"
test_neg_path = "/content/aclImdb/test/neg/"



In [0]:
class ImdbMovieDataset(Dataset):

  def __init__(self, pos_path, neg_path, maxlen=100, text_tokenizer=None):
    """
    Args:

    """
    self.pos_path = pos_path
    self.neg_path = neg_path
    self.pos_files = [file_name[:-4] for file_name in self.get_files(self.pos_path)]
    self.neg_files = [file_name[:-4] for file_name in self.get_files(self.neg_path)]
    self.n_pos_files = len(self.pos_files)
    self.n_neg_files = len(self.neg_files)
    self.maxlen = maxlen
    if text_tokenizer:
      self.text_tokenizer = text_tokenizer
    else:
      self.text_tokenizer = self.fit_tokenizer()
    return


  def __len__(self):
    return (self.n_pos_files + self.n_neg_files)


  def __getitem__(self, idx):
    if idx < self.n_pos_files:
      ## positive review
      path = self.pos_path + self.pos_files[idx]
      y = 1
    else:
      ## negative review
      path = self.neg_path + self.neg_files[idx - self.n_pos_files]
      y = 0
    review = self.read_file(path)
    X = self.text_tokenizer.texts_to_sequences([review])
    X = pad_sequences(sequences=X, 
                      maxlen=self.maxlen, 
                      padding='post', 
                      truncating='post')[0]

    return torch.tensor(X), torch.tensor(y)

  def get_files(self, path):
    return [f for f in listdir(path) if isfile(join(path, f))]

  def read_file(self, path):
    with open(path + '.txt', 'r') as raw_file:
      content_file = raw_file.read()
    return content_file

  def fit_tokenizer(self):
    tmp_tokenizer = Tokenizer(num_words=None,
                      lower=True,
                      oov_token='<UNK>')
    print("Positive file fit Tokenizer")
    for pos_file in self.pos_files[:50]:
      tmp_tokenizer.fit_on_texts([self.read_file(self.pos_path + pos_file)])

    print("Negative file fit Tokenizer")
    for neg_file in self.neg_files[:50]:
      tmp_tokenizer.fit_on_texts([self.read_file(self.neg_path + neg_file)])
    return tmp_tokenizer

In [16]:
## data generator parameters
data_generator_parameters = {'batch_size': 64,
                             'shuffle': True,
                             'num_workers': num_cores}

training_set = ImdbMovieDataset(pos_path=train_pos_path,
                              neg_path=train_neg_path)

Positive file fit Tokenizer
Negative file fit Tokenizer


In [17]:
training_generator = DataLoader(training_set, **data_generator_parameters)

for local_batch, local_label in training_generator:
  print(local_batch, local_label)
  print(np.shape(local_label))
  break

tensor([[  11,  265,    1,  ...,  298,    1,    4],
        [  16,   44,   96,  ..., 4186,    1, 4005],
        [  12,  824,  195,  ...,    2,   98, 1633],
        ...,
        [2509, 3712,    4,  ..., 2629,    6,    1],
        [ 292,  413,   11,  ...,    0,    0,    0],
        [  52,    3,    1,  ..., 4127,   38,  127]], dtype=torch.int32) tensor([1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1])
torch.Size([64])


In [18]:
torch.tensor(training_set.text_tokenizer.texts_to_sequences(["this is me, not you"]), dtype=torch.long)

tensor([[11,  8, 80, 22, 25]])