# Working with Data

In [1]:
import torch

## Dataset

In [2]:
from torch.utils.data import Dataset, DataLoader

In [3]:
inputs = torch.rand(5, 8)                        # 5 examples with 8 features each
targets = torch.tensor([1, 1, 1, 0, 0])          # a binary class for each of 5 examples

In [4]:
inputs

tensor([[0.9078, 0.8158, 0.0751, 0.6673, 0.9645, 0.4584, 0.2770, 0.4203],
        [0.7066, 0.2162, 0.6203, 0.0974, 0.4527, 0.4128, 0.6775, 0.3781],
        [0.7157, 0.3780, 0.5939, 0.6088, 0.2390, 0.0340, 0.2930, 0.6055],
        [0.6640, 0.7359, 0.6051, 0.8587, 0.4969, 0.7759, 0.1938, 0.6143],
        [0.8989, 0.1281, 0.3761, 0.2822, 0.8641, 0.8334, 0.3186, 0.3283]])

In [5]:
targets # 1:"pos", 0:"neg"

tensor([1, 1, 1, 0, 0])

In [6]:
# A custom Dataset class must implement three functions: __init__, __len__, and __getitem__.

class ReviewsDataset(Dataset):
    """Map-style dataset that pairs each stored feature entry with its label."""

    def __init__(self, inputs, targets):
        """Keep references to the features and labels; runs once at construction."""
        super().__init__()
        self.reviews, self.labels = inputs, targets

    def __len__(self):
        """Number of samples, taken from the length of the stored features."""
        return len(self.reviews)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.reviews[index], self.labels[index]

In [7]:
dataset = ReviewsDataset(inputs, targets)

In [8]:
dataset.reviews

tensor([[0.9078, 0.8158, 0.0751, 0.6673, 0.9645, 0.4584, 0.2770, 0.4203],
        [0.7066, 0.2162, 0.6203, 0.0974, 0.4527, 0.4128, 0.6775, 0.3781],
        [0.7157, 0.3780, 0.5939, 0.6088, 0.2390, 0.0340, 0.2930, 0.6055],
        [0.6640, 0.7359, 0.6051, 0.8587, 0.4969, 0.7759, 0.1938, 0.6143],
        [0.8989, 0.1281, 0.3761, 0.2822, 0.8641, 0.8334, 0.3186, 0.3283]])

In [9]:
dataset.labels

tensor([1, 1, 1, 0, 0])

In [10]:
# number of training examples in the dataset
len(dataset)

5

In [11]:
# retrieve training examples by index
dataset[4]

(tensor([0.8989, 0.1281, 0.3761, 0.2822, 0.8641, 0.8334, 0.3186, 0.3283]),
 tensor(0))

In [12]:
# iterating over training examples
for (feature_tensor, label) in dataset:
    print(feature_tensor, label)

tensor([0.9078, 0.8158, 0.0751, 0.6673, 0.9645, 0.4584, 0.2770, 0.4203]) tensor(1)
tensor([0.7066, 0.2162, 0.6203, 0.0974, 0.4527, 0.4128, 0.6775, 0.3781]) tensor(1)
tensor([0.7157, 0.3780, 0.5939, 0.6088, 0.2390, 0.0340, 0.2930, 0.6055]) tensor(1)
tensor([0.6640, 0.7359, 0.6051, 0.8587, 0.4969, 0.7759, 0.1938, 0.6143]) tensor(0)
tensor([0.8989, 0.1281, 0.3761, 0.2822, 0.8641, 0.8334, 0.3186, 0.3283]) tensor(0)


## DataLoader

The Dataset retrieves our dataset’s features and labels one sample at a time.\
While training a model, we typically want to pass samples in “minibatches” and reshuffle the data at every epoch to reduce model overfitting.\
We can abstract this functionality with a DataLoader object.

In [None]:
from torch.utils.data import DataLoader

In [13]:
batch_generator = DataLoader(dataset, batch_size=2, shuffle=True, drop_last=True)

In [14]:
for batch in batch_generator:
    inputs, targets = batch
    print("Inputs:", inputs)
    print("Targets:", targets)
    print()
    break

Inputs: tensor([[0.7066, 0.2162, 0.6203, 0.0974, 0.4527, 0.4128, 0.6775, 0.3781],
        [0.7157, 0.3780, 0.5939, 0.6088, 0.2390, 0.0340, 0.2930, 0.6055]])
Targets: tensor([1, 1])



In [None]:
import os
import pandas as pd

# Location of the reviews CSV. An absolute path on one person's machine makes the
# notebook fail everywhere else, so the file is looked up next to the notebook by
# default; set the REVIEWS_CSV environment variable to point somewhere else.
REVIEWS_CSV = os.environ.get("REVIEWS_CSV", "reviews_with_splits_lite.csv")
data = pd.read_csv(REVIEWS_CSV)

In [None]:
data.head()

In [None]:
data.review.values

In [None]:
import re, torch

def tokenizer(text):
    """Split review text into lowercase tokens made only of ASCII letters.

    Anything that is not a letter a-z / A-Z (punctuation, digits, whitespace,
    special symbols) acts as a separator and is dropped.
    """
    letter_runs = re.findall("[a-zA-Z]+", text)
    return [run.lower() for run in letter_runs]  # list of tokens


def vectorize(tokenized_review, dictionary):
    """
    Creates a collapsed one-hot tensor for a single text review.

    Tokens that are not in the dictionary are ignored: ``doc2idx`` reports them
    with a negative index (-1 by default), which tensor indexing would otherwise
    treat as "last position" and wrongly switch on the last vocabulary entry.

    Args:
        tokenized_review (List[str]): a list of tokens
        dictionary: vocabulary object providing ``doc2idx(tokens)`` and ``len()``
            (e.g. ``gensim.corpora.Dictionary``)
    Returns:
        one_hot_tensor (torch.FloatTensor): collapsed one-hot representation of a review,
            with one entry per dictionary token
    """
    one_hot_vector = torch.zeros(len(dictionary), dtype=torch.float32)
    for token_index in dictionary.doc2idx(tokenized_review):
        # negative index == unknown token; skip it instead of writing to the tail
        if token_index >= 0:
            one_hot_vector[token_index] = 1
    return one_hot_vector

### Example

In [None]:
sample_review = "I am a big gelato fan!"

In [None]:
tokenized_review = tokenizer(sample_review)
tokenized_review

In [None]:
from gensim.corpora import Dictionary
dictionary = Dictionary(tokenizer(review) for review in data.review.values)

In [None]:
len(dictionary) # number of unique words in your vocabulary

In [None]:
# mapping of token to an integer id
dictionary.token2id

In [None]:
# Convert document (a list of words) into a list of indexes
dictionary.doc2idx(["i", "am", "a", "big", "gelato", "fan", "!"])

In [None]:
# check the indexes assigned to 1
vectorize(tokenized_review, dictionary)#.nonzero()

In [None]:
class CustomDataset(Dataset):
    """Dataset of tokenized reviews with binary labels and its own vocabulary."""

    def __init__(self, documents):
        """Tokenize every review, binarize its rating and build the dictionary.

        Args:
            documents: table providing ``iterrows()`` whose rows expose ``review``
                and ``rating`` attributes (e.g. a pandas DataFrame).
        """
        self.reviews, self.labels = [], []

        for _, record in documents.iterrows():
            self.reviews.append(tokenizer(record.review))
            # "positive" maps to 1, every other rating value to 0
            is_positive = record.rating == "positive"
            self.labels.append(1 if is_positive else 0)

        # vocabulary built over all tokenized reviews
        self.dictionary = Dictionary(self.reviews)

    def __len__(self):
        """Number of reviews held by the dataset."""
        return len(self.reviews)

    def __getitem__(self, index):
        """Return ``(vectorized review, label tensor)`` for the review at ``index``."""
        tokens, target = self.reviews[index], self.labels[index]
        return vectorize(tokens, self.dictionary), torch.tensor(target)

In [None]:
dataset = CustomDataset(data)

In [None]:
set(dataset.labels)

In [None]:
len(dataset.dictionary) 

In [None]:
dataset[2]

# nn.Module

How to use the predefined layers in PyTorch

In [15]:
import torch.nn as nn

In [45]:
# initialize a linear layer
linear_layer = nn.Linear(in_features=8, out_features=3)

In [46]:
# accessing weight matrix
linear_layer.weight

Parameter containing:
tensor([[-0.1989,  0.2530,  0.2782,  0.0793,  0.0021,  0.0763,  0.2892, -0.2072],
        [ 0.2073, -0.0824,  0.1744, -0.0292,  0.1567, -0.0249,  0.2070,  0.2197],
        [-0.2447,  0.3461, -0.0079, -0.1518, -0.2538, -0.0242,  0.0230,  0.3271]],
       requires_grad=True)

In [47]:
# accessing parameters by name:
dict(linear_layer.named_parameters())

{'weight': Parameter containing:
 tensor([[-0.1989,  0.2530,  0.2782,  0.0793,  0.0021,  0.0763,  0.2892, -0.2072],
         [ 0.2073, -0.0824,  0.1744, -0.0292,  0.1567, -0.0249,  0.2070,  0.2197],
         [-0.2447,  0.3461, -0.0079, -0.1518, -0.2538, -0.0242,  0.0230,  0.3271]],
        requires_grad=True),
 'bias': Parameter containing:
 tensor([-0.1301, -0.0660, -0.0770], requires_grad=True)}

In [19]:
# check the shape of weight matrix
linear_layer.weight.shape

torch.Size([3, 8])

In [48]:
# call the layer directly on input tensor
linear_layer(torch.randn(8))

tensor([-0.2010, -1.2905,  0.4083], grad_fn=<AddBackward0>)

In [49]:
torch.sigmoid(torch.tensor([-2, 1.5]))

tensor([0.1192, 0.8176])

In [None]:
# Sigmoid as a layer object lives in torch.nn (torch itself only has the function torch.sigmoid)
activation = nn.Sigmoid()

## Building your neural network

In [None]:
# Building your network: stack PyTorch layers inside your network object

In [23]:
class NeuralModel(nn.Module):
    """One linear layer with a single output unit, followed by a sigmoid."""

    def __init__(self, num_features):
        """Create the layers of the network.

        Args:
            num_features (int): size of each input feature vector.
        """
        super().__init__()
        # layers are stored as attributes so the module registers their parameters
        self.linear_layer = nn.Linear(num_features, 1)
        self.activation = nn.Sigmoid()

    def forward(self, inputs):
        """Apply the linear layer, then squash its output with the sigmoid."""
        return self.activation(self.linear_layer(inputs))

In [24]:
# instantiate the model
model = NeuralModel(inputs.shape[1])

In [25]:
print("Model architecture:", model)

Model architecture: NeuralModel(
  (linear_layer): Linear(in_features=8, out_features=1, bias=True)
  (activation): Sigmoid()
)


In [26]:
dict(model.named_parameters())

{'linear_layer.weight': Parameter containing:
 tensor([[0.3245, 0.0181, 0.1568, 0.2189, 0.0286, 0.1688, 0.0049, 0.3388]],
        requires_grad=True),
 'linear_layer.bias': Parameter containing:
 tensor([0.0487], requires_grad=True)}

<h6><b>Forward pass</b></h6>

In [27]:
model(inputs)

tensor([[0.6490],
        [0.6755]], grad_fn=<SigmoidBackward0>)

##  Train your NN 

In [28]:
# Choose the loss function
loss_function = nn.BCELoss()

In [29]:
# example of usage
prediction = torch.rand(3)
target = torch.ones(3).to(torch.float)
loss_function(prediction, target)

tensor(0.6932)

In [30]:
from torch import optim
dir(optim)[:10]

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'LBFGS',
 'NAdam',
 'Optimizer',
 'RAdam']

In [31]:
# Choose the optimizer
optimizer = optim.Adam(model.parameters())

<h6><b>Take a single training step</b></h6>

In [32]:
# Step 0. Get a single training example
# Read it from the dataset behind batch_generator: the name `dataset` is rebound to a
# CustomDataset earlier in the notebook, and its vocabulary-sized feature vectors
# would not fit the model built from the 8-feature inputs.
inputs, target = batch_generator.dataset[0]
inputs, target

(tensor([0.9078, 0.8158, 0.0751, 0.6673, 0.9645, 0.4584, 0.2770, 0.4203]),
 tensor(1))

In [33]:
# Step 1. Make the prediction
prediction = model(inputs)
prediction 

tensor([0.6824], grad_fn=<SigmoidBackward0>)

In [34]:
target

tensor(1)

In [35]:
# Step 2. Compute prediction error
error = loss_function(prediction.squeeze(), target.to(torch.float))
error

tensor(0.3822, grad_fn=<BinaryCrossEntropyBackward0>)

In [36]:
# Step 3. Propagate the error signal backward
error.backward()

In [37]:
model.linear_layer.weight.grad

tensor([[-0.2884, -0.2591, -0.0239, -0.2120, -0.3064, -0.1456, -0.0880, -0.1335]])

In [38]:
# Step 4. Use optimizer to take the optimization step, i.e. update weights and biases of your model
optimizer.step()

In [39]:
# Step 5. Clear the gradients on weights and biases, otherwise they accumulate across iterations
optimizer.zero_grad()

In [40]:
model.linear_layer.weight.grad

tensor([[0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
num_epochs = 10

# The notebook defines no separate validation split, so for demonstration the
# validation loop re-reads the training examples in their stored order.
# Replace this with a DataLoader over held-out examples for a real evaluation.
eval_generator = DataLoader(
    batch_generator.dataset,
    batch_size=batch_generator.batch_size,
    shuffle=False,
)

for epoch in range(num_epochs):

    # training loop
    model.train()

    for inputs, targets in batch_generator:
        # 1. Generate predictions; only the trailing dimension is squeezed so that a
        #    batch holding a single example keeps shape (1,) and still matches its target
        predictions = model(inputs).squeeze(-1)
        # 2. Calculate loss
        loss = loss_function(predictions, targets.to(torch.float))
        # 3. Compute gradients
        loss.backward()
        # 4. Update parameters using gradients
        optimizer.step()
        # 5. Reset the gradients to zero
        optimizer.zero_grad()
        if epoch % 5==0:
            print(f'Epoch {epoch} Loss {loss.item():.4f}')

    # validation loop
    model.eval()

    # stop the gradient tracking once for the whole validation pass
    with torch.no_grad():
        for inputs, targets in eval_generator:
            # 1. Generate predictions
            predictions = model(inputs).squeeze(-1)
            # 2. Calculate loss
            loss = loss_function(predictions, targets.to(torch.float))
            # 6. Reporting
            if epoch % 5==0:
                print(f'Epoch {epoch} Loss {loss.item():.4f}')

### Saving and Loading your model

In [None]:
torch.save(model.state_dict(), "model_params.pt")

In [41]:
dict(model.state_dict())

{'linear_layer.weight': tensor([[0.3255, 0.0191, 0.1578, 0.2199, 0.0296, 0.1698, 0.0059, 0.3398]]),
 'linear_layer.bias': tensor([0.0497])}

In [None]:
# create the model architecture first, then load parameters
model.load_state_dict(torch.load("model_params.pt"))

In [None]:
model.linear_layer.weight