In [37]:
from tqdm import tqdm
import pickle
import numpy as np
import torch
import torchtext.datasets as datasets
import torch.utils.data as data
import typing as t
import numpy as np

seed = 42
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Task I: Word-based CNN for Text Classification

### 1. Data

The dataset that we are going to use is the imdb dataset of movie reviews. These are labelled by sentiment (positive/negative). 

The reviews have been preprocessed, and each review is encoded as a sequence of word indexes (integers). 

For convenience, words are indexed by overall frequency in the dataset, so that for instance the integer "3" encodes the 3rd most frequent word in the data. This allows for quick filtering operations such as: "only consider the top 10,000 most common words, but eliminate the top 20 most common words".

More information regarding the dataset can be found in the official [documentation](https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification).


### 2. Preprocess the text data 

In this particular case, where we are using the imdb dataset there is no need to do all the traditional preprocessings that we normally do when dealing with NLP problems. Part of them are already done at this point.

  - Split the dataset in train and test (maybe also validation).
  - Tokenize and transform to integer index. Here we would need to: 
    - instantitate a *Tokenizer()* object, 
    - fit that object on the text on which we are training the model (use the *fit_on_texts()* method)
    - call *texts_to_sequences()* for both the training and the test text.

  - **Add padding to ensure that all vectors have the same dimensionality.** Note that this is the only pre-processing that needs to be done in the case of the current imdb dataset.

In [47]:
class IMDBDataset(data.Dataset):
    padding_idx: int = 10001

    def __init__(self, length: int=1000, split: t.Literal['train', 'test']='train'):
        super(IMDBDataset, self).__init__()

        # Load the dataset
        train_samples, train_labels, test_samples, test_labels = pickle.load(open('./imdb.pkl', 'rb'))

        # Persist only the specific split
        self.samples: t.List[t.List[int]] = train_samples if split == 'train' else test_samples
        self.labels: t.List[int] = train_labels if split == 'train' else test_labels
        self.length = length
        self.padding_idx = IMDBDataset.padding_idx

    def __getitem__(self, index) -> t.Tuple[torch.Tensor, torch.Tensor]:
        sample = self.samples[index]
        label = self.labels[index]

        # Perform trimming
        if len(sample) > self.length:
            sample = sample[:self.length]

        # Transform to tensor
        sample = torch.tensor(sample)
        label = torch.tensor(label)

        # Perform padding
        sample = torch.nn.functional.pad(sample, (0, self.length - len(sample)), value=self.padding_idx)
        return sample, label

    def __len__(self) -> int:
        return len(self.samples)


gen = torch.Generator(device='cpu').manual_seed(seed)
train_data = IMDBDataset(split='train')
test_data = IMDBDataset(split='test')
train_data, valid_data = data.random_split(train_data, [0.8, 0.2], gen)

### 3.  Define the model de dataset and the training loop

Similar to the privious lab while following the model architecture described in the comments.

In [64]:
import torch.nn as nn


class ConvLayer(torch.nn.Module):
    def __init__(self,
                 input_channels: int,
                 output_channels: int,
                 kernel_size: int,
                 padding: int):
        super(ConvLayer, self).__init__()

        # Create layers
        self.conv = nn.Conv1d(in_channels=input_channels,
                              out_channels=output_channels,
                              kernel_size=kernel_size,
                              padding=padding)
        self.bn = nn.BatchNorm1d(num_features=output_channels)
        self.activ_fn = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        x: torch.Tensor = self.conv(input)
        x = self.bn(x)
        x = self.activ_fn(x)
        x = self.pool(x)
        return x


class TextCNN(torch.nn.Module):
    def __init__(self, padding_idx: int=IMDBDataset.padding_idx):
        super(TextCNN, self).__init__()

        # Model Configuration
        self.num_embeddings = 10002
        self.embedding_dim = 100

        # Define an embedding layer with a vocabulary size of 10002
        # an output embedding size of 100
        # and a padding_idx equal to the one used - 10001
        self.embedding = nn.Embedding(num_embeddings=self.num_embeddings,
                                      embedding_dim=self.embedding_dim,
                                      padding_idx=padding_idx)

        # Define the following sequence of layers
        # A dropout layer with a probability of 0.4
        self.drop1 = nn.Dropout(p=0.4)

        # A 1D Convolutional layer with 100 input channels, 128 output channels, kernel size of 3 and a padding of 1
        # A 1D Batch Normalization Layer for 128 features
        # A ReLU activation
        # A 1D Maxpooling layer with size 2
        self.conv_layer1 = ConvLayer(100, 128, 3, 1)

        # A 1D Convolutional layer with 128 input channels, 128 output channels, kernel size of 5 and a padding of 2
        # A 1D Batch Normalization Layer for 128 features
        # A ReLU activation
        # A 1D Maxpooling layer with size 2
        self.conv_layer2 = ConvLayer(128, 128, 5, 2)

        # A 1D Convolutional layer with 128 input channels, 128 output channels, kernel size of 5 and a padding of 2
        # A 1D Batch Normalization Layer for 128 features
        # A ReLU activation
        # A 1D Maxpooling layer with size 2
        self.conv_layer3 = ConvLayer(128, 128, 5, 2)

        # A global Average pooling layer, which in this scenario, will be an 1D Avgerage Pooling layer
        # with size 125 and stride 125
        self.pool = nn.AvgPool1d(125, 125)

        # A Flattening layer
        self.flatten = nn.Flatten()

        # A Linear layer with 128 input features and 2 outputs and no activation function
        self.dense = nn.Linear(in_features=128, out_features=2)


    def forward(self, input):
        # forward the input through the embedding layer
        x: torch.Tensor = self.embedding(input)

        # permute the input such that it becomes channels first
        x = x.permute((1, 0))

        # forward the input through the rest of the sequence of layers
        x = self.conv_layer1(x)
        x = self.conv_layer2(x)
        x = self.conv_layer3(x)
        x = self.pool(x)
        x = self.flatten(x)
        x = self.dense(x)
        return x


model = TextCNN()

# instantiate the model
# define an Adam optimizer for the model with a lr of 1e-3
# define a Cross Entropy loss function

# define the training dataset and dataloader and the test dataset and dataloader

# write the training loop as defined in Lab 1 and train the model

In [65]:
sample = train_data[0][0]

In [73]:
torch.cat([train_data[0][0], train_data[1][0]])

tensor([    1,  3797,    10,  ..., 10001, 10001, 10001])

In [None]:
train_data[0]

In [67]:
model(sample)

RuntimeError: running_mean should contain 1000 elements not 128