<a href="https://colab.research.google.com/github/farshidehkordi/Homework2_AI/blob/main/TP2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os


# List files in the current directory
print(os.listdir())

['.config', 'sample_data']


In [4]:
!unzip ./codeTP2.zip
# extract files


Archive:  ./codeTP2.zip
   creating: code/data-q2/
  inflating: code/data-q2/test.txt   
  inflating: code/data-q2/train.txt  
  inflating: code/q2-RNN.py          
  inflating: code/question1.py       


In [3]:
!pip install poutyne

Collecting poutyne
  Downloading Poutyne-1.17.1-py3-none-any.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.5/213.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics (from poutyne)
  Downloading torchmetrics-1.3.2-py3-none-any.whl (841 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->poutyne)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->poutyne)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->poutyne)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->poutyne)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manyli

Initial architecture with the bidirectional WordClassifier class

In [27]:
import logging
import random
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from poutyne.framework import Model
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence



# Define embedding_size and hidden_size
embedding_size = 10
hidden_size = 10


# bidirectional
class WordClassifier(nn.Module):
    def __init__(self, vocab, embedding_size, hidden_size, num_classes, dropout_prob=0.5):
        super().__init__()
        self.embedding = nn.Embedding(len(vocab), embedding_size, padding_idx=0)
        self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_prob)  # Dropout layer
        self.mapping_layer = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, inputs):
        embedded = self.embedding(inputs)
        output, _ = self.rnn(embedded)
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)  # Apply dropout
        logits = self.mapping_layer(last_output)
        return logits

class WordClassifierHandlingPadding(WordClassifier):
    def __init__(self, vocab, embedding_size, hidden_size, num_classes):
        super().__init__(vocab, embedding_size, hidden_size, num_classes)

    def forward(self, inputs):
        # If inputs is a tuple, assume it contains sequences and lengths
        if isinstance(inputs, tuple):
            sequences, lengths = inputs
        # If inputs is not a tuple, assume it contains only sequences and compute lengths
        else:
            sequences = inputs
            lengths = torch.count_nonzero(sequences, dim=1).cpu().tolist()

        # Embed the sequences
        embedded = self.embedding(sequences)  # Shape: (batch_size, seq_len, embedding_size)

        # Pack the embedded sequences to handle padding
        packed_embedded = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)

        # Pass the packed sequences through the RNN
        packed_output, _ = self.rnn(packed_embedded)

        # Unpack the packed output
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Take the last output of each sequence
        last_output = output[:, -1, :]

        # Pass the last output through the mapping layer
        logits = self.mapping_layer(last_output)  # Shape: (batch_size, num_classes)

        return logits




def vectorize_dataset(dataset, char_to_idx, class_to_idx):
    vectorized_dataset = list()
    for word, lang in dataset:
        label = class_to_idx[lang]
        vectorized_word = list()
        for char in word:
            # Check if the character is in the vocabulary
            if char in char_to_idx:
                vectorized_word.append(char_to_idx[char])
            else:
                # If the character is not in the vocabulary, use the unknown token index
                vectorized_word.append(char_to_idx["<unk>"])
        vectorized_dataset.append((vectorized_word, label))
    return vectorized_dataset



def load_data(filename):
    examples = list()
    with open(filename) as fhandle:
        for line in fhandle:
            examples.append(line[:-1].split())
    return examples


def create_indexes(examples):
    char_to_idx = {"<pad>": 0, "<unk>": 1}
    class_to_idx = {}

    for word, lang in examples:
        if lang not in class_to_idx:
            class_to_idx[lang] = len(class_to_idx)
        for char in word:
            if char not in char_to_idx:
                char_to_idx[char] = len(char_to_idx)
    return char_to_idx, class_to_idx


def make_max_padded_dataset(dataset):
    max_length = max([len(w) for w, l in dataset])
    tensor_dataset = torch.zeros((len(dataset), max_length), dtype=torch.long)
    labels = list()
    for i, (word, label) in enumerate(dataset):
        tensor_dataset[i, :len(word)] = torch.LongTensor(word)
        labels.append(label)
    return tensor_dataset, torch.LongTensor(labels)



def collate_examples(samples, padding_value=0):
    # Find the maximum sequence length within the batch
    max_len = max(len(word) for word, _ in samples)

    padded_sequences = []
    labels = []

    # Pad each sequence to have the same length as the longest sequence in the batch
    for word, label in samples:
        # Pad the sequence to match the maximum length
        padded_sequence = torch.tensor(word + [padding_value] * (max_len - len(word)), dtype=torch.long)
        padded_sequences.append(padded_sequence)
        labels.append(label)

    # Convert the lists to PyTorch tensors
    padded_sequences_tensor = torch.stack(padded_sequences)
    labels_tensor = torch.tensor(labels, dtype=torch.long)

    return padded_sequences_tensor, labels_tensor



def main():
    batch_size = 128
    training_set = load_data("./code/data-q2/train.txt")
    test_set = load_data("./code/data-q2/test.txt")

    char_to_idx, class_to_idx = create_indexes(training_set)

    vectorized_train = vectorize_dataset(training_set, char_to_idx, class_to_idx)
    vectorized_test = vectorize_dataset(test_set, char_to_idx, class_to_idx)

    X_train, y_train = make_max_padded_dataset(vectorized_train)
    X_test, y_test = make_max_padded_dataset(vectorized_test)

    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)


    # Define the network
    network = WordClassifier(char_to_idx, embedding_size, hidden_size, len(class_to_idx), dropout_prob=0.7)

    # Define optimizer with weight decay
    optimizer = SGD(network.parameters(), lr=0.01, weight_decay=1e-5)


    # 1: Créez un réseau simple qui prend en entré des exemples de longueur fixes (max length)
    network = WordClassifier(char_to_idx, 10, 10, len(class_to_idx))
    model = Model(network, 'sgd', 'cross_entropy', batch_metrics=['accuracy'])
    model.fit_generator(train_loader, epochs=5)
    loss, acc = model.evaluate_generator(test_loader)
    logging.info("1 - Loss: {}\tAcc:{}".format(loss, acc))


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    np.random.seed(42)
    random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    main()


Epoch: 1/5 Train steps: 1696 26.64s loss: 2.297778 acc: 11.963394                                
Epoch: 2/5 Train steps: 1696 26.92s loss: 2.292957 acc: 12.812656                                
Epoch: 3/5 Train steps: 1696 29.52s loss: 2.291645 acc: 13.009419                                
Epoch: 4/5 Train steps: 1696 27.20s loss: 2.291035 acc: 13.029694                                
Epoch: 5/5 Train steps: 1696 26.10s loss: 2.290884 acc: 13.034763                                
Test steps: 424 2.14s test_loss: 2.291269 test_acc: 13.042356                                


INFO:root:1 - Loss: 2.291268605428655	Acc:13.042356324309239


Adding the collate_examples function

In [None]:
import logging
import random
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from poutyne.framework import Model
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.optim import SGD


# Define embedding_size and hidden_size
embedding_size = 10
hidden_size = 10


# bidirectional
class WordClassifier(nn.Module):
    def __init__(self, vocab, embedding_size, hidden_size, num_classes, dropout_prob=0.5):
        super().__init__()
        self.embedding = nn.Embedding(len(vocab), embedding_size, padding_idx=0)
        self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_prob)  # Dropout layer
        self.mapping_layer = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, inputs):
        embedded = self.embedding(inputs)
        output, _ = self.rnn(embedded)
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)  # Apply dropout
        logits = self.mapping_layer(last_output)
        return logits



class WordClassifierHandlingPadding(WordClassifier):
    def __init__(self, vocab, embedding_size, hidden_size, num_classes):
        super().__init__(vocab, embedding_size, hidden_size, num_classes)

    def forward(self, inputs):
        # If inputs is a tuple, assume it contains sequences and lengths
        if isinstance(inputs, tuple):
            sequences, lengths = inputs
        # If inputs is not a tuple, assume it contains only sequences and compute lengths
        else:
            sequences = inputs
            lengths = torch.count_nonzero(sequences, dim=1).cpu().tolist()

        # Embed the sequences
        embedded = self.embedding(sequences)  # Shape: (batch_size, seq_len, embedding_size)

        # Pack the embedded sequences to handle padding
        packed_embedded = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)

        # Pass the packed sequences through the RNN
        packed_output, _ = self.rnn(packed_embedded)

        # Unpack the packed output
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Take the last output of each sequence
        last_output = output[:, -1, :]

        # Pass the last output through the mapping layer
        logits = self.mapping_layer(last_output)  # Shape: (batch_size, num_classes)

        return logits




def vectorize_dataset(dataset, char_to_idx, class_to_idx):
    vectorized_dataset = list()
    for word, lang in dataset:
        label = class_to_idx[lang]
        vectorized_word = list()
        for char in word:
            # Check if the character is in the vocabulary
            if char in char_to_idx:
                vectorized_word.append(char_to_idx[char])
            else:
                # If the character is not in the vocabulary, use the unknown token index
                vectorized_word.append(char_to_idx["<unk>"])
        vectorized_dataset.append((vectorized_word, label))
    return vectorized_dataset



def load_data(filename):
    examples = list()
    with open(filename) as fhandle:
        for line in fhandle:
            examples.append(line[:-1].split())
    return examples


def create_indexes(examples):
    char_to_idx = {"<pad>": 0, "<unk>": 1}
    class_to_idx = {}

    for word, lang in examples:
        if lang not in class_to_idx:
            class_to_idx[lang] = len(class_to_idx)
        for char in word:
            if char not in char_to_idx:
                char_to_idx[char] = len(char_to_idx)
    return char_to_idx, class_to_idx


def make_max_padded_dataset(dataset):
    max_length = max([len(w) for w, l in dataset])
    tensor_dataset = torch.zeros((len(dataset), max_length), dtype=torch.long)
    labels = list()
    for i, (word, label) in enumerate(dataset):
        tensor_dataset[i, :len(word)] = torch.LongTensor(word)
        labels.append(label)
    return tensor_dataset, torch.LongTensor(labels)



def collate_examples(samples, padding_value=0):
    # Find the maximum sequence length within the batch
    max_len = max(len(word) for word, _ in samples)

    padded_sequences = []
    labels = []

    # Pad each sequence to have the same length as the longest sequence in the batch
    for word, label in samples:
        # Pad the sequence to match the maximum length
        padded_sequence = torch.tensor(word + [padding_value] * (max_len - len(word)), dtype=torch.long)
        padded_sequences.append(padded_sequence)
        labels.append(label)

    # Convert the lists to PyTorch tensors
    padded_sequences_tensor = torch.stack(padded_sequences)
    labels_tensor = torch.tensor(labels, dtype=torch.long)

    return padded_sequences_tensor, labels_tensor



def main():
    batch_size = 128
    training_set = load_data("./code/data-q2/train.txt")
    test_set = load_data("./code/data-q2/test.txt")

    char_to_idx, class_to_idx = create_indexes(training_set)

    vectorized_train = vectorize_dataset(training_set, char_to_idx, class_to_idx)
    vectorized_test = vectorize_dataset(test_set, char_to_idx, class_to_idx)

    X_train, y_train = make_max_padded_dataset(vectorized_train)
    X_test, y_test = make_max_padded_dataset(vectorized_test)

    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Define the network
    # Define the network
    network = WordClassifier(char_to_idx, embedding_size, hidden_size, len(class_to_idx), dropout_prob=0.7)

    # Define optimizer with weight decay
    optimizer = SGD(network.parameters(), lr=0.01, weight_decay=1e-5)


    # 2: Ensure padding is done "on batch"
    train_loader = DataLoader(vectorized_train, batch_size=128, shuffle=True, collate_fn=collate_examples)
    test_loader = DataLoader(vectorized_test, batch_size=128, collate_fn=collate_examples)
    model = Model(network, optimizer, 'cross_entropy', batch_metrics=['accuracy'])
    model.fit_generator(train_loader, epochs=5)
    loss, acc = model.evaluate_generator(test_loader)
    logging.info("2 - Loss: {}\tAcc:{}".format(loss, acc))




if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    np.random.seed(42)
    random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    main()


Epoch: 1/5 Train steps: 1696 20.62s loss: 2.303213 acc: 11.609496                                
Epoch: 2/5 Train steps: 1696 20.31s loss: 2.292226 acc: 13.107109                                
Epoch: 3/5 Train steps: 1696 20.44s loss: 2.098547 acc: 19.032588                                
Epoch: 4/5 Train steps: 1696 21.51s loss: 2.180794 acc: 16.213389                                
Epoch: 5/5 Train steps: 1696 20.14s loss: 1.986190 acc: 21.363796                                
Test steps: 424 2.53s test_loss: 1.786515 test_acc: 25.050688                                 


INFO:root:2 - Loss: 1.7865145067761359	Acc:25.05068751077908


Added WordClassifierHandlingPadding function

In [None]:
import logging
import random
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from poutyne.framework import Model
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn.init import xavier_uniform_
from torch.nn.utils import clip_grad_norm_

# Define hyperparameters
embedding_size = 50
hidden_size = 100
learning_rate = 0.01
weight_decay = 1e-4
num_epochs = 20
batch_size = 128
gradient_clip = 1.0


class WordClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, num_classes, dropout_prob):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_prob)
        self.mapping_layer = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, inputs):
        embedded = self.embedding(inputs)
        output, _ = self.rnn(embedded)
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.mapping_layer(last_output)
        return logits


class WordClassifierHandlingPadding(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, num_classes, dropout_prob):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.rnn_padding = nn.RNN(embedding_size, hidden_size, batch_first=True, bidirectional=True, num_layers=2)
        self.dropout = nn.Dropout(dropout_prob)
        self.mapping_layer = nn.Linear(hidden_size * 2, num_classes)
        self.batch_norm = nn.BatchNorm1d(embedding_size)

        # Initialize the weights
        self.init_weights()

    def init_weights(self):
        # Initialize embedding layer
        xavier_uniform_(self.embedding.weight.data)

        # Initialize RNN weights
        for layer in self.rnn_padding._all_weights:
            for param in layer:
                if 'weight' in param:
                    xavier_uniform_(getattr(self.rnn_padding, param))

        # Initialize linear layer
        xavier_uniform_(self.mapping_layer.weight.data)

    def forward(self, inputs):
        if isinstance(inputs, tuple):
            sequences, lengths = inputs
        else:
            sequences = inputs
            lengths = torch.count_nonzero(sequences, dim=1).cpu().tolist()

        embedded = self.embedding(inputs)
        embedded = self.batch_norm(embedded.permute(0, 2, 1)).permute(0, 2, 1)
        packed_embedded = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.rnn_padding(packed_embedded)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.mapping_layer(last_output)
        return logits


def load_data(filename):
    examples = []
    with open(filename) as f:
        for line in f:
            examples.append(line.strip().split())
    return examples


def create_indexes(examples):
    char_to_idx = {"<pad>": 0, "<unk>": 1}
    class_to_idx = {}
    for word, lang in examples:
        class_to_idx.setdefault(lang, len(class_to_idx))
        for char in word:
            char_to_idx.setdefault(char, len(char_to_idx))
    return char_to_idx, class_to_idx


def vectorize_dataset(dataset, char_to_idx, class_to_idx):
    vectorized_dataset = []
    for word, lang in dataset:
        label = class_to_idx.get(lang, 0)
        vectorized_word = [char_to_idx.get(char, 1) for char in word]
        vectorized_dataset.append((vectorized_word, label))
    return vectorized_dataset


def make_max_padded_dataset(dataset):
    max_length = max(len(w) for w, _ in dataset)
    tensor_dataset = torch.zeros((len(dataset), max_length), dtype=torch.long)
    labels = []
    for i, (word, label) in enumerate(dataset):
        tensor_dataset[i, :len(word)] = torch.LongTensor(word)
        labels.append(label)
    return tensor_dataset, torch.LongTensor(labels)


def main():
    training_set = load_data("./code/data-q2/train.txt")
    test_set = load_data("./code/data-q2/test.txt")

    char_to_idx, class_to_idx = create_indexes(training_set)

    vectorized_train = vectorize_dataset(training_set, char_to_idx, class_to_idx)
    vectorized_test = vectorize_dataset(test_set, char_to_idx, class_to_idx)

    X_train, y_train = make_max_padded_dataset(vectorized_train)
    X_test, y_test = make_max_padded_dataset(vectorized_test)

    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Define the network
    network = WordClassifierHandlingPadding(len(char_to_idx), embedding_size, hidden_size, len(class_to_idx),
                                            dropout_prob=0.7)

    # Define the optimizer and scheduler
    optimizer = Adam(network.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)

    # Create the Poutyne Model
    model = Model(network, optimizer, 'cross_entropy', batch_metrics=['accuracy'])

   # Training loop
    for epoch in range(num_epochs):
        model.fit_generator(train_loader)

        loss, acc = model.evaluate_generator(test_loader)
        logging.info("Epoch: {}/{} Loss: {:.4f} Accuracy: {:.4f}".format(epoch + 1, num_epochs, loss, acc))
        scheduler.step(loss)

if __name__ == "__main__":
    # Set random seed for reproducibility
    np.random.seed(42)
    random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    main()


Epoch:    1/1000 Train steps: 1696 2m12.48s loss: 2.315458 acc: 13.302490                              
Epoch:    2/1000 Train steps: 1696 4m14.75s loss: 2.322811 acc: 13.244890                              
Epoch:    3/1000 Train steps: 1696 4m15.13s loss: 2.324293 acc: 13.230144                              
Epoch:    4/1000 Train steps: 1696 4m14.77s loss: 2.323531 acc: 13.263322                              
Epoch:    5/1000 Train steps: 1696 4m8.44s loss: 2.323611 acc: 13.221849                              
Epoch:    6/1000 Train steps: 1696 4m10.60s loss: 2.320723 acc: 13.224614                              
Epoch:    7/1000 Train steps: 1696 4m9.83s loss: 2.316950 acc: 13.317697                               
Epoch:    8/1000 Train steps: 1696 4m9.82s loss: 2.320228 acc: 13.284519                              
Epoch:    9/1000 Step:   91/1696   5.37% |█                   |ETA: 3m47.15s loss: 2.409532 acc: 12.500000