<a href="https://colab.research.google.com/github/farshidehkordi/Homework2_AI/blob/main/TP_2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import logging
import random
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from poutyne.framework import Model


class WordClassifier(nn.Module):
    """Character-level word classifier: embedding -> ``nn.RNN`` -> linear layer.

    The prediction is read from the RNN output at the last position of the
    (possibly padded) input, so padded positions are fed through the RNN
    like any other character index.

    Args:
        vocab: Sized collection of known characters; only ``len(vocab)`` is used.
        embedding_size: Dimension of the character embeddings.
        hidden_size: Dimension of the RNN hidden state.
        num_classes: Number of output classes.
    """

    def __init__(self, vocab, embedding_size, hidden_size, num_classes):
        super().__init__()
        vocab_size = len(vocab)
        # Index 0 is reserved for padding; its embedding stays at zero.
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True)
        self.mapping_layer = nn.Linear(hidden_size, num_classes)

    def forward(self, inputs):
        """Return class logits of shape (batch_size, num_classes).

        Args:
            inputs: Tensor of character indices, shape (batch_size, seq_len).
        """
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_size)
        char_vectors = self.embedding(inputs)
        # (batch_size, seq_len, hidden_size); the final hidden state is unused.
        rnn_outputs, _ = self.rnn(char_vectors)
        # Keep only the output at the last time step of each sequence.
        final_step = rnn_outputs[:, -1, :]
        return self.mapping_layer(final_step)


class WordClassifierHandlingPadding(WordClassifier):
    """Variant of ``WordClassifier`` that ignores padded positions.

    Instead of reading the RNN output at the last (possibly padded) time
    step, the batch is packed so the RNN stops at the true end of each word
    and the prediction is made from that final hidden state.
    """

    def __init__(self, vocab, embedding_size, hidden_size, num_classes):
        super().__init__(vocab, embedding_size, hidden_size, num_classes)

    def forward(self, inputs):
        """Return class logits of shape (batch_size, num_classes).

        Args:
            inputs: LongTensor of character indices, shape
                (batch_size, seq_len), right-padded with the embedding's
                padding index. The padding index must not occur inside a
                word, since true lengths are derived by counting non-padding
                entries.
        """
        pad_idx = self.embedding.padding_idx
        # True length of every word; clamp so that an all-padding row still
        # has length 1 (packing rejects zero-length sequences).
        lengths = (inputs != pad_idx).sum(dim=1).clamp(min=1)

        embedded = self.embedding(inputs)  # (batch_size, seq_len, embedding_size)
        # Lengths must live on the CPU; enforce_sorted=False lets PyTorch
        # sort the batch internally and restore the original order.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        # hidden: (num_layers, batch_size, hidden_size), taken at each
        # sequence's last real time step.
        _, hidden = self.rnn(packed)
        return self.mapping_layer(hidden[-1])


def vectorize_dataset(dataset, char_to_idx, class_to_idx):
    """Convert (word, language) pairs into (character indices, class index).

    Args:
        dataset: Iterable of (word, language) pairs.
        char_to_idx: Mapping from character to integer index.
        class_to_idx: Mapping from language to class index.

    Returns:
        A list of (list of character indices, class index) tuples, in the
        same order as ``dataset``.

    Raises:
        KeyError: If a language is missing from ``class_to_idx``.
    """
    unknown_idx = 1  # Characters absent from char_to_idx map to the unknown index.
    return [
        ([char_to_idx.get(symbol, unknown_idx) for symbol in word], class_to_idx[lang])
        for word, lang in dataset
    ]


def load_data(filename):
    """Read whitespace-separated examples from a text file.

    Each non-blank line is split on whitespace and returned as a list of
    fields (the rest of this file expects two fields per line: word and
    language). Blank lines are skipped.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A list with one list of fields per non-blank line, in file order.
    """
    examples = []
    # Explicit encoding: the platform default is not guaranteed to decode
    # non-ASCII words.
    with open(filename, encoding="utf-8") as fhandle:
        for line in fhandle:
            # split() already discards the trailing newline; slicing off the
            # last character would truncate a final line that has none.
            fields = line.split()
            if fields:
                examples.append(fields)
    return examples


def create_indexes(examples):
    """Build the character and class vocabularies from (word, language) pairs.

    Indices are assigned in order of first appearance. The character
    vocabulary always starts with "<pad>" at 0 and "<unk>" at 1.

    Args:
        examples: Iterable of (word, language) pairs.

    Returns:
        A ``(char_to_idx, class_to_idx)`` tuple of dictionaries.
    """
    char_to_idx = {"<pad>": 0, "<unk>": 1}
    class_to_idx = {}

    for word, lang in examples:
        # setdefault only inserts unseen keys; the next free index is the
        # dictionary's size before insertion.
        class_to_idx.setdefault(lang, len(class_to_idx))
        for symbol in word:
            char_to_idx.setdefault(symbol, len(char_to_idx))
    return char_to_idx, class_to_idx


def make_max_padded_dataset(dataset):
    """Pad every vectorized word to the length of the longest one.

    Args:
        dataset: Sequence of (list of character indices, class index) pairs.

    Returns:
        A ``(inputs, labels)`` tuple of LongTensors. ``inputs`` has shape
        (len(dataset), max word length) and is right-padded with zeros;
        ``labels`` has shape (len(dataset),). An empty dataset yields
        tensors of shape (0, 0) and (0,).
    """
    # default=0 keeps an empty dataset from raising ValueError in max().
    max_length = max((len(word) for word, _ in dataset), default=0)
    tensor_dataset = torch.zeros((len(dataset), max_length), dtype=torch.long)
    for row, (word, _) in enumerate(dataset):
        # Positions past len(word) keep the zero (padding) index.
        tensor_dataset[row, :len(word)] = torch.tensor(word, dtype=torch.long)
    labels = torch.tensor([label for _, label in dataset], dtype=torch.long)
    return tensor_dataset, labels


def collate_examples(samples):
    """Collate a batch of vectorized examples, padding to the batch maximum.

    Intended for use as a ``DataLoader`` ``collate_fn`` so that each batch is
    only padded to the length of its own longest word.

    Args:
        samples: Sequence of (character indices, class index) pairs, where
            the character indices are a list of ints or a 1-D tensor.

    Returns:
        A ``(inputs, labels)`` tuple of LongTensors. ``inputs`` has shape
        (len(samples), longest word in the batch) and is right-padded with
        zeros; ``labels`` has shape (len(samples),).
    """
    longest = max((len(word) for word, _ in samples), default=0)
    inputs = torch.zeros((len(samples), longest), dtype=torch.long)
    for row, (word, _) in enumerate(samples):
        # Positions past len(word) keep the zero (padding) index.
        inputs[row, :len(word)] = torch.as_tensor(word, dtype=torch.long)
    labels = torch.tensor([int(label) for _, label in samples], dtype=torch.long)
    return inputs, labels


def main():
    """Train the baseline ``WordClassifier`` and log its test loss and accuracy.

    Reads ``./data/train.txt`` and ``./data/test.txt``, builds the
    vocabularies from the training split, pads each split to its own
    longest word, trains for 5 epochs and evaluates on the test split.
    """
    batch_size = 128
    embedding_size = 10
    hidden_size = 10

    raw_train = load_data("./data/train.txt")
    raw_test = load_data("./data/test.txt")

    # Vocabularies come from the training split only.
    char_to_idx, class_to_idx = create_indexes(raw_train)

    # Vectorize and pad each split, then wrap the tensors as a dataset.
    train_dataset, test_dataset = [
        TensorDataset(
            *make_max_padded_dataset(
                vectorize_dataset(raw_split, char_to_idx, class_to_idx)
            )
        )
        for raw_split in (raw_train, raw_test)
    ]

    # Only the training batches are shuffled.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    network = WordClassifier(
        char_to_idx, embedding_size, hidden_size, len(class_to_idx)
    )
    model = Model(network, 'sgd', 'cross_entropy', batch_metrics=['accuracy'])
    model.fit_generator(train_loader, epochs=5)

    loss, acc = model.evaluate_generator(test_loader)
    logging.info("1 - Loss: {}\tAcc:{}".format(loss, acc))


if __name__ == "__main__":
    # Let INFO-level messages (the final loss/accuracy line) through.
    logging.getLogger().setLevel(logging.INFO)

    # Seed every random number generator in use so runs are repeatable.
    seed = 42
    for seed_fn in (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    ):
        seed_fn(seed)

    main()
