## Tarea 05 - Giovanni Gamaliel López Padilla
### Procesamiento de lenguaje natural
#### Ejercicio 01

## datsets

In [2]:
from argparse import Namespace
from os import makedirs
import numpy as np
import random
import torch


def get_params() -> dict:
    params = {
        "path data": "../Data",
        "train data": "mex_train.txt",
        "train labels": "mex_train_labels.txt",
        "validation data": "mex_val.txt",
        "validation labels": "mex_val_labels.txt",
        "path model": "../Data/Model_01",
        "file model": "model_best.pt",
        "stadistics  file": "stadistics.csv",
    }
    return params


def get_args() -> Namespace:
    args = Namespace()
    args.batch_size = 64
    args.num_workers = 2
    args.N = 6
    # Dimension of word Embeddings
    args.d = 100
    # Dimension for Hidden Layer
    args.d_h = 200
    args.dropout = 0.1
    # Training hyperparameters
    args.lr = 2.3e-1
    args.num_epochs = 100
    args.patience = 20
    # Scheduler hyperparameters
    args.lr_patience = 10
    args.lr_factor = 0.5
    # Save directory
    args.savedir = "model"
    makedirs(args.savedir, exist_ok=True)
    return args


def init_seeds() -> None:
    seed = 1111
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.benchmark = False

### ngram_class

In [3]:
from nltk.tokenize import TweetTokenizer as tokenizer
from nltk import FreqDist, ngrams
from numpy import array, empty
from numpy.random import rand


class ngram_model:

    def __init__(self,
                 N: int,
                 vocab_max: int = 5000,
                 tokenize: tokenizer = None,
                 embeddings_model=None) -> None:
        self.tokenize = tokenize if tokenize else self.default_tokenize
        self.punct = set([
            '.', ',', ';', ':', '-', '^', '»', '!', '¡', '¿', '?', '"', '\'',
            '...', '<url>', '*', '@usuario'
        ])
        self.N = N
        self.vocab_max = vocab_max
        self.unk = '<unk>'
        self.sos = '<s>'
        self.eos = '</s>'
        self.embeddings_model = embeddings_model

    def get_vocabulary_size(self) -> int:
        return len(self.vocabulary)

    def default_tokenize(self, doc: str) -> list:
        return doc.split("  ")

    def remove_word(self, word: str) -> bool:
        word = word.lower()
        is_punct = word in self.punct
        is_digit = word.isnumeric()
        return is_punct or is_digit

    def sortFreqDisct(self, freq_dist) -> list:
        freq_dict = dict(freq_dist)
        return sorted(freq_dict, key=freq_dict.get, reverse=True)

    def get_vocabulary(self, corpus: list) -> set:
        freq_dist = FreqDist([
            letter.lower() for sentence in corpus for letter in sentence
            if not self.remove_word(letter)
        ])
        sorted_words = self.sortFreqDisct(freq_dist)
        return set(sorted_words)

    def fit(self, corpus: list) -> None:
        self.vocabulary = self.get_vocabulary(corpus)
        self.vocabulary.add(self.unk)
        self.vocabulary.add(self.sos)
        self.vocabulary.add(self.eos)
        self.word_index = {}
        self.index_word = {}
        if self.embeddings_model is not None:
            self.embeddings_matrix = empty(
                [self.get_vocabulary_size, self.embeddings_model.vector_size])
        self.make_data(corpus)

    def make_data(self, corpus: str) -> tuple:
        id = 0
        for doc in corpus:
            for word in doc:
                word = word.lower()
                if word in self.vocabulary and not word in self.word_index:
                    self.word_index[word] = id
                    self.index_word[id] = word
                    if self.embeddings_model is not None:
                        if word in self.embeddings_model:
                            self.embedding_matrix[id] = self.embeddings_model[
                                word]
                    id += 1
        # Always add special tokens
        self.word_index.update({
            self.unk: id,
            self.sos: id + 1,
            self.eos: id + 2
        })
        self.index_word.update({
            id: self.unk,
            id + 1: self.sos,
            id + 2: self.eos
        })

    def get_ngram_doc(self, doc: str) -> list:
        doc_tokens = list(doc)
        doc_tokens = self.replace_unk(doc_tokens)
        doc_tokens = [word.lower() for word in doc_tokens]
        doc_tokens = [self.sos] * (self.N - 1) + doc_tokens + [self.eos]
        return list(ngrams(doc_tokens, self.N))

    def replace_unk(self, doc_tokens: list) -> list:
        for i, token in enumerate(doc_tokens):
            if token.lower() not in self.vocabulary:
                doc_tokens[i] = self.unk
        return doc_tokens

    def transform(self, corpus: list) -> tuple:
        X_ngrams = []
        y = []
        for doc in corpus:
            doc_ngram = self.get_ngram_doc(doc)
            for words_window in doc_ngram:
                words_window_ids = [
                    self.word_index[word] for word in words_window
                ]
                X_ngrams.append(list(words_window_ids[:-1]))
                y.append(words_window_ids[-1])
        return array(X_ngrams), array(y)

## models

In [4]:
from numpy import array, mean, asanyarray, sum, exp, argmax, log
from torch.utils.data import DataLoader, TensorDataset
from nltk.tokenize import TweetTokenizer as tokenizer
from sklearn.metrics import accuracy_score
from pandas import read_csv, DataFrame
from numpy.random import multinomial
from itertools import permutations
import torch.nn.functional as F
from argparse import Namespace
from tabulate import tabulate
from shutil import copyfile
from random import shuffle
from os.path import join
import torch.nn as nn
import torch
import time


class Mex_data_class:

    def __init__(self, params: dict, args: Namespace) -> None:
        self.params = params
        self.args = args
        self.read()

    def read(self) -> None:
        """
        Lectura de los archivos de datos a partir de su ruta y nombre de archivo
        """
        train_filename = join(self.params["path data"],
                              self.params["train data"])
        validation_filename = join(self.params["path data"],
                                   self.params["train data"])
        self.train_text = self.read_file(train_filename)
        self.validation_text = self.read_file(validation_filename)

    def read_file(self, filename: str) -> list:
        data = read_csv(filename, engine="python", sep="\r\n", header=None)
        data = list(data[0])
        return data

    def obtain_data_and_labels(self, ngram: ngram_model) -> None:
        self.train_data, self.train_labels = ngram.transform(self.train_text)
        self.validation_data, self.validation_labels = ngram.transform(
            self.validation_text)

    def obtain_loaders(self) -> None:
        self.train_loader = obtain_loader(self.train_data, self.train_labels,
                                          self.args)
        self.validation_loader = obtain_loader(self.validation_data,
                                               self.validation_labels,
                                               self.args)


class neural_language_model(nn.Module):

    def __init__(self, args, embeddings=None) -> None:
        super(neural_language_model, self).__init__()
        self.window_size = args.N - 1
        self.embeding_size = args.d
        self.emb = nn.Embedding(args.vocabulary_size, args.d)
        self.fc1 = nn.Linear(args.d * (args.N - 1), args.d_h)
        self.drop1 = nn.Dropout(p=args.dropout)
        self.fc2 = nn.Linear(args.d_h, args.vocabulary_size, bias=False)
        self.args = args

    def forward(self, x):
        x = self.emb(x)
        x = x.view(-1, self.window_size * self.embeding_size)
        h = F.relu(self.fc1(x))
        h = self.drop1(h)
        return self.fc2(h)

    def read_model(self, path: str, name: str) -> None:
        filename = join(path, name)
        if torch.cuda.is_available():
            self.load_state_dict(torch.load(filename)["state_dict"])
        else:
            self.load_state_dict(
                torch.load(filename,
                           map_location=torch.device('cpu'))["state_dict"])
        self.train(False)


class model_class:

    def __init__(self, model: neural_language_model, args: Namespace,
                 train_loader, validation_loader):
        self.validation_loader = validation_loader
        self.train_loader = train_loader
        self.model = model
        self.args = args

    def get_pred(self, raw_logits):
        probs = F.softmax(raw_logits.detach(), dim=1)
        y_pred = torch.argmax(probs, dim=1).cpu().numpy()
        return y_pred

    def model_eval(self, data):
        with torch.no_grad():
            preds = []
            tgts = []
            for window_words, labels in data:
                if self.args.use_gpu:
                    window_words = window_words.cuda()
                outputs = self.model(window_words)
                # Get prediction
                y_pred = self.get_pred(outputs)
                tgt = labels.numpy()
                tgts.append(tgt)
                preds.append(y_pred)
        tgts = [e for l in tgts for e in l]
        preds = [e for l in preds for e in l]
        return accuracy_score(tgts, preds)

    def save_checkpoint(self,
                        state,
                        is_best: bool,
                        checkpoint_path: str,
                        filename: str = 'checkpoint.pt',
                        best_model_name: str = 'model_best.pt') -> DataFrame:
        print(checkpoint_path, filename)
        name = join(checkpoint_path, filename)
        torch.save(state, name)
        if is_best:
            filename_best = join(checkpoint_path, best_model_name)
            copyfile(name, filename_best)

    def run(self):
        stadistics = DataFrame(
            columns=["Train acc", "Loss", "Val acc", "Time"])
        start_time = time.time()
        best_metric = 0
        metric_history = []
        train_metric_history = []
        criterion, optimizer, scheduler = init_models_parameters(
            self.model, self.args)
        for epoch in range(self.args.num_epochs):
            epoch_start_time = time.time()
            loss_epoch = []
            training_metric = []
            self.model.train()
            for window_words, labels in self.train_loader:
                # If GPU available
                if self.args.use_gpu:
                    window_words = window_words.cuda()
                    labels = labels.cuda()
                # Forward pass
                outputs = self.model(window_words)
                loss = criterion(outputs, labels)
                loss_epoch.append(loss.item())
                # Get Trainning Metrics
                y_pred = self.get_pred(outputs)
                tgt = labels.cpu().numpy()
                training_metric.append(accuracy_score(tgt, y_pred))
                # Backward and Optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # Get Metric in Trainning Dataset
            mean_epoch_metric = mean(training_metric)
            train_metric_history.append(mean_epoch_metric)
            # Get Metric in Validation Dataset
            self.model.eval()
            tuning_metric = self.model_eval(self.validation_loader)
            metric_history.append(mean_epoch_metric)
            # Update Scheduler
            scheduler.step(tuning_metric)
            # Check for Metric Improvement
            is_improvement = tuning_metric > best_metric
            if is_improvement:
                best_metric = tuning_metric
                n_no_improve = 0
            else:
                n_no_improve += 1
            # Save best model if metric improved
            state = {
                'epoch': epoch + 1,
                'state_dict': self.model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'best_metric': best_metric,
            }
            self.save_checkpoint(
                state,
                is_improvement,
                self.args.savedir,
            )
            # Early stopping
            if n_no_improve >= self.args.patience:
                print('No improvement. Breaking out of loop')
                break
            finish_time = time.time() - epoch_start_time
            stadistics.loc[epoch + 1] = [
                mean_epoch_metric,
                mean(loss_epoch), tuning_metric, finish_time
            ]
            print('Train acc: {}'.format(mean_epoch_metric))
            print(
                'Epoch[{}/{}], Loss : {:4f} - Val accuracy: {:4f} - Epoch time: {:2f}'
                .format(epoch + 1, self.args.num_epochs, mean(loss_epoch),
                        tuning_metric, finish_time))
            print('--- %s seconds ---' % (time.time() - start_time))
        return stadistics


class generate_text_class:

    def __init__(self, ngram_data: ngram_model, model: neural_language_model,
                 tokenize: tokenizer) -> None:
        self.ngram_data = ngram_data
        self.tokenize = tokenize
        self.model = model

    def parse_text(self, text: str) -> tuple:
        tokens = self.tokenize(text)
        all_tokens = []
        for word in tokens:
            if word == self.ngram_data.sos:
                all_tokens += [word]
                all_tokens += [" "]
        # División entre dos  porque se estan añadiendo dos elementos por <s> encontrado
        n = len(all_tokens) // 2
        sentence = " ".join(tokens[n:])
        all_tokens += [
            letter.lower()
            if letter in self.ngram_data.word_index else self.ngram_data.unk
            for letter in sentence
        ]
        tokens_id = [
            self.ngram_data.word_index[letter] for letter in all_tokens
        ]
        return all_tokens, tokens_id

    def sample_next_word(self, logits: array, temperature: float) -> int:
        logits = asanyarray(logits).astype("float64")
        preds = logits / temperature
        exp_preds = exp(preds)
        preds = exp_preds / sum(exp_preds)
        probability = multinomial(1, preds)
        return argmax(probability)

    def predict_next_token(self, tokens_id: list) -> int:
        word_index_tensor = torch.LongTensor(tokens_id).unsqueeze(0)
        y_raw_predict = self.model(word_index_tensor).squeeze(
            0).detach().numpy()
        y_pred = self.sample_next_word(y_raw_predict, 1.0)
        return y_pred

    def run(self, initial_text: str):
        tokens, window_word_index = self.parse_text(initial_text)
        for i in range(300):
            y_pred = self.predict_next_token(window_word_index)
            next_word = self.ngram_data.index_word[y_pred]
            tokens.append(next_word)
            if next_word == self.ngram_data.eos:
                break
            else:
                window_word_index.pop(0)
                window_word_index.append(y_pred)
        return "".join(tokens)


def init_models_parameters(model: neural_language_model,
                           args: Namespace) -> tuple:
    args.use_gpu = torch.cuda.is_available()
    if args.use_gpu:
        model.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        "min",
        patience=args.lr_patience,
        verbose=True,
        factor=args.lr_factor)
    return criterion, optimizer, scheduler


def print_closet_words(embeddings, ngram_data, word, n) -> None:
    word_id = torch.LongTensor([ngram_data.word_index[word]])
    word_embed = embeddings(word_id)
    # Compute distances to all words
    dist = torch.norm(embeddings.weight - word_embed, dim=1).detach()
    lst = sorted(enumerate(dist.numpy()), key=lambda x: x[1])
    table = []
    for idx, difference in lst[1:n + 1]:
        table += [[ngram_data.index_word[idx], difference]]
    print(tabulate(table, headers=["Word", "Difference"]))


def obtain_loader(data: array, labels: array, args: Namespace) -> DataLoader:
    dataset = TensorDataset(torch.tensor(data, dtype=torch.int64),
                            torch.tensor(labels, dtype=torch.int64))
    loader = DataLoader(dataset,
                        batch_size=args.batch_size,
                        num_workers=args.num_workers,
                        shuffle=True)
    return loader


def log_likelihood(model: neural_language_model, text: str,
                   ngram_data: ngram_model) -> float:
    x, y = ngram_data.transform(text)
    x, y = x[2:], y[2:]
    x = torch.LongTensor(x).unsqueeze(0)
    logits = model(x).detach()
    probability = F.softmax(logits, dim=1).numpy()
    return sum(log([probability[i][w] for i, w in enumerate(y)]))


def perplexity(model: neural_language_model, text: str,
               ngram_data: ngram_model) -> float:
    perplexity_value = log_likelihood(model, text, ngram_data)
    perplexity_value = -perplexity_value / len(text)
    return perplexity_value


def syntax_structure(model: neural_language_model, ngram_data: ngram_model,
                     word: str) -> None:
    perms = ["".join(perm) for perm in permutations(word)]
    best_log_likelihood = [(log_likelihood(model, pharse, ngram_data), pharse)
                           for pharse in perms]
    best_log_likelihood = sorted(best_log_likelihood, reverse=True)
    headers = ["Palabra", "Perplejidad"]
    print("-" * 40)
    results = []
    for p, i in best_log_likelihood[:5]:
        results += [[i, p]]
    print(tabulate(results, headers=headers))
    print("-" * 40)
    results = []
    for p, i in best_log_likelihood[-5:]:
        results += [[i, p]]
    print(tabulate(results, headers=headers))


def save_stadistics(params: dict, stadistics: DataFrame) -> None:
    filename = join(params["path data"], params["stadistics  file"])
    stadistics.index.name = "Epoch"
    stadistics.to_csv(filename)

### Inicialización de los modelos

In [5]:
from nltk.tokenize import TweetTokenizer as tokenizer
# Semillas de las funciones aleatorias
init_seeds()
# Recoleccion de los parametros y argumentos
params = get_params()
args = get_args()
# Definicion del tokenizer
tokenize = tokenizer().tokenize
print("Lectura de archivos")
# Lectura de los datos
mex_data = Mex_data_class(params, args)
# Inicializacion del modelo de ngramas
ngram = ngram_model(args.N, tokenize=tokenize)
ngram.fit(mex_data.train_text)
# Argumento del tamaño del vocabulario
args.vocabulary_size = ngram.get_vocabulary_size()
# Estructuración de los datos para la red neuronal
mex_data.obtain_data_and_labels(ngram)
mex_data.obtain_loaders()

Lectura de archivos


In [6]:
# Inicializacion de la red neuronal
neural_model = neural_language_model(args)
# Inicializacion del modelo de prediccion
model = model_class(neural_model, args, mex_data.train_loader,
                    mex_data.validation_loader)
# Entrenamiento de la neurona
# stadistics=model.run()
# Guardado de las estadisticas de entrenamiento
# save_stadistics(params,stadistics)
# Lectura de los parametros de la red neuronal
neural_model.read_model(params["path model"], params["file model"])

In [9]:
generate_text = generate_text_class(ngram, neural_model, tokenize)
print("-" * 40)
print("Primer palabra")
print(generate_text.run("<s> hol"))
print("-" * 40)
print("Segunda palabra")
print(generate_text.run("corre"))
print("-" * 40)
print("Tercera palabra")
print(generate_text.run("<s> <s> c"))

----------------------------------------
Primer palabra
<s> holas puta pendeja en lo profertero de la verga<unk> cuando decirá paco caras <unk> conocubir a su madre #tuxglo</s>
----------------------------------------
Segunda palabra
correna la primeran de mi capaz valer 😭💞osa loca de la mierda sus respeta estan</s>
----------------------------------------
Tercera palabra
<s> <s> como me tamper pues que jajajajajajajaja loca<unk></s>


## Punto 2
Escriba 5 ejemplos de oraciones y mídales el likelihood

In [10]:
print("log likelihood",
      log_likelihood(neural_model, "Dejalo que termine", ngram))

log likelihood -230.42316


In [11]:
print(
    "log likelihood",
    log_likelihood(neural_model,
                   "esperate a que tenga servicios, ya completos", ngram))

log likelihood -567.6274


In [12]:
print("log likelihood",
      log_likelihood(neural_model, "asi te ganas un chingo de gente", ngram))

log likelihood -412.32114


In [13]:
print(
    "log likelihood",
    log_likelihood(neural_model, "eso que esten en redes con sus criticas",
                   ngram))

log likelihood -527.38086


In [14]:
print(
    "log likelihood",
    log_likelihood(neural_model, "unas tlayudas no le hacen daño a nadie",
                   ngram))

log likelihood -492.78888


## Punto 3
Escriba un ejemplo de estructura morfológica (permutaciones con caracteres) similar al de estructura sintáctica del profesor con 5 o más caracteres de su gusto (e.g., "ando ")

In [15]:
word = "enojada"
syntax_structure(neural_model, ngram, word)

----------------------------------------
Palabra      Perplejidad
---------  -------------
jonedaa          -66.821
jonedaa          -66.821
joneada          -66.821
joneada          -66.821
joneaad          -66.821
----------------------------------------
Palabra      Perplejidad
---------  -------------
aadjoen         -70.3344
aadjnoe         -70.3344
aadjnoe         -70.3344
aadjneo         -70.3344
aadjneo         -70.3344


## Punto 4
Calcule la perplejidad del modelo sobre los datos val.

In [16]:
perplexity(neural_model, mex_data.validation_text, ngram)

133.86564303751803