In [2]:
import os
import gc

import pandas as pd
import numpy as np

import tensorflow as tf


import matplotlib.pyplot as plt
import seaborn as sns

from transformers import AutoTokenizer, AutoModel

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as t_funcional

# Arquivos Pythons com funções e constantes
from qs_constants import AppConstants
from qs_functions import load_qa_pairs, normalize_qa, remove_duplication, remove_bad_answer


## Análise e Preparação das Perguntas/Respostas

##### Análise inicial, somente com as primeiras 10 linhas:
- Case diversos - deve ser tratado.
- Algumas respostas tem pontuação - deve ser tratado.
- Algumas perguntas não incluem o contexto - Did his mother die of pneumonia?. Validar se quando o title não aparecer no texto ele deve ser incluído.

In [3]:
# Load the raw question/answer pairs from the source configured in AppConstants.S08_QA.
df_inicial = load_qa_pairs(qa_source=AppConstants.S08_QA)

In [4]:
# Show the first 10 rows for a quick visual inspection.
df_inicial.head(10)

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,S08_set3_a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,S08_set3_a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4
5,Abraham_Lincoln,Did his mother die of pneumonia?,No.,easy,easy,S08_set3_a4
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months,medium,easy,S08_set3_a4
7,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months.,medium,medium,S08_set3_a4
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832,medium,easy,S08_set3_a4
9,Abraham_Lincoln,When did Lincoln begin his political career?,1832.,medium,medium,S08_set3_a4


##### Missing Values
- Retirar perguntas nulas
- Retirar respostas nulas

In [5]:
# Count the missing values per column, then show the overall (rows, columns) shape.
print(df_inicial.isna().sum())
print(df_inicial.shape)

ArticleTitle                  0
Question                     19
Answer                      240
DifficultyFromQuestioner    491
DifficultyFromAnswerer      242
ArticleFile                   2
dtype: int64
(1715, 6)


In [6]:
# Check for inconsistencies: rows that have an answer but a null question.
df_inicial[(df_inicial.Question.isna()) &
           (df_inicial.Answer.notna())]

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile


In [7]:
# List the rows whose ArticleFile is missing.
df_inicial[df_inicial.ArticleFile.isna()]

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
573,Gerald_Ford,Was Ford active about Vietnamese affairs?,No,,,
574,\n,hard,hard,S08_set3_a10,,


In [8]:
# Remove missing values: keep only rows that have both a question and an answer.
df_preparado = df_inicial.dropna(subset=["Question", "Answer"])
print(df_preparado.shape)

(1475, 6)


In [9]:
# Inspect the first 50 rows that remain after dropping null questions/answers.
df_preparado.head(50)

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,S08_set3_a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,S08_set3_a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4
5,Abraham_Lincoln,Did his mother die of pneumonia?,No.,easy,easy,S08_set3_a4
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months,medium,easy,S08_set3_a4
7,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months.,medium,medium,S08_set3_a4
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832,medium,easy,S08_set3_a4
9,Abraham_Lincoln,When did Lincoln begin his political career?,1832.,medium,medium,S08_set3_a4


In [10]:
# Frequency of each distinct answer string (case and punctuation variants are counted separately).
df_preparado.Answer.value_counts()

Yes                                                                                                                                     219
yes                                                                                                                                     211
no                                                                                                                                       58
No                                                                                                                                       57
Yes.                                                                                                                                     52
                                                                                                                                       ... 
being farmed for their meat, eggs, feathers, (particularly their down)                                                                    1
He fell in love with

#### Preparação dos dados

Questões para tratamento
- Lowercase
- Pontuação da resposta: se o último caractere for . ou ! ou ?, ele deve ser retirado.
- Pontuação da pergunta, devemos tirar a interrogação?
- Retirar perguntas duplicadas: respostas nulas já foram retiradas no pandas. Escolhe sempre a primeira ocorrência.
- Remover respostas ruins: blah, (What)?, blah blah blah

---

##### Carrega o vetor de perguntas e respostas realizando os tratamentos iniciais

In [11]:
# Build the QA array and run each cleaning step in order,
# reporting the array shape after every step.
vet_QA = df_preparado[["Question", "Answer", "ArticleTitle","ArticleFile"]].to_numpy()
for rotulo, etapa in (('normalizado', normalize_qa),
                      ('Após remoção bad answers', remove_bad_answer),
                      ('Após remoção duplicadas', remove_duplication)):
    vet_QA = etapa(vet_QA)
    print(rotulo, vet_QA.shape)

# Largest len() found among all questions (column 0) and answers (column 1).
max_length = 0
for qa in vet_QA:
    max_length = max(max_length, len(qa[0]), len(qa[1]))
print (max_length)

normalizado (1475, 3)
Após remoção bad answers (1472, 3)
Após remoção duplicadas (915, 3)
423


#### Distribuição das Perguntas

In [11]:
# Histogram of the first word of each question (a rough proxy for the question type).
dic_tipo_pergunta = {}
for qa in vet_QA:
    primeira_palavra = qa[0].split()[0]
    dic_tipo_pergunta[primeira_palavra] = dic_tipo_pergunta.get(primeira_palavra, 0) + 1

# Print the counts in first-seen order.
for tipo_pergunta, quantidade in dic_tipo_pergunta.items():
    print(tipo_pergunta, ' - ' , quantidade)

was  -  73
did  -  69
how  -  63
when  -  41
what  -  220
who  -  54
which  -  11
why  -  25
do  -  22
is  -  131
can  -  12
the  -  8
had  -  4
are  -  57
oxygen  -  1
have  -  12
may  -  1
of  -  1
where  -  32
in  -  6
european  -  1
does  -  28
during  -  1
since  -  1
has  -  12
were  -  2
name  -  3
today,  -  1
according  -  1
for  -  1
hard  -  1
to  -  2
forward,  -  1
he  -  2
four  -  1
with  -  1
adams  -  1
different  -  1
felis  -  1
an  -  1
sea  -  1
a  -  1
garbage  -  1
qatar  -  1
will  -  1
sibiu,  -  1
if  -  1
these  -  1
grant  -  1


---

### Funções para carregamento dos dados. As variáveis globais foram utilizadas somente para os estudos iniciais
#### Carrega o dataset e aplica limpezas e tratamentos necessários. Pode evoluir sem estar ligado aos testes e manipulações do inicio do notebook. 

In [9]:
def load_dataset(source):
    """Load the QA pairs from `source` and return the cleaned questions and answers.

    Rows with a null question or a null answer are dropped, then the array is
    passed through normalize_qa, remove_bad_answer and remove_duplication, in
    that order.

    Returns:
        A tuple (questions, answers): column 0 and column 1 of the cleaned
        array (assumes the helper functions keep the question/answer column
        order -- TODO confirm in qs_functions).
    """
    raw_frame = load_qa_pairs(source)

    has_question = raw_frame.Question.notna()
    has_answer = raw_frame.Answer.notna()
    complete_rows = raw_frame[has_question & has_answer]

    qa_matrix = complete_rows[["Question", "Answer", "ArticleTitle","ArticleFile"]].to_numpy()
    for cleaning_step in (normalize_qa, remove_bad_answer, remove_duplication):
        qa_matrix = cleaning_step(qa_matrix)

    return qa_matrix[:, 0], qa_matrix[:, 1]


---

## Primeiro Teste - Sentence Similarity sem Fine-Tuning

##### Funções

In [10]:
# Model creation; the Hugging Face path/identifier is received as a parameter.
def create_model(model_path, device='cuda'):
    """Load a pretrained model and its tokenizer and place the model on `device`.

    Args:
        model_path: Hugging Face model identifier or local path, forwarded to
            AutoTokenizer.from_pretrained and AutoModel.from_pretrained.
        device: Target device for the model. Defaults to 'cuda', which was
            previously hard-coded; pass 'cpu' to run without a GPU.

    Returns:
        A tuple (model, tokenizer).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path)
    model.to(device)
    return model, tokenizer

In [11]:
# Release the GPU memory occupied by the model.
def destroy_model(model):
    """Free the GPU memory held by `model`.

    `del model` alone only removes this function's local name; the caller
    still holds a reference, so the weights would stay on the GPU and
    `empty_cache()` would have nothing to release. The module is therefore
    moved to the CPU first (`nn.Module.to` modifies the module in place),
    which makes the CUDA blocks actually collectable.

    Args:
        model: The model to unload. `None` is accepted and skips the move.
    """
    if model is not None:
        # In-place move: the caller's reference now points at CPU tensors.
        model.to('cpu')
    del model
    gc.collect()
    torch.cuda.empty_cache()

##### Prepara o dataset para entrar no modelo BERT. Gera input_ids e attention_mask e monta o DataLoader para processamento em mini-batches
- *Vetor de perguntas e respostas - Numpy*


In [12]:
def prepare_to_bert(np_sentences, tokenizer, sentence_max_length=450, batch_size=5, device='cuda'):
    """Tokenize the sentences and wrap them in a DataLoader for mini-batch processing.

    Args:
        np_sentences: Sentences to encode. A NumPy array (converted with
            `.tolist()`) or any other iterable of strings.
        tokenizer: Tokenizer exposing `batch_encode_plus`.
        sentence_max_length: Every sentence is padded to this many tokens and,
            if longer, truncated to it.
        batch_size: Number of sentences per mini-batch.
        device: Device the tensors are moved to. Defaults to 'cuda', which was
            previously hard-coded.

    Returns:
        A DataLoader yielding (input_ids, attention_mask) batches in the
        original sentence order (shuffle is disabled).
    """
    if hasattr(np_sentences, 'tolist'):
        sentences = np_sentences.tolist()
    else:
        sentences = list(np_sentences)

    # truncation=True keeps every row at exactly `sentence_max_length` tokens;
    # without it a single longer sentence makes the rows ragged and the
    # conversion to a PyTorch tensor fails.
    inputs = tokenizer.batch_encode_plus(sentences,
                                         return_tensors='pt',
                                         padding='max_length',
                                         max_length=sentence_max_length,
                                         truncation=True)

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    dataset = TensorDataset(input_ids, attention_mask)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    return loader
    

##### Realiza o encode da sentença preparada e previamente armazenada em um DataLoader.
*Retorna dois encodings da sentença: mean pooling (média ponderada pela attention mask) e mean simples (média sobre todos os tokens).*

In [32]:
def encode_sentence(np_sentences, bert_model, bert_tokenizer, sentence_max_length=450, batch_size=5):
    """Encode sentences with the model and return two sentence-level embeddings.

    The sentences are tokenized and batched by `prepare_to_bert`, run through
    `bert_model`, and the token embeddings of `last_hidden_state` are reduced
    to one vector per sentence in two ways:

    * mean pooling: average over tokens weighted by the attention mask, so
      padding positions do not contribute;
    * plain mean: average over all token positions, padding included.

    Args:
        np_sentences: Sentences to encode (passed to `prepare_to_bert`).
        bert_model: Callable model accepting `(input_ids, attention_mask=...)`
            and returning an object with a `last_hidden_state` attribute.
        bert_tokenizer: Tokenizer passed to `prepare_to_bert`.
        sentence_max_length: Token length used for padding (default 450).
        batch_size: Mini-batch size (default 5).

    Returns:
        A tuple (mean_pooled_vet, mean_vet) of NumPy arrays with one row per
        sentence, in input order.
    """
    sentence_loader = prepare_to_bert(np_sentences, bert_tokenizer,
                                      sentence_max_length=sentence_max_length,
                                      batch_size=batch_size)

    mean_pooled_vet = []
    mean_vet = []
    for batch_id, (input_ids, attention_masks) in enumerate(sentence_loader):
        # Inference only: skipping the autograd graph saves memory on every batch.
        with torch.no_grad():
            output = bert_model(input_ids, attention_mask=attention_masks)
        print('lote ', batch_id,' - finalizado', end='\r')

        embeddings = output.last_hidden_state

        # Mean pooling: zero out padding tokens, then divide by the number of
        # real tokens (clamped to avoid division by zero).
        mask = attention_masks.unsqueeze(-1).expand(embeddings.size()).float()
        masked_embeddings = embeddings * mask
        summed = torch.sum(masked_embeddings, 1)
        summed_mask = torch.clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed / summed_mask
        # Convert from PyTorch tensor to NumPy array.
        mean_pooled_vet.append(mean_pooled.detach().cpu().numpy())

        # Plain mean over the token axis (padding included).
        mean = embeddings.mean(dim=1)
        mean_vet.append(mean.detach().cpu().numpy())

        # Release per-batch GPU resources.
        del output, embeddings, mask, masked_embeddings, summed, summed_mask, mean_pooled, mean
        gc.collect()
        torch.cuda.empty_cache()

    mean_pooled_vet = np.concatenate(mean_pooled_vet)
    mean_vet = np.concatenate(mean_vet)

    return mean_pooled_vet, mean_vet

#### Execução

In [31]:
# Load the cleaned question and answer vectors.
vet_pergunta, vet_resposta = load_dataset(source=AppConstants.S08_QA)

# NOTE(review): MODEL_BASE_NLI_MEAN is not defined or imported in the visible
# part of this notebook (only AppConstants is imported) -- confirm where it
# comes from; on a fresh kernel this line may raise NameError.
model_sbert_sem_finetunnig, tokenizer = create_model(MODEL_BASE_NLI_MEAN)

# NOTE(review): extra call whose result is stored in `embeddings` and is not
# used within this cell; looks like a debugging leftover -- confirm that no
# later cell reads `embeddings` before removing it.
embeddings = encode_sentence(vet_pergunta, bert_model=model_sbert_sem_finetunnig, bert_tokenizer=tokenizer)

# Encode the questions (timed); expects encode_sentence to return two arrays.
print('Encode Perguntas')
%time mean_pooled_pergunta, mean_pergunta = encode_sentence(vet_pergunta, bert_model=model_sbert_sem_finetunnig, bert_tokenizer=tokenizer)

# Encode the answers (timed) with the same model and tokenizer.
print('Encode Respostas')
%time mean_pooled_resposta, mean_resposta = encode_sentence(vet_resposta, bert_model=model_sbert_sem_finetunnig, bert_tokenizer=tokenizer)

# Release the model once both encodings are done.
destroy_model(model_sbert_sem_finetunnig)

Encode Perguntasalizado
Wall time: 39.6 slizado
Encode Respostas
Wall time: 40.3 slizado


In [26]:
# Shape test: tokenize a single question, padded/truncated to 20 tokens.
# The source constant is read through AppConstants, as in the other cells;
# the bare name S08_QA is never imported in this notebook.
vet_pergunta, vet_resposta = load_dataset(source=AppConstants.S08_QA)
# NOTE(review): MODEL_BASE_NLI_MEAN is not defined or imported in the visible
# part of this notebook -- confirm where it comes from.
tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE_NLI_MEAN)
inputs = tokenizer.encode_plus(vet_pergunta[0],
                               return_tensors='pt',
                               padding='max_length',
                               max_length=20,
                               truncation=True)

In [27]:
# Show the shapes first, then the tensor contents, for ids and attention mask.
for valor in (inputs.input_ids.shape,
              inputs.attention_mask.shape,
              inputs.input_ids,
              inputs.attention_mask):
    print(valor)


torch.Size([1, 20])
torch.Size([1, 20])
tensor([[  101,  2001,  8181,  5367,  1996, 14683,  2343,  1997,  1996,  2142,
          2163,  1029,   102,     0,     0,     0,     0,     0,     0,     0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])


#### Análise da Similaridade
* Temos vários yes e no nas respostas então ele achará indices diferentes, por isso temos que tratar as respostas para medição correta da acurácia

In [41]:
# Rank every answer embedding by cosine similarity to the first question.
# The [0:1] slice keeps a leading batch dimension without wrapping an ndarray
# in a Python list, which is the slow tensor-construction path PyTorch warns about.
TOP_K = 10  # number of closest answers to display; the full ranking stays in `closest`

pergunta_embedding = torch.from_numpy(np.asarray(mean_pooled_pergunta[0:1]))
resposta_embeddings = torch.from_numpy(np.asarray(mean_pooled_resposta))

similarities = t_funcional.cosine_similarity(pergunta_embedding, resposta_embeddings)
closest = similarities.argsort(descending=True)
# Print only the best matches instead of dumping one line per answer.
for ind in closest[:TOP_K]:
    print(f'pergunta/resposta: {vet_pergunta[0]}/{vet_resposta[0]} | label encontrado: {vet_resposta[ind]} \t similarity: {similarities[ind]}')

pergunta/resposta: was abraham lincoln the sixteenth president of the united states?/yes | label encontrado: Monroe was elected president in the election of 1816, and re-elected in 1820 	 similarity: 0.5749715566635132
pergunta/resposta: was abraham lincoln the sixteenth president of the united states?/yes | label encontrado: The New Kingdom (c.1550&#8722;1070 BC)  	 similarity: 0.5701117515563965
pergunta/resposta: was abraham lincoln the sixteenth president of the united states?/yes | label encontrado: Lincoln was Roosevelt's presidential hero 	 similarity: 0.5499816536903381
pergunta/resposta: was abraham lincoln the sixteenth president of the united states?/yes | label encontrado: Adams married Abigail Smith 	 similarity: 0.5299426913261414
pergunta/resposta: was abraham lincoln the sixteenth president of the united states?/yes | label encontrado: Grover Cleveland 	 similarity: 0.5248123407363892
pergunta/resposta: was abraham lincoln the sixteenth president of the united states?/y