In [1]:
import os
import gc

import pandas as pd
import numpy as np

import tensorflow as tf


import matplotlib.pyplot as plt
import seaborn as sns

from transformers import AutoTokenizer, AutoModel

import torch

##### Declaração das constantes utilizadas no código

In [2]:
DATASET_PATH = "dataset"
CONTEXT_TEXT_PATH = os.path.join(DATASET_PATH,"text_data")

S08_QA = ["S08_question_answer_pairs.txt"]
S09_QA = ["S09_question_answer_pairs.txt"]
S10_QA = ["S10_question_answer_pairs.txt"]
ALL_QA = S08_QA + S09_QA + S10_QA

PUNCTUATIONS_TO_REMOVE = [".", "!", "?"]
BAD_ANSWER_TO_REMOVE = ["blah","blah blah blah","(What?)"]


In [3]:
# carrega conteúdo dos arquivos com o texto utilizado para responder as perguntas
def get_context_files():
    context_file_map = {}
    for f in os.listdir(CONTEXT_TEXT_PATH):
        file_full_path = os.path.join(CONTEXT_TEXT_PATH, f)
        if os.path.isfile(file_full_path):
            with open(file_full_path, 'r', encoding="utf-8") as f_contexto:
                contexto = f_contexto.read().replace('\n', '')
            chave = f.replace(".txt.clean","")
            context_file_map[chave] = contexto
    return context_file_map
    
#context_files = get_context_files()
#  lista_tamanhos = []
#  for cf in context_files:
#       lista_tamanhos.append(len(context_files[cf].split()))
#   print(lista_tamanhos[:5])


In [4]:
# carrega o pandas com os grupos especificados. Os grupos de perguntas e respostas foram declarados como constantes
def load_qa_pairs(qa_source=ALL_QA):
    df_list = []
    for qa_file in qa_source:
        df= pd.read_csv(os.path.join(DATASET_PATH,qa_file),sep='\t')
        df_list.append(df)
    df_concat = pd.concat(df_list)
    return df_concat

In [5]:
df_inicial = load_qa_pairs(qa_source=S08_QA)

## Análise e Preparação das Perguntas/Respostas

##### Análise inicial, somente com as primeiras 10 linhas:
- Case diversos - deve ser tratado.
- Algumas respostas tem pontuação - deve ser tratado.
- Algumas perguntas não incluem o contexto - Did his mother die of pneumonia?. Validar se quando o title não aparecer no texto ele deve ser incluído.

In [6]:
df_inicial.head(10)

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,S08_set3_a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,S08_set3_a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4
5,Abraham_Lincoln,Did his mother die of pneumonia?,No.,easy,easy,S08_set3_a4
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months,medium,easy,S08_set3_a4
7,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months.,medium,medium,S08_set3_a4
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832,medium,easy,S08_set3_a4
9,Abraham_Lincoln,When did Lincoln begin his political career?,1832.,medium,medium,S08_set3_a4


##### Missing Values
- Retirar perguntas nulas
- Retirar respostas nulas

In [7]:
print(df_inicial.isna().sum())
print(df_inicial.shape)

ArticleTitle                  0
Question                     19
Answer                      240
DifficultyFromQuestioner    491
DifficultyFromAnswerer      242
ArticleFile                   2
dtype: int64
(1715, 6)


In [8]:
#verificar se existe alguma inconsistencia como resposta para pergunta nual
df_inicial[(df_inicial.Question.isna()) &
           (df_inicial.Answer.notna())]

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile


In [9]:
df_inicial[df_inicial.ArticleFile.isna()]

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
573,Gerald_Ford,Was Ford active about Vietnamese affairs?,No,,,
574,\n,hard,hard,S08_set3_a10,,


In [10]:
# retirando missing values
df_preparado = df_inicial[(df_inicial.Question.notna()) &
                          (df_inicial.Answer.notna())]
print(df_preparado.shape)

(1475, 6)


In [11]:
df_preparado.head(50)

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,S08_set3_a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,S08_set3_a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4
5,Abraham_Lincoln,Did his mother die of pneumonia?,No.,easy,easy,S08_set3_a4
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months,medium,easy,S08_set3_a4
7,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months.,medium,medium,S08_set3_a4
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832,medium,easy,S08_set3_a4
9,Abraham_Lincoln,When did Lincoln begin his political career?,1832.,medium,medium,S08_set3_a4


In [12]:
df_preparado.Answer.value_counts()

Yes                                                                          219
yes                                                                          211
no                                                                            58
No                                                                            57
Yes.                                                                          52
                                                                            ... 
Garbage is now recycled or transported to Thompson, Manitoba.                  1
Qatar                                                                          1
his biological father was abusive and had a history of hitting his mother      1
The New Kingdom                                                                1
founder and first president                                                    1
Name: Answer, Length: 763, dtype: int64

#### Preparação dos dados

Questões para tratamento
- Lowercase
- Pontuação da resposta, ultimo caracter com . ou ! ou ? devem ser retirados.
- Pontuação da pergunta, devemos tirar a interrogação?
- Retirar perguntas duplicadas: Respostas null já forma retiradas no pandas. Escolhe sempre a primeira.
- Remover respostas ruins: blah, (What)?, blah blah blah

In [13]:
# respostas e perguntas para lowercase
# limpeza da resposta: retirar pontuação do último caracter (., ?, !)
def normalize_qa(vet_qa):
    vet_n_resp = []
    for qa in vet_qa:
        pergunta, resposta = qa[0], qa[1]
        pergunta = pergunta.lower()
        
        if (resposta[-1] in PUNCTUATIONS_TO_REMOVE):
            resposta = resposta[:-1]
        vet_n_resp.append([pergunta, resposta])
    return np.array(vet_n_resp, dtype=object)


In [14]:
# inicialmente pensada, mas não utilizada. Retorna as perguntas duplicada.
def get_pergunta_igual_from_position(pos, vet, pergunta):
    vet_resp = []
    for index in range(pos, len(vet)):
        if vet[index][0] == pergunta:
            vet_resp.append(vet[index])
    return vet_resp
        

In [15]:
# Remove duplicações de respostas, como as respostas nulas já foram retiradas então escolhe semper a primeira resposta encontrada
def remove_duplication(vet_qa):
    pergunta_set = set()
    vet_qa_clean = []
    
    for pos,qa in enumerate(vet_qa):
        pergunta, resposta = qa[0], qa[1]
        if pergunta in pergunta_set:
            continue
        #perguntas_duplicadas = get_pergunta_igual_from_position(pos+1, vet_qa, pergunta)
        pergunta_set.add(pergunta)
        vet_qa_clean.append(qa)
    return np.array(vet_qa_clean, dtype=object)            
    

In [16]:
# Remove as perguntas consideradas ruins, exemplo: blah, (What!)
def remove_bad_answer(vet_qa):
    vet_qa_clean = []
    for qa in vet_qa:
        pergunta, resposta = qa[0], qa[1]
        if resposta in BAD_ANSWER_TO_REMOVE:
            continue
        vet_qa_clean.append(qa)
    return np.array(vet_qa_clean, dtype=object) 
    

---

##### Carrega o vetor de perguntas e respostas realizando os tratamentos iniciais

In [17]:
vet_QA = df_preparado[["Question", "Answer"]].to_numpy()
vet_QA = normalize_qa(vet_QA)
print('Antes remoção duplicadas', vet_QA.shape)
vet_QA = remove_bad_answer(vet_QA)
print('Após remoção bad answers', vet_QA.shape)
vet_QA = remove_duplication(vet_QA)
print('Após remoção duplicadas', vet_QA.shape)

max_length = 0
for qa in vet_QA:
    if len(qa[0]) > max_length:
        max_length = len(qa[0])
    if len(qa[1]) > max_length:
        max_length = len(qa[1])
print (max_length)

Antes remoção duplicadas (1475, 2)
Após remoção bad answers (1472, 2)
Após remoção duplicadas (915, 2)
423


---

## Primeiro Teste - Sentece Similarity sem Fine Tunning

In [18]:
vet_pergunta = vet_QA[:,0]
vet_resposta = vet_QA[:,1]


In [None]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model.to('cuda')

In [20]:
resposta_inputs = tokenizer.batch_encode_plus(vet_resposta.tolist(), 
                                             return_tensors='pt',
                                             padding='max_length',
                                             max_length=128)
resposta_input_ids = resposta_inputs['input_ids']
resposta_input_ids = resposta_input_ids.to('cuda')
resposta_attention_mask = resposta_inputs['attention_mask']
resposta_attention_mask = resposta_attention_mask.to('cuda')

In [21]:
resposta_model_output = model(resposta_input_ids, attention_mask=resposta_attention_mask)

RuntimeError: CUDA out of memory. Tried to allocate 1.34 GiB (GPU 0; 8.00 GiB total capacity; 5.78 GiB already allocated; 446.12 MiB free; 5.84 GiB reserved in total by PyTorch)

In [24]:
del model
resposta_input_ids = None
resposta_attention_mask = None
gc.collect()
torch.cuda.empty_cache()

In [26]:
resposta_model_output [1]

tensor([[-0.7194, -0.1823,  0.8919,  ...,  0.0927, -0.4348,  0.9008],
        [-0.7194, -0.1823,  0.8919,  ...,  0.0927, -0.4348,  0.9008],
        [-0.6540, -0.4246, -0.8595,  ..., -0.6584, -0.2318,  0.8663],
        ...,
        [-0.7307, -0.0420, -0.0974,  ..., -0.2896, -0.2291,  0.4107],
        [-0.7194, -0.1823,  0.8919,  ...,  0.0927, -0.4348,  0.9008],
        [-0.6540, -0.4246, -0.8595,  ..., -0.6584, -0.2318,  0.8663]],
       device='cuda:0', grad_fn=<TanhBackward>)

In [25]:
reposta_model_output = None
torch.cuda.empty_cache()

17