<a href="https://colab.research.google.com/github/danielapavas/Google-QUEST-Q-A-Labeling/blob/main/04_Modelos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Carga del dataset desde Kaggle

**Challenge Google QUEST Q&A Labeling**

El dataset es tomado de la competencia de Kaggle: https://www.kaggle.com/competitions/google-quest-challenge/data

In [7]:
# Install the Kaggle CLI that the download cell below invokes.
# NOTE(review): the version is not pinned, so re-runs may pick up a different release.
!pip install kaggle



In [8]:
# Upload the Kaggle API token (kaggle.json) from the local machine.
# The {filename: bytes} result is bound to a name instead of being left as the
# cell's last expression, so the token contents are not echoed into the cell
# output and saved with the notebook.

from google.colab import files
uploaded_files = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [9]:
! mkdir ~/.kaggle

In [10]:
# Copy the uploaded token into the Kaggle config directory.
! cp kaggle.json ~/.kaggle/

In [12]:
# Make the token readable and writable by the owner only.
! chmod 600 ~/.kaggle/kaggle.json

In [11]:
# Download the google-quest-challenge competition archive with the Kaggle CLI.
!kaggle competitions download -c google-quest-challenge

Downloading google-quest-challenge.zip to /content
100% 4.85M/4.85M [00:00<00:00, 42.7MB/s]
100% 4.85M/4.85M [00:00<00:00, 42.4MB/s]


In [13]:
!unzip  google-quest-challenge.zip

Archive:  google-quest-challenge.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Imports- Librerias

In [14]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.layers import Input, Softmax, GRU, LSTM, RNN, Embedding, Dense, RepeatVector, TimeDistributed, Bidirectional, Concatenate
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
import seaborn as sns
#import warnings
#import html
#warnings.filterwarnings('ignore')

# Implementación de Modelos





In [15]:
# Build the datasets: read the three competition CSVs into DataFrames.

train, test, sample_submission_dataset = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Display the (rows, columns) shape of each frame.
train.shape, test.shape, sample_submission_dataset.shape

((6079, 41), (476, 11), (476, 31))

In [16]:
# Build the English stop-word set used by the stop-word removal function below.
# The negations 'no', 'not' and 'nor' are taken out of the set so that they are
# kept in the text.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
for negation in ('no', 'not', 'nor'):
  stop_words.remove(negation)

def stopwrd_removal(sent):
  """Return `sent` without the whitespace-separated tokens found in `stop_words`.

  Tokens are compared exactly as written (no case folding); the remaining
  tokens are joined back together with single spaces.
  """
  return " ".join(token for token in sent.split() if token not in stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [17]:
def text_preprocessor(column, remove_stopwords = False, remove_specialchar = False, df = None):
  """Clean one text column of a DataFrame in place.

  Args:
    column: name of the text column to clean; every value must be a string.
    remove_stopwords: when truthy, drop stop words via `stopwrd_removal`.
    remove_specialchar: when truthy, replace every run of characters other
      than A-Z, a-z and 0-9 with a single space.
    df: DataFrame to modify. Defaults to the notebook-level `train` frame,
      which is what existing callers rely on.

  Returns:
    None; `df[column]` is overwritten with the cleaned text.
  """
  if df is None:
    df = train

  text = df[column]

  # 1. Remove HTML tags first, then decode the escaped comparison operators
  #    (decoding afterwards keeps a decoded '<' from being read as a tag).
  text = text.map(lambda raw: re.sub(r'<.*?>', ' ', raw))
  for entity, symbol in (('&lt;', '<'), ('&gt;', '>'), ('&le;', '<='), ('&ge;', '>=')):
    # regex = False: these are literal substrings, independent of the pandas default.
    text = text.str.replace(entity, symbol, regex = False)

  # 2. Remove inline LaTeX, i.e. anything between two '$' on the same line.
  text = text.map(lambda raw: re.sub(r'\$.*?\$', ' ', raw))

  # 3. Lowercase everything.
  text = text.str.lower()

  # 4. Expand contractions. Order matters: the specific forms ("won't",
  #    "can't") must be handled before the generic "n't" / "'t" suffixes.
  contractions = (
      ("won't", "will not"), ("can't", "can not"), ("n't", " not"), ("'re", " are"),
      ("'s", " is"), ("'d", " would"), ("'ll", " will"),
      ("'t", " not"), ("'ve", " have"), ("'m", " am"),
  )
  for contraction, expansion in contractions:
    text = text.str.replace(contraction, expansion, regex = False)

  # 5. Drop every non-ASCII character.
  text = text.map(lambda raw: raw.encode("ascii", "ignore").decode())

  # 6. Optionally replace all non-alphanumeric runs with a space.
  if remove_specialchar:
    text = text.map(lambda raw: re.sub('[^A-Za-z0-9]+', ' ', raw))

  # 7. Optionally remove stop words.
  if remove_stopwords:
    text = text.map(stopwrd_removal)

  # 8. Normalize whitespace: newlines and tabs become spaces, trailing whitespace
  #    is stripped, and runs of spaces collapse to one. Leading whitespace is
  #    only collapsed, not stripped.
  text = text.str.replace("\n", " ", regex = False).str.replace("\t", " ", regex = False).str.rstrip()
  df[column] = text.map(lambda raw: re.sub('  +', ' ', raw))


In [18]:
# Copy each raw text column into a working 'clean_*' column, then clean the
# copies in place so the raw columns stay untouched.
clean_to_raw = (('clean_title', 'question_title'), ('clean_body', 'question_body'), ('clean_answer', 'answer'))
for clean_col, raw_col in clean_to_raw:
  train[clean_col] = train[raw_col]
for clean_col, _raw_col in clean_to_raw:
  text_preprocessor(clean_col, remove_stopwords = False, remove_specialchar = False)

In [19]:
# Set up the target columns: the non-object (numeric) columns whose names start
# with 'question_' or 'answer_'.
non_object_cols = [col for col in train.columns if train[col].dtype != 'object']
question_tar = [col for col in non_object_cols if col.startswith('question_')]
answer_tar = [col for col in non_object_cols if col.startswith('answer_')]

print("question_tar:", question_tar)
print("answer_tar:", answer_tar)

# Question targets first, then answer targets; the count is the cell output.
tar_features = question_tar + answer_tar
len(tar_features)

question_tar: ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 'question_type_compare', 'question_type_consequence', 'question_type_definition', 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 'question_type_reason_explanation', 'question_type_spelling', 'question_well_written']
answer_tar: ['answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance', 'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure', 'answer_type_reason_explanation', 'answer_well_written']


30

In [20]:
# Split the training data into train and cross-validation parts (12% held out).
from sklearn.model_selection import train_test_split

text_cols = ['clean_title', 'clean_body', 'clean_answer']
X_train, X_cv, y_train, y_cv = train_test_split(
    train[text_cols],
    train[tar_features],
    test_size = 0.12,
    random_state = 42,
)
X_train.shape, X_cv.shape, y_train.shape, y_cv.shape

((5349, 3), (730, 3), (5349, 30), (730, 30))

In [21]:
# Build the model inputs and targets for the train and cv splits.
title_train, body_train, answer_train = (
    X_train[col].values for col in ('clean_title', 'clean_body', 'clean_answer')
)
title_cv, body_cv, answer_cv = (
    X_cv[col].values for col in ('clean_title', 'clean_body', 'clean_answer')
)

# train data: title and body joined by one space; targets split by group
title_body_train = [title + ' ' + body for title, body in zip(title_train, body_train)]
y_train_ques, y_train_ans = y_train[question_tar].values, y_train[answer_tar].values

# cv data
title_body_cv = [title + ' ' + body for title, body in zip(title_cv, body_cv)]
y_cv_ques, y_cv_ans = y_cv[question_tar].values, y_cv[answer_tar].values

len(title_body_train), len(answer_train), len(title_body_cv), len(answer_cv)

(5349, 5349, 730, 730)

# BERT





In [22]:
# Install Hugging Face transformers (provides the BERT / RoBERTa / XLNet classes used below).
# NOTE(review): the version is not pinned; the cells below use the TF model classes,
# so confirm the installed release still ships them.
!pip install transformers



In [23]:
from transformers import BertTokenizer, TFBertModel, BertConfig

# Load the pretrained tokenizer and model; the config asks the model to return
# the hidden states of every layer.
bert_checkpoint = 'bert-base-uncased'
config = BertConfig.from_pretrained(bert_checkpoint, output_hidden_states=True)
bert_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = TFBertModel.from_pretrained(bert_checkpoint, config = config)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [24]:
# Tokenization: title_body
# Every sequence is truncated / padded to exactly 512 tokens.
# padding = 'max_length' replaces the deprecated pad_to_max_length = True.
title_body_train_tokens = bert_tokenizer.batch_encode_plus(title_body_train, max_length = 512, truncation = True, padding = 'max_length', return_tensors="tf")
title_body_cv_tokens = bert_tokenizer.batch_encode_plus(title_body_cv, max_length = 512, truncation = True, padding = 'max_length', return_tensors="tf")

tb_train_input_ids  = np.array(title_body_train_tokens['input_ids'])
tb_train_attn_mask = np.array(title_body_train_tokens['attention_mask'])
tb_train_token_typ_ids = np.array(title_body_train_tokens['token_type_ids'])

tb_cv_input_ids  = np.array(title_body_cv_tokens['input_ids'])
tb_cv_attn_mask = np.array(title_body_cv_tokens['attention_mask'])
tb_cv_token_typ_ids = np.array(title_body_cv_tokens['token_type_ids'])

tb_train_input_ids.shape, tb_train_attn_mask.shape, tb_train_token_typ_ids.shape, tb_cv_input_ids.shape, tb_cv_attn_mask.shape, tb_cv_token_typ_ids.shape



((5349, 512), (5349, 512), (5349, 512), (730, 512), (730, 512), (730, 512))

In [25]:
# Tokenization: answer
# Every sequence is truncated / padded to exactly 512 tokens.
# padding = 'max_length' replaces the deprecated pad_to_max_length = True.
ans_train_tokens = bert_tokenizer.batch_encode_plus(answer_train, max_length = 512, truncation = True, padding = 'max_length', return_tensors="tf")
ans_cv_tokens = bert_tokenizer.batch_encode_plus(answer_cv, max_length = 512, truncation = True, padding = 'max_length', return_tensors="tf")

ans_train_input_ids  = np.array(ans_train_tokens['input_ids'])
ans_train_attn_mask = np.array(ans_train_tokens['attention_mask'])
ans_train_token_typ_ids = np.array(ans_train_tokens['token_type_ids'])

ans_cv_input_ids  = np.array(ans_cv_tokens['input_ids'])
ans_cv_attn_mask = np.array(ans_cv_tokens['attention_mask'])
ans_cv_token_typ_ids = np.array(ans_cv_tokens['token_type_ids'])
# Padded sequence length (second axis of the id matrix).
seq_len = ans_train_input_ids.shape[1]

ans_train_input_ids.shape, ans_train_attn_mask.shape, ans_train_token_typ_ids.shape, ans_cv_input_ids.shape, ans_cv_attn_mask.shape, ans_cv_token_typ_ids.shape

((5349, 512), (5349, 512), (5349, 512), (730, 512), (730, 512), (730, 512))

In [None]:
# train : title_body BERT
batch_size = 32
l = tb_train_input_ids.shape[0]

lst1 = []
# Iterate over batch start offsets; unlike looping (l//batch_size)+1 times this
# never feeds the model an empty trailing batch when l is a multiple of batch_size.
for start in tqdm(range(0, l, batch_size)):
  stop = start + batch_size
  # Output index 2 is expected to hold the per-layer hidden states, since the
  # config was created with output_hidden_states=True.
  x = bert_model([tb_train_input_ids[start:stop], tb_train_attn_mask[start:stop], tb_train_token_typ_ids[start:stop]])[2]

  # Average each of the last 4 hidden layers over the sequence axis (padding
  # positions included), then concatenate the 4 pooled vectors per example.
  lst = [tf.reduce_mean(layer, axis = 1) for layer in x[-4:]]
  conc = tf.concat(lst, axis = 1)
  lst1.append(conc)
tb_BERT_train = tf.concat(lst1, axis = 0)
print(tb_BERT_train.shape)

# cv : title_body BERT
batch_size = 32
# Size the loop from the title_body CV ids (the answer CV ids were used here before).
l = tb_cv_input_ids.shape[0]

lst1 = []
for start in tqdm(range(0, l, batch_size)):
  stop = start + batch_size
  x = bert_model([tb_cv_input_ids[start:stop], tb_cv_attn_mask[start:stop], tb_cv_token_typ_ids[start:stop]])[2]

  # Same pooling as for the train split.
  lst = [tf.reduce_mean(layer, axis = 1) for layer in x[-4:]]
  conc = tf.concat(lst, axis = 1)
  lst1.append(conc)
tb_BERT_cv = tf.concat(lst1, axis = 0)

print(tb_BERT_cv.shape)


  0%|          | 0/168 [00:00<?, ?it/s]

(5349, 3072)


  0%|          | 0/23 [00:00<?, ?it/s]

(730, 3072)


In [None]:
# train : answer BERT
batch_size = 32
l = ans_train_input_ids.shape[0]

lst1 = []
# Iterate over batch start offsets so no empty trailing batch is produced when
# l is a multiple of batch_size.
for start in tqdm(range(0, l, batch_size)):
  stop = start + batch_size
  # Output index 2 is expected to hold the per-layer hidden states
  # (config created with output_hidden_states=True).
  x = bert_model([ans_train_input_ids[start:stop], ans_train_attn_mask[start:stop], ans_train_token_typ_ids[start:stop]])[2]

  # Take the position-0 vector of each of the last 4 hidden layers (not a mean
  # over time steps) and concatenate the 4 vectors per example.
  lst = [layer[:, 0, :] for layer in x[-4:]]
  conc = tf.concat(lst, axis = 1)
  lst1.append(conc)
ans_BERT_train = tf.concat(lst1, axis = 0)
print(ans_BERT_train.shape)

# cv : answer BERT
batch_size = 32
l = ans_cv_input_ids.shape[0]
lst1 = []
for start in tqdm(range(0, l, batch_size)):
  stop = start + batch_size
  x = bert_model([ans_cv_input_ids[start:stop], ans_cv_attn_mask[start:stop], ans_cv_token_typ_ids[start:stop]])[2]

  # Same position-0 pooling as for the train split.
  lst = [layer[:, 0, :] for layer in x[-4:]]
  conc = tf.concat(lst, axis = 1)
  lst1.append(conc)
ans_BERT_cv = tf.concat(lst1, axis = 0)
print(ans_BERT_cv.shape)


  0%|          | 0/168 [00:00<?, ?it/s]

# USE

In [None]:
# Load the Universal Sentence Encoder (large, version 5) from TF Hub.
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

In [None]:
# title_body
# Encode the texts batch by batch into preallocated (n_examples, 512) arrays.
# Iterating over start offsets avoids calling the encoder on an empty trailing
# batch when the example count is a multiple of batch_size.
tb_USE_train = np.zeros((len(title_body_train), 512))
batch_size = 32
for start in tqdm(range(0, len(title_body_train), batch_size)):
  stop = start + batch_size
  tb_USE_train[start:stop] = use_model(title_body_train[start:stop]).numpy()
print(tb_USE_train.shape)

tb_USE_cv = np.zeros((len(title_body_cv), 512))
batch_size = 64
for start in tqdm(range(0, len(title_body_cv), batch_size)):
  stop = start + batch_size
  tb_USE_cv[start:stop] = use_model(title_body_cv[start:stop]).numpy()
print(tb_USE_cv.shape)

In [None]:
# answer
# Encode the texts batch by batch into preallocated (n_examples, 512) arrays.
# Iterating over start offsets avoids calling the encoder on an empty trailing
# batch when the example count is a multiple of batch_size.
ans_USE_train = np.zeros((len(answer_train), 512))
batch_size = 32
for start in tqdm(range(0, len(answer_train), batch_size)):
  stop = start + batch_size
  ans_USE_train[start:stop] = use_model(answer_train[start:stop]).numpy()
print(ans_USE_train.shape)

ans_USE_cv = np.zeros((len(answer_cv), 512))
batch_size = 32
for start in tqdm(range(0, len(answer_cv), batch_size)):
  stop = start + batch_size
  ans_USE_cv[start:stop] = use_model(answer_cv[start:stop]).numpy()
print(ans_USE_cv.shape)

# RoBERTa

In [None]:
from transformers import RobertaConfig, RobertaTokenizer, TFRobertaModel

# Load the pretrained tokenizer and model; the config asks the model to return
# the hidden states of every layer.
roberta_checkpoint = 'roberta-base'
config = RobertaConfig.from_pretrained(roberta_checkpoint, output_hidden_states=True)
roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint)
roberta_model = TFRobertaModel.from_pretrained(roberta_checkpoint, config = config)

In [None]:
# Tokenization: title_body (truncated / padded to 512 tokens)
# padding = 'max_length' replaces the deprecated pad_to_max_length = True.
title_body_train_tokens = roberta_tokenizer.batch_encode_plus(title_body_train, max_length = 512, truncation = True, padding = 'max_length', return_tensors="tf")
title_body_cv_tokens = roberta_tokenizer.batch_encode_plus(title_body_cv, max_length = 512, truncation = True, padding = 'max_length', return_tensors="tf")

tb_train_input_ids  = np.array(title_body_train_tokens['input_ids'])
tb_train_attn_mask = np.array(title_body_train_tokens['attention_mask'])

tb_cv_input_ids  = np.array(title_body_cv_tokens['input_ids'])
tb_cv_attn_mask = np.array(title_body_cv_tokens['attention_mask'])

print(tb_train_input_ids.shape, tb_train_attn_mask.shape, tb_cv_input_ids.shape, tb_cv_attn_mask.shape)

# Tokenization: answer (truncated / padded to 300 tokens)
ans_train_tokens = roberta_tokenizer.batch_encode_plus(answer_train, max_length = 300, truncation = True, padding = 'max_length', return_tensors="tf")
ans_cv_tokens = roberta_tokenizer.batch_encode_plus(answer_cv, max_length = 300, truncation = True, padding = 'max_length', return_tensors="tf")

ans_train_input_ids  = np.array(ans_train_tokens['input_ids'])
ans_train_attn_mask = np.array(ans_train_tokens['attention_mask'])

ans_cv_input_ids  = np.array(ans_cv_tokens['input_ids'])
ans_cv_attn_mask = np.array(ans_cv_tokens['attention_mask'])
# Padded sequence length (second axis of the id matrix).
seq_len = ans_train_input_ids.shape[1]

print(ans_train_input_ids.shape, ans_train_attn_mask.shape, ans_cv_input_ids.shape, ans_cv_attn_mask.shape)


In [None]:
#  title_body : TRAIN RoBERTa
batch_size = 32
l = tb_train_input_ids.shape[0]

lst1 = []
# Iterate over batch start offsets so no empty trailing batch is produced when
# l is a multiple of batch_size.
for start in tqdm(range(0, l, batch_size)):
  stop = start + batch_size
  # Output index 2 is expected to hold the per-layer hidden states
  # (config created with output_hidden_states=True).
  x = roberta_model([tb_train_input_ids[start:stop], tb_train_attn_mask[start:stop]])[2]

  # Take the position-0 vector of each of the last 4 hidden layers (not a mean
  # over time steps) and concatenate the 4 vectors per example.
  lst = [layer[:, 0, :] for layer in x[-4:]]
  conc = tf.concat(lst, axis = 1)
  lst1.append(conc)
tb_RoBERTa_train = tf.concat(lst1, axis = 0)
print(tb_RoBERTa_train.shape)

# title_body : CV RoBERTa
batch_size = 32
l = tb_cv_input_ids.shape[0]

lst1 = []
for start in tqdm(range(0, l, batch_size)):
  stop = start + batch_size
  x = roberta_model([tb_cv_input_ids[start:stop], tb_cv_attn_mask[start:stop]])[2]

  # Same position-0 pooling as for the train split.
  lst = [layer[:, 0, :] for layer in x[-4:]]
  conc = tf.concat(lst, axis = 1)
  lst1.append(conc)
tb_RoBERTa_cv = tf.concat(lst1, axis = 0)
print(tb_RoBERTa_cv.shape)

In [None]:
#  answer : TRAIN RoBERTa
batch_size = 32
l = ans_train_input_ids.shape[0]

lst1 = []
# Iterate over batch start offsets so no empty trailing batch is produced when
# l is a multiple of batch_size.
for start in tqdm(range(0, l, batch_size)):
  stop = start + batch_size
  # Output index 2 is expected to hold the per-layer hidden states
  # (config created with output_hidden_states=True).
  x = roberta_model([ans_train_input_ids[start:stop], ans_train_attn_mask[start:stop]])[2]

  # Take the position-0 vector of each of the last 4 hidden layers (not a mean
  # over time steps) and concatenate the 4 vectors per example.
  lst = [layer[:, 0, :] for layer in x[-4:]]
  conc = tf.concat(lst, axis = 1)
  lst1.append(conc)
ans_RoBERTa_train = tf.concat(lst1, axis = 0)
print(ans_RoBERTa_train.shape)

# answer : CV RoBERTa
batch_size = 32
l = ans_cv_input_ids.shape[0]
lst1 = []
for start in tqdm(range(0, l, batch_size)):
  stop = start + batch_size
  x = roberta_model([ans_cv_input_ids[start:stop], ans_cv_attn_mask[start:stop]])[2]

  # Same position-0 pooling as for the train split.
  lst = [layer[:, 0, :] for layer in x[-4:]]
  conc = tf.concat(lst, axis = 1)
  lst1.append(conc)
ans_RoBERTa_cv = tf.concat(lst1, axis = 0)
print(ans_RoBERTa_cv.shape)

# XLNet

In [None]:
from transformers import XLNetConfig, XLNetTokenizer, TFXLNetModel

# Load the pretrained tokenizer and model; the config asks the model to return
# the hidden states of every layer.
xlnet_checkpoint = 'xlnet-base-cased'
config = XLNetConfig.from_pretrained(xlnet_checkpoint, output_hidden_states=True)
xlnet_tokenizer = XLNetTokenizer.from_pretrained(xlnet_checkpoint)
xlnet_model = TFXLNetModel.from_pretrained(xlnet_checkpoint, config = config)

In [None]:
# Tokenization: title_body (truncated / padded to 512 tokens)
# padding = 'max_length' replaces the deprecated pad_to_max_length = True.
title_body_train_tokens = xlnet_tokenizer.batch_encode_plus(title_body_train, max_length = 512, truncation = True, padding = 'max_length', return_tensors="tf")
title_body_cv_tokens = xlnet_tokenizer.batch_encode_plus(title_body_cv, max_length = 512, truncation = True, padding = 'max_length', return_tensors="tf")

tb_train_input_ids  = np.array(title_body_train_tokens['input_ids'])
tb_train_attn_mask = np.array(title_body_train_tokens['attention_mask'])
tb_train_token_typ_ids = np.array(title_body_train_tokens['token_type_ids'])

tb_cv_input_ids  = np.array(title_body_cv_tokens['input_ids'])
tb_cv_attn_mask = np.array(title_body_cv_tokens['attention_mask'])
tb_cv_token_typ_ids = np.array(title_body_cv_tokens['token_type_ids'])

print(tb_train_input_ids.shape, tb_train_attn_mask.shape, tb_train_token_typ_ids.shape, tb_cv_input_ids.shape, tb_cv_attn_mask.shape, tb_cv_token_typ_ids.shape)

# Tokenization: answer (truncated / padded to 300 tokens)
ans_train_tokens = xlnet_tokenizer.batch_encode_plus(answer_train, max_length = 300, truncation = True, padding = 'max_length', return_tensors="tf")
ans_cv_tokens = xlnet_tokenizer.batch_encode_plus(answer_cv, max_length = 300, truncation = True, padding = 'max_length', return_tensors="tf")
ans_train_input_ids  = np.array(ans_train_tokens['input_ids'])
ans_train_attn_mask = np.array(ans_train_tokens['attention_mask'])
ans_train_token_typ_ids = np.array(ans_train_tokens['token_type_ids'])

ans_cv_input_ids  = np.array(ans_cv_tokens['input_ids'])
ans_cv_attn_mask = np.array(ans_cv_tokens['attention_mask'])
ans_cv_token_typ_ids = np.array(ans_cv_tokens['token_type_ids'])
# Padded sequence length (second axis of the id matrix).
seq_len = ans_train_input_ids.shape[1]

print(ans_train_input_ids.shape, ans_train_attn_mask.shape, ans_train_token_typ_ids.shape, ans_cv_input_ids.shape, ans_cv_attn_mask.shape, ans_cv_token_typ_ids.shape)

In [None]:
# train : title_body XLNet
batch_size = 32
l = tb_train_input_ids.shape[0]

lst1 = []
# Iterate over batch start offsets so no empty trailing batch is produced when
# l is a multiple of batch_size.
for start in tqdm(range(0, l, batch_size)):
  stop = start + batch_size
  # NOTE(review): output index 1 is presumably the per-layer hidden states here
  # (config created with output_hidden_states=True) — confirm for the installed version.
  x = xlnet_model([tb_train_input_ids[start:stop], tb_train_attn_mask[start:stop]])[1]

  # Average each of the last 4 hidden layers over the sequence axis (padding
  # positions included), then concatenate the 4 pooled vectors per example.
  lst = [tf.reduce_mean(layer, axis = 1) for layer in x[-4:]]
  conc = tf.concat(lst, axis = 1)
  lst1.append(conc)
tb_XLNet_train = tf.concat(lst1, axis = 0)
print(tb_XLNet_train.shape)

# cv : title_body XLNet
batch_size = 32
l = tb_cv_input_ids.shape[0]

lst1 = []
for start in tqdm(range(0, l, batch_size)):
  stop = start + batch_size
  x = xlnet_model([tb_cv_input_ids[start:stop], tb_cv_attn_mask[start:stop]])[1]

  # Same mean pooling as for the train split.
  lst = [tf.reduce_mean(layer, axis = 1) for layer in x[-4:]]
  conc = tf.concat(lst, axis = 1)
  lst1.append(conc)
tb_XLNet_cv = tf.concat(lst1, axis = 0)
print(tb_XLNet_cv.shape)

In [None]:
# train : answer XLNet
batch_size = 32
l = ans_train_input_ids.shape[0]

lst1 = []
# Iterate over batch start offsets so no empty trailing batch is produced when
# l is a multiple of batch_size.
for start in tqdm(range(0, l, batch_size)):
  stop = start + batch_size
  # NOTE(review): output index 1 is presumably the per-layer hidden states here
  # (config created with output_hidden_states=True) — confirm for the installed version.
  x = xlnet_model([ans_train_input_ids[start:stop], ans_train_attn_mask[start:stop]])[1]

  # Average each of the last 4 hidden layers over the sequence axis (padding
  # positions included), then concatenate the 4 pooled vectors per example.
  lst = [tf.reduce_mean(layer, axis = 1) for layer in x[-4:]]
  conc = tf.concat(lst, axis = 1)
  lst1.append(conc)
ans_XLNet_train = tf.concat(lst1, axis = 0)
print(ans_XLNet_train.shape)

# cv : answer XLNet
batch_size = 32
l = ans_cv_input_ids.shape[0]
lst1 = []
for start in tqdm(range(0, l, batch_size)):
  stop = start + batch_size
  x = xlnet_model([ans_cv_input_ids[start:stop], ans_cv_attn_mask[start:stop]])[1]

  # Same mean pooling as for the train split.
  lst = [tf.reduce_mean(layer, axis = 1) for layer in x[-4:]]
  conc = tf.concat(lst, axis = 1)
  lst1.append(conc)
ans_XLNet_cv = tf.concat(lst1, axis = 0)
print(ans_XLNet_cv.shape)

# Guardando las salidas (embeddings) de los modelos

In [None]:
# Persist the precomputed embeddings, one compressed .npz archive per encoder.
# All four archives go to the same folder. The BERT archive used to be written
# to a differently spelled folder (one space before the dash instead of two),
# which is not the folder the loading cell reads from.
# NOTE(review): assumes Google Drive is already mounted at /content/drive and
# that this folder exists — neither is set up in the visible cells; confirm.
embeddings_dir = '/content/drive/My Drive/Deep Learning  - Google Quest'

np.savez_compressed(embeddings_dir + '/bert_outputs', tb_BERT_train = tb_BERT_train.numpy(), tb_BERT_cv = tb_BERT_cv.numpy(),
                    ans_BERT_train = ans_BERT_train.numpy(), ans_BERT_cv = ans_BERT_cv.numpy())

np.savez_compressed(embeddings_dir + '/use_outputs', tb_USE_train = tb_USE_train, tb_USE_cv = tb_USE_cv,
                    ans_USE_train = ans_USE_train, ans_USE_cv = ans_USE_cv)

np.savez_compressed(embeddings_dir + '/roberta_outputs', tb_RoBERTa_train = tb_RoBERTa_train.numpy(), tb_RoBERTa_cv = tb_RoBERTa_cv.numpy(),
                    ans_RoBERTa_train = ans_RoBERTa_train.numpy(), ans_RoBERTa_cv = ans_RoBERTa_cv.numpy())

np.savez_compressed(embeddings_dir + '/xlnet_outputs', tb_XLNet_train = tb_XLNet_train.numpy(), tb_XLNet_cv = tb_XLNet_cv.numpy(),
                    ans_XLNet_train = ans_XLNet_train.numpy(), ans_XLNet_cv = ans_XLNet_cv.numpy())

# Modelo Final

In [None]:
# Explicitly request eager execution before building the final model.
# NOTE(review): presumably redundant under TF 2.x, where eager mode is the default — confirm.
tf.compat.v1.enable_eager_execution()
# Keras building blocks used by the final model cells below.
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, BatchNormalization
from tensorflow.keras.models import Model

In [None]:
%%time
tb_BERT_train = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/bert_outputs.npz')['tb_BERT_train']
tb_BERT_cv = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/bert_outputs.npz')['tb_BERT_cv']
ans_BERT_train = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/bert_outputs.npz')['ans_BERT_train']
ans_BERT_cv = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/bert_outputs.npz')['ans_BERT_cv']

tb_USE_train = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/use_outputs.npz')['tb_USE_train']
tb_USE_cv = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/use_outputs.npz')['tb_USE_cv']
ans_USE_train = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/use_outputs.npz')['ans_USE_train']
ans_USE_cv = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/use_outputs.npz')['ans_USE_cv']

tb_RoBERTa_train = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/roberta_outputs.npz')['tb_RoBERTa_train']
tb_RoBERTa_cv = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/roberta_outputs.npz')['tb_RoBERTa_cv']
ans_RoBERTa_train = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/roberta_outputs.npz')['ans_RoBERTa_train']
ans_RoBERTa_cv = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/roberta_outputs.npz')['ans_RoBERTa_cv']

tb_XLNet_train = np.load('/content/drive/My Drive/AAIC Course/Deep Learning  - Google Quest/xlnet_outputs.npz')['tb_XLNet_train']
tb_XLNet_cv = np.load('/content/drive/My Drive/AAIC Course/Deep Learning  - Google Quest/xlnet_outputs.npz')['tb_XLNet_cv']
ans_XLNet_train = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/xlnet_outputs.npz')['ans_XLNet_train']
ans_XLNet_cv = np.load('/content/drive/My Drive/Deep Learning  - Google Quest/xlnet_outputs.npz')['ans_XLNet_cv']

In [None]:
class block(tf.keras.layers.Layer):
  """Dropout -> Dense(relu) -> Dropout -> Dense(relu) -> Dropout stack.

  Args:
    d1_rate: dropout rate applied to the layer input.
    dense_1_units: width of the first Dense layer.
    d2_rate: dropout rate applied after the first Dense layer.
    dense_2_units: width of the second Dense layer.
    d3_rate: dropout rate applied after the second Dense layer.
    seed: seed shared by the dropout layers and the he_normal initializers.
    **kwargs: standard Layer arguments (e.g. `name`), forwarded to the base class.
  """
  def __init__(self, d1_rate = 0.2, dense_1_units = 1024, d2_rate = 0.2, dense_2_units = 512, d3_rate = 0.1, seed = 42, **kwargs):
    super().__init__(**kwargs)
    self.d1_rate = d1_rate
    self.dense_1_units = dense_1_units
    self.d2_rate = d2_rate
    self.dense_2_units = dense_2_units
    self.d3_rate = d3_rate
    self.seed = seed

  def build(self, input_shape):
    """Create the sub-layers; `input_shape` itself is not used."""
    self.dropout_1 =  Dropout(rate =self.d1_rate, seed = self.seed)

    self.dense_1 = Dense(units = self.dense_1_units, activation = 'relu', kernel_initializer = tf.keras.initializers.he_normal(seed = self.seed))
    self.dropout_2 = Dropout(rate = self.d2_rate, seed = self.seed)

    self.dense_2 = Dense(units = self.dense_2_units, activation = 'relu', kernel_initializer = tf.keras.initializers.he_normal(seed = self.seed))
    self.dropout_3 = Dropout(rate = self.d3_rate, seed = self.seed)

  def call(self, X):
    """Apply the dropout/dense stack to `X` and return the final dropout output."""
    dropout_1 = self.dropout_1(X)

    dense_1 = self.dense_1(dropout_1)
    dropout_2 = self.dropout_2(dense_1)

    dense_2 = self.dense_2(dropout_2)
    dropout_3 = self.dropout_3(dense_2)
    return dropout_3

  def get_config(self):
    """Return the constructor arguments so the layer can be re-created from its config."""
    config = super().get_config()
    config.update({
        'd1_rate': self.d1_rate,
        'dense_1_units': self.dense_1_units,
        'd2_rate': self.d2_rate,
        'dense_2_units': self.dense_2_units,
        'd3_rate': self.d3_rate,
        'seed': self.seed,
    })
    return config

In [None]:
# Final model: one dense head per encoder and per text part. The four
# title_body heads are averaged into the 21 question targets, the four answer
# heads into the 9 answer targets, and both groups are concatenated (30 outputs).
# Averaging and concatenation use Keras merge layers (Average / Concatenate)
# instead of raw tf ops applied to symbolic Keras tensors.
tf.keras.backend.clear_session()
seed = 42

# *-----------------title_body-----------------*
tb_bert_input = Input(name = 'tb_bert_out', shape = (768*4,), dtype = 'float32')
tb_bert_block = block(d1_rate = 0.2, dense_1_units = 1024, d2_rate = 0.2, dense_2_units = 512, d3_rate = 0.1, seed = seed)(tb_bert_input)
tb_bert_out = Dense(units = 21, activation = 'sigmoid', kernel_initializer = tf.keras.initializers.he_normal(seed = seed))(tb_bert_block)

tb_use_input = Input(name = 'tb_use_out', shape = (512,), dtype = 'float32')
tb_use_block = block(d1_rate = 0.2, dense_1_units = 1024, d2_rate = 0.2, dense_2_units = 512, d3_rate = 0.2, seed = seed)(tb_use_input)
tb_use_out = Dense(units = 21, activation = 'sigmoid', kernel_initializer = tf.keras.initializers.he_normal(seed = seed))(tb_use_block)

tb_roberta_input = Input(name = 'tb_roberta_out', shape = (768*4,), dtype = 'float32')
tb_roberta_block = block(d1_rate = 0.2, dense_1_units = 1024, d2_rate = 0.2, dense_2_units = 512, d3_rate = 0.1, seed = seed)(tb_roberta_input)
tb_roberta_out = Dense(units = 21, activation = 'sigmoid', kernel_initializer = tf.keras.initializers.he_normal(seed = seed))(tb_roberta_block)

tb_xlnet_input = Input(name = 'tb_xlnet_out', shape = (768*4,), dtype = 'float32')
tb_xlnet_block = block(d1_rate = 0.2, dense_1_units = 1024, d2_rate = 0.2, dense_2_units = 512, d3_rate = 0.1, seed = seed)(tb_xlnet_input)
tb_xlnet_out = Dense(units = 21, activation = 'sigmoid', kernel_initializer = tf.keras.initializers.he_normal(seed = seed))(tb_xlnet_block)

# Element-wise mean of the four question-target predictions.
tb_out = tf.keras.layers.Average()([tb_bert_out ,tb_use_out ,tb_roberta_out ,tb_xlnet_out])

# *----------------- answer -----------------*
ans_bert_input = Input(name = 'ans_bert_out', shape = (768*4,), dtype = 'float32')
ans_bert_block = block(d1_rate = 0.2, dense_1_units = 2048, d2_rate = 0.2, dense_2_units = 512, d3_rate = 0.1, seed = seed)(ans_bert_input)
ans_bert_out = Dense(units = 9, activation = 'sigmoid', kernel_initializer = tf.keras.initializers.he_normal(seed = seed))(ans_bert_block)

# The USE and RoBERTa answer heads also receive the matching question predictions.
ans_use_input = Input(name = 'ans_use_out', shape = (512,), dtype = 'float32')
ans_use_concat = Concatenate(axis = -1)([tb_use_out, ans_use_input])
ans_use_block = block(d1_rate = 0.2, dense_1_units = 1024, d2_rate = 0.2, dense_2_units = 512, d3_rate = 0.2, seed = seed)(ans_use_concat)
ans_use_out = Dense(units = 9, activation = 'sigmoid', kernel_initializer = tf.keras.initializers.he_normal(seed = seed))(ans_use_block)

ans_roberta_input = Input(name = 'ans_roberta_out', shape = (768*4,), dtype = 'float32')
ans_roberta_concat = Concatenate(axis = -1)([tb_roberta_out, ans_roberta_input])
ans_roberta_block = block(d1_rate = 0.2, dense_1_units = 1024, d2_rate = 0.2, dense_2_units = 512, d3_rate = 0.1, seed = seed)(ans_roberta_concat)
ans_roberta_out = Dense(units = 9, activation = 'sigmoid', kernel_initializer = tf.keras.initializers.he_normal(seed = seed))(ans_roberta_block)

ans_xlnet_input = Input(name = 'ans_xlnet_out', shape = (768*4,), dtype = 'float32')
ans_xlnet_block = block(d1_rate = 0.2, dense_1_units = 1024, d2_rate = 0.2, dense_2_units = 512, d3_rate = 0.1,  seed = seed)(ans_xlnet_input)
ans_xlnet_out = Dense(units = 9, activation = 'sigmoid', kernel_initializer = tf.keras.initializers.he_normal(seed = seed))(ans_xlnet_block)

# Element-wise mean of the four answer-target predictions.
ans_out = tf.keras.layers.Average()([ans_bert_out ,ans_use_out ,ans_roberta_out ,ans_xlnet_out])

# *----------------- concat -----------------*
out =  Concatenate(axis = -1)([tb_out, ans_out])

model = Model(inputs = [tb_bert_input, tb_use_input, tb_roberta_input, tb_xlnet_input, ans_bert_input, ans_use_input, ans_roberta_input, ans_xlnet_input], outputs = out)
model.summary()

In [None]:
# Post-processing: binning
def return_bins(arr):
  """Return the midpoints between consecutive sorted unique values of `arr`.

  An input with k distinct values yields a list of k-1 bin edges (an empty
  list when there is only one distinct value).
  """
  levels = np.unique(arr)
  return [(levels[pos - 1] + levels[pos]) / 2 for pos in range(1, len(levels))]

# Per-target lookup tables built from the training labels: the distinct label
# values of each target and the midpoints between them. The label matrix is
# extracted once and the number of targets is taken from it instead of a
# hard-coded 30.
target_values = train[tar_features].values
unique_val_30 = [np.unique(target_values[:, i]) for i in range(target_values.shape[1])]
bins_30 = [return_bins(target_values[:, i]) for i in range(target_values.shape[1])]

def binned_out(y_pred):
  """Snap each predicted column to the label values seen in training.

  For column i, predictions are located among the bin edges `bins_30[i]` and
  replaced by the corresponding entry of `unique_val_30[i]`. Returns a new
  float array with the same shape as `y_pred`.
  """
  n_targets = y_pred.shape[1]
  snapped = np.zeros(y_pred.shape)
  for target_idx in range(n_targets):
    level_idx = np.digitize(y_pred[:, target_idx], bins_30[target_idx])
    snapped[:, target_idx] = unique_val_30[target_idx][level_idx]
  return snapped

In [None]:
# Definición callbacks
!rm -r '/content/saved models'
!rm -r '/content/logs'
!mkdir '/content/saved models'
!mkdir '/content/logs/'

# tensorboard callback
# One timestamped log directory per run. `write_grads` is no longer passed: the
# TF2 TensorBoard callback ignores it (with a warning).
import datetime
log_dir="logs/" + datetime.datetime.now().strftime("%Y-%m-%d %H_%M_%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = log_dir, histogram_freq=1, write_graph=True)

# spearman function
from scipy.stats import pearsonr, spearmanr
def compute_spearman(y_true, y_pred, final_pred):
  """Column-wise Spearman correlation between `y_true` and `final_pred`.

  `y_pred` is accepted for call compatibility but is not used.

  Returns:
    (per_column, mean): an array with one correlation per column rounded to
    5 decimals, and their NaN-ignoring mean rounded to 5 decimals.
  """
  n_cols = y_true.shape[1]
  per_column = [
      round(spearmanr(y_true[:, col_idx], final_pred[:, col_idx])[0], 5)
      for col_idx in range(n_cols)
  ]
  overall = round(np.nanmean(per_column), 5)
  return np.array(per_column), overall

# Custom Spearman metric callback
class print_spearman(tf.keras.callbacks.Callback):
    """Print train / validation Spearman correlations at the end of every epoch.

    Args:
        train_data: (inputs, targets) pair for the training split.
        validation_data: (inputs, targets) pair for the validation split.

    Attributes set during training:
        all_feat_spearman: one dict per epoch with the per-target correlations.
        spearman_dict: per-epoch mean correlations under 'train_spearman' and
            'val_spearman'.
    """
    def __init__(self, train_data, validation_data):
        # Plain super() runs Callback.__init__; the previous
        # super(tf.keras.callbacks.Callback, self) form skipped it.
        super().__init__()
        self.x, self.y = train_data
        self.val_x, self.val_y = validation_data

    def on_train_begin(self, logs=None):
        """Reset the per-epoch history containers."""
        self.all_feat_spearman = []
        self.spearman_dict = {'train_spearman' :[], 'val_spearman' :[]}

    def on_epoch_end(self, epoch, logs=None):
        """Predict on both splits, bin the predictions and record/print the scores."""
        self.epoch = epoch

        # Evaluate on the train and validation splits.
        print('\nspearman :')
        y_pred = self.model.predict(x = self.x)
        y_pred_val = self.model.predict(x = self.val_x)

        # Snap predictions to the label values seen in training before scoring.
        final_pred = binned_out(y_pred)
        final_pred_val = binned_out(y_pred_val)

        train_spear_lst, train_spearman = compute_spearman(self.y, y_pred, final_pred)
        val_spear_lst, val_spearman = compute_spearman(self.val_y, y_pred_val, final_pred_val)

        self.all_feat_spearman.append({'train_spearman' : train_spear_lst, 'val_spearman' : val_spear_lst})

        self.spearman_dict['train_spearman'].append(train_spearman)
        self.spearman_dict['val_spearman'].append(val_spearman)
        # `learning_rate` is the canonical optimizer attribute (`lr` is an alias).
        prev_epoch_lr  = tf.keras.backend.eval(self.model.optimizer.learning_rate)
        print("train_spearman : {} | val_spearman : {} | Learning_Rate : {}".format(train_spearman, val_spearman, round(prev_epoch_lr, 6)))
        print('train_spear_lst : ', train_spear_lst, '\n' 'val_spear_lst :', val_spear_lst)

# Multiply the learning rate by sqrt(0.1) when val_loss has not improved for 7 epochs.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor= 'val_loss', factor=np.sqrt(0.1), patience=7, verbose=1)

# Save the weights after every epoch in TensorFlow checkpoint format. The
# prefix deliberately has no file extension: it produces the
# 'weights.-<epoch>-<val_loss>.index' / '.data-00000-of-00001' pair that the
# later copy and `load_weights` cells refer to. The previous
# 'weights-...hdf5' pattern wrote a single HDF5 file under a different name,
# so those cells could not find their files.
checkpt = tf.keras.callbacks.ModelCheckpoint('/content/saved models/weights.-{epoch:03d}-{val_loss:.5f}', monitor='val_loss', verbose=1, save_best_only=False, save_weights_only=True)

# Spearman reporting callback, fed with the same train / cv arrays used for fitting.
print_spearman_fn = print_spearman(train_data = ([tb_BERT_train, tb_USE_train, tb_RoBERTa_train, tb_XLNet_train, ans_BERT_train, ans_USE_train, ans_RoBERTa_train, ans_XLNet_train],
                                                 y_train.values),
                                 validation_data = ([tb_BERT_cv, tb_USE_cv, tb_RoBERTa_cv, tb_XLNet_cv, ans_BERT_cv, ans_USE_cv, ans_RoBERTa_cv, ans_XLNet_cv],
                                                    y_cv.values))
callbacks = [print_spearman_fn, reduce_lr, checkpt, tensorboard_callback]

In [None]:
# best_model : train a single model
tf.keras.backend.clear_session()
opt = tf.keras.optimizers.Adam(learning_rate = 0.00008)
rmse = tf.keras.metrics.RootMeanSquaredError()
model.compile(loss = 'binary_crossentropy', optimizer = opt,  metrics = [rmse])

# Eight feature matrices per split: four tb_* followed by four ans_*.
fit_inputs = [tb_BERT_train, tb_USE_train, tb_RoBERTa_train, tb_XLNet_train,
              ans_BERT_train, ans_USE_train, ans_RoBERTa_train, ans_XLNet_train]
fit_validation = ([tb_BERT_cv, tb_USE_cv, tb_RoBERTa_cv, tb_XLNet_cv,
                   ans_BERT_cv, ans_USE_cv, ans_RoBERTa_cv, ans_XLNet_cv],
                  y_cv.values)

history = model.fit(
  x = fit_inputs,
  y = y_train.values,
  validation_data = fit_validation,
  batch_size = 64,
  epochs = 60,
  callbacks = callbacks,
)

In [None]:
# best_results : (val_spearman, epoch_no) — highest mean validation Spearman and its 0-based epoch index
val_spearman_history = print_spearman_fn.spearman_dict['val_spearman']
(max(val_spearman_history), np.argmax(val_spearman_history))

In [None]:
# Save the best model: copy the two files of the chosen checkpoint (.data-00000-of-00001 and .index) to Google Drive.
# NOTE(review): the epoch (029) and val_loss (0.37055) in the file names are hard-coded from one
# particular run — update them after retraining.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — TODO confirm.
!cp '/content/saved models/weights.-029-0.37055.data-00000-of-00001' '/content/drive/My Drive/AAIC Course/Personal case study 2  - Google Quest/'
!cp '/content/saved models/weights.-029-0.37055.index' '/content/drive/My Drive/AAIC Course/Personal case study 2  - Google Quest/'

In [None]:
# Restore the weights of the chosen checkpoint into `model`.
# NOTE(review): the checkpoint prefix (epoch 029, val_loss 0.37055) is hard-coded from one particular run — confirm it exists after retraining.
model.load_weights('/content/saved models/weights.-029-0.37055')

In [None]:
# Load the TensorBoard notebook extension and open it on the log directory.
%load_ext tensorboard
%tensorboard --logdir /content/logs/

# Análisis del Error

In [None]:
# Per-target Spearman scores of the epoch with the highest mean validation
# Spearman. The epoch is looked up from the recorded history instead of using
# an index hard-coded from one particular run.
best_epoch_idx = int(np.argmax(print_spearman_fn.spearman_dict['val_spearman']))
best_epoch_scores = print_spearman_fn.all_feat_spearman[best_epoch_idx]

# Replace NaN scores by 0.0. np.where builds new arrays, so the history kept
# inside the callback is not modified in place.
train_spear_lst = np.where(np.isnan(best_epoch_scores['train_spearman']), 0.0, best_epoch_scores['train_spearman'])
val_spear_lst = np.where(np.isnan(best_epoch_scores['val_spearman']), 0.0, best_epoch_scores['val_spearman'])

print('train_spear_lst :', train_spear_lst, '\n\nval_spear_lst :', val_spear_lst)

In [None]:
# Difference between the baseline val_spearman and the current model's val_spearman, per target
baseline_val_spear = np.array([
  0.27277, 0.5568, 0.45952, 0.21, 0.28364, 0.40028, 0.2506, 0.44999, 0.43819, 0.04555,
  0.35599, 0.6036, 0.55155, 0.0979, 0.50172, 0.45533, 0.74697, 0.21801, 0.53282, 0.03014,
  0.49008, 0.24314, 0.28342, 0.08022, 0.184, 0.29888, 0.64717, 0.13953, 0.53439, 0.10737,
])
featurewise_change = val_spear_lst - baseline_val_spear
print('featurewise change :', featurewise_change)

In [None]:
# Spearman plot for every target feature: markers first, then the labelled
# lines (the drawing order is kept so the automatic colour cycle is unchanged).
plt.figure(figsize = (16, 8))
for scores in (train_spear_lst, val_spear_lst):
  plt.plot(scores, 'o')
for scores, legend_name in ((train_spear_lst, 'train_spearman'), (val_spear_lst, 'val_spearman')):
  plt.plot(scores, label = legend_name)
plt.grid()
plt.legend()

# x axis: one tick per target, labelled 1..30.
plt.xticks(ticks = range(30), labels = range(1, 31))
# y axis: ticks every 0.1; the labels are a float32 copy of the same range.
y_tick_labels = np.arange(0, 1.1, 0.1, dtype = np.float32)
plt.yticks(ticks = np.arange(0, 1.1, 0.1), labels = y_tick_labels)
plt.show()

In [None]:
# Mean val_spearman of the well and the poorly predicted targets.
# A target counts as poorly predicted when its val_spearman is below 0.20;
# every other target goes into the "best" group.
# The index range follows len(val_spear_lst) instead of a hard-coded 30, so
# both lists always partition exactly the scores that are present.
below_20_idx = [i for i in range(len(val_spear_lst)) if val_spear_lst[i] < 0.2]
weak_positions = set(below_20_idx)
above_20_idx = [i for i in range(len(val_spear_lst)) if i not in weak_positions]

print('mean spearman for best target features :', np.mean(val_spear_lst[above_20_idx]), '\nmean spearman for worst target features :', np.mean(val_spear_lst[below_20_idx]))

In [None]:
# Names of the well and the poorly performing target features
target_names = np.array(tar_features)
print('best target features :', target_names[above_20_idx], '\n\nworst target features :', target_names[below_20_idx])

**¿Por qué estas características no funcionan bien?**




In [None]:
# -> 1. basics: summary statistics of the poorly predicted targets
worst_target_names = np.array(tar_features)[below_20_idx]
train[worst_target_names].describe()

In [None]:
# -> 2. distributions of the poorly predicted targets (at most six, on a 2x3 grid)
plt.figure(figsize = (30, 11))
# Slicing instead of a fixed range(6) avoids an IndexError when fewer than six
# targets fall below the threshold.
weak_columns = np.array(tar_features)[below_20_idx][:6]
for panel, column in enumerate(weak_columns, start = 1):
  plt.subplot(2, 3, panel)
  # histplot replaces the deprecated sns.distplot. It draws the histogram in
  # counts with the KDE curve overlaid, which matches the 'count' axis label.
  sns.histplot(train[column].values, kde = True)

  plt.xlabel(column)
  plt.ylabel('count')
  plt.grid()
plt.show()

In [None]:
# 3. -> bins: how often each label value occurs in the poorly predicted targets
weak_target_names = np.array(tar_features)[below_20_idx]
for column in weak_target_names:
  print(train[column].value_counts(), '\n')

**¿Por qué estas características funcionan bien?**

In [None]:
# Top 6 features: positions of the six highest val_spearman scores (ascending order)
score_order = np.argsort(val_spear_lst)
top_06_idx = score_order[-6:]

# Names of those six target features
top_06_names = np.array(tar_features)[top_06_idx]
print('top_6 target features :', top_06_names)

In [None]:
# -> 1. basics: summary statistics of the six best predicted targets
best_target_names = np.array(tar_features)[top_06_idx]
train[best_target_names].describe()

In [None]:
# -> 2. distributions of the six best predicted targets (2x3 grid)
plt.figure(figsize = (30, 11))
top_columns = np.array(tar_features)[top_06_idx][:6]
for panel, column in enumerate(top_columns, start = 1):
  plt.subplot(2, 3, panel)
  # histplot replaces the deprecated sns.distplot. It draws the histogram in
  # counts with the KDE curve overlaid, which matches the 'count' axis label.
  sns.histplot(train[column].values, kde = True)

  plt.xlabel(column)
  plt.ylabel('count')
  plt.grid()
plt.show()

In [None]:
# 3. -> bins: how often each label value occurs in the six best predicted targets
top_target_names = np.array(tar_features)[top_06_idx]
for column in top_target_names:
  print(train[column].value_counts(), '\n')

**¿Cuáles son los tipos de puntos de datos que dan las mejores y peores predicciones?**

In [None]:
cv_inputs = [tb_BERT_cv, tb_USE_cv, tb_RoBERTa_cv, tb_XLNet_cv, ans_BERT_cv, ans_USE_cv, ans_RoBERTa_cv, ans_XLNet_cv]
y_pred_val = model.predict(cv_inputs) # predictions on the cv split
final_pred_val = binned_out(y_pred_val) # binned model output

# Number of targets predicted exactly (zero error after binning) for every
# data point. The comparison runs once over the whole matrix; the previous
# loop rebuilt the full difference matrix on every iteration.
exact_match = (y_cv.values - final_pred_val) == 0.0
zero_lst = exact_match.sum(axis = 1).tolist()

In [None]:
# 1. basics: summary statistics of the per-sample exact-match counts
zero_count_frame = pd.DataFrame({'zero_lst' : zero_lst})
zero_count_frame.describe()

In [None]:
# 2. best and worst data points, ranked by their number of exactly predicted targets
match_ranking = np.argsort(zero_lst)
top_10_dp_idx = match_ranking[-10:][::-1]
bottom_10_dp_idx = match_ranking[:10]

title_body_arr = np.array(title_body_cv)
answer_arr = np.array(answer_cv)
print('best_10 title_body :', title_body_arr[top_10_dp_idx], '\n')
print('worst_10 title_body :', title_body_arr[bottom_10_dp_idx], '\n')
print('best_10 answer :', answer_arr[top_10_dp_idx], '\n')
print('worst_10 answer :', answer_arr[bottom_10_dp_idx], '\n')

In [None]:
# 3. Word counts (split on single spaces) of the best and worst data points
for tag, texts, positions in (
  ('best_10 title_body :', title_body_cv, top_10_dp_idx),
  ('worst_10 title_body :', title_body_cv, bottom_10_dp_idx),
  ('best_10 answer :', answer_cv, top_10_dp_idx),
  ('worst_10 answer :', answer_cv, bottom_10_dp_idx),
):
  print(tag, [len(text.split(' ')) for text in np.array(texts)[positions]])

# **Resultado Final**

In [None]:
from prettytable import PrettyTable

# Final comparison: one row per model with its train and validation Spearman.
result_rows = [
  ["baseline_LSTM", 0.4995, 0.34899],
  ["BERT", 0.50081, 0.39536],
  ["USE",  0.52245, 0.4366],
  ["RoBERTa", 0.48059, 0.41772],
  ["XLNet", 0.52237, 0.37525],
  ["BERT_USE_RoBERTa_XLNet", 0.5068, 0.43924],
]

x = PrettyTable()
x.field_names = ["Model", "train_spearman", "val_spearman"]
for row in result_rows:
  x.add_row(row)

print(x)