<a href="https://colab.research.google.com/github/expert-search/glg-ack/blob/main/cgc-1.3-DistilBERT_With_HuggingFace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GrailQA Dataset Exploration w/ BERT (w/ domain selection and mapping)

Original (BERT model) adpated from https://towardsdatascience.com/multi-label-multi-class-text-classification-with-bert-transformer-and-keras-c6355eccb63a

Additional DistilBERT info gleaned from https://medium.com/nerd-for-tech/building-a-personal-ai-assistant-part-2-afb26c2a3b5b

In [1]:
!pip install transformers
from transformers import TFDistilBertModel, DistilBertConfig, DistilBertTokenizerFast

from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.math import confusion_matrix
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy, Precision, Recall
from tensorflow.keras.utils import to_categorical

import pandas as pd
from sklearn.model_selection import train_test_split



In [2]:
!pip install tensorflow_addons
from tensorflow_addons.metrics import F1Score



In [3]:
!pip install datasets
from datasets import load_dataset
dataset = load_dataset(
   'grail_qa')



Using custom data configuration default
Reusing dataset grail_qa (/root/.cache/huggingface/datasets/grail_qa/default/0.0.0/2be99c6902e842f3ba87dd36fa96a2236206438ebb67c5e5464a36e4049fd3fb)


In [4]:
df_train = pd.DataFrame(dataset['train'])
df_valid = pd.DataFrame(dataset['validation'])
df_test = pd.DataFrame(dataset['test'])

In [5]:
df_train.head()

Unnamed: 0,qid,question,answer,function,num_node,num_edge,graph_query,sparql_query,domains,level,s_expression
0,2101535001000,oxybutynin chloride 5 extended release film co...,"{'answer_type': ['Entity', 'Entity'], 'answer_...",none,2,1,"{'nodes': {'nid': [0, 1], 'node_type': ['class...",PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-...,[medicine],,(AND medicine.routed_drug (JOIN medicine.route...
1,2100954014000,the type single-sex school are in which instit...,"{'answer_type': ['Entity', 'Entity', 'Entity',...",none,2,1,"{'nodes': {'nid': [0, 1], 'node_type': ['class...",PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-...,[education],,(AND education.educational_institution (JOIN e...
2,3206374001000,the leaders of the earliest established religi...,"{'answer_type': ['Entity'], 'answer_argument':...",argmin,3,2,"{'nodes': {'nid': [0, 1, 2], 'node_type': ['cl...",PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-...,[religion],,(ARGMIN religion.religious_leadership_title (J...
3,2100735000000,"on 07/01/1970, which warship v1.1 was hit?","{'answer_type': ['Entity'], 'answer_argument':...",none,2,1,"{'nodes': {'nid': [0, 1], 'node_type': ['class...",PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-...,[user.patrick.default_domain],,(AND user.patrick.default_domain.warship_v1_1 ...
4,2102770001000,what is the language regulator of basque?,"{'answer_type': ['Entity'], 'answer_argument':...",none,2,1,"{'nodes': {'nid': [0, 1], 'node_type': ['class...",PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-...,[language],,(AND language.language_regulator (JOIN languag...


The test partition has empty lists for the 'domains' column 🤔

In [6]:
df_train['label'] = pd.Categorical([domains[0] for domains in df_train['domains']])
df_valid['label'] = pd.Categorical([domains[0] for domains in df_valid['domains']])

In [7]:
len(df_train[(df_train['label'] == 'medicine')])

2013

In [8]:
domains_to_keep = ['medicine', 'computer', 'spaceflight', 'biology', 'automotive', 'internet', 'engineering']

In [9]:
df_train_other = df_train[df_train['label'].isin(domains_to_keep) == False]
df_valid_other = df_valid[df_valid['label'].isin(domains_to_keep) == False]

In [10]:
df_train = df_train[df_train['label'].isin(domains_to_keep)]
df_valid = df_valid[df_valid['label'].isin(domains_to_keep)]

In [11]:
domain_map = {
    'medicine': 'healthcare',
    'computer': 'technology',
    'spaceflight': 'technology',
    'biology': 'healthcare',
    'automotive': 'technology',
    'internet': 'technology',
    'engineering': 'technology'
}
df_train['label'] = df_train['label'].map(domain_map)
df_valid['label'] = df_valid['label'].map(domain_map)

In [12]:
df_train_other['label'] = 'other'
df_valid_other['label'] = 'other'

In [13]:
df_train_other_subset = df_train_other.sample(n=3700, random_state=42)
df_valid_other_subset = df_valid_other.sample(n=350, random_state=42)

In [14]:
df_train = pd.concat([df_train, df_train_other_subset])
df_valid = pd.concat([df_valid, df_valid_other_subset])

In [15]:
df_train['label'] = pd.Categorical([label for label in df_train['label']])
df_valid['label'] = pd.Categorical([label for label in df_valid['label']])
df_train['numeric_label'] = df_train['label'].cat.codes
df_valid['numeric_label'] = df_valid['label'].cat.codes

In [16]:
print(len(df_train['label'].unique()))
df_train['label'].value_counts()

3


technology    4989
other         3700
healthcare    3286
Name: label, dtype: int64

In [17]:
df_valid['label'].value_counts()

technology    409
other         350
healthcare    304
Name: label, dtype: int64

### Fetch DistilBERT pre-trained encoder and tokenizer

In [18]:
#######################################
### --------- Setup BERT ---------- ###
# Name of the BERT model to use
model_name = 'distilbert-base-uncased'
# Max length of tokens
max_length = 100
# Load transformers config
config = DistilBertConfig.from_pretrained(model_name)
#config = DistilBertConfig()
config.output_hidden_states = True
config.num_labels = len(df_train['label'].unique())
# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path=model_name,
                                                    config=config)
# Load the Transformers DistilBERT model
transformer_model = TFDistilBertModel.from_pretrained(model_name,
                                                      config =config)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [19]:
#######################################
### ------- Build the model ------- ###
# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model
# Load the MainLayer
bert = transformer_model.layers[0]
# Build your model input
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}
# Load the Transformers BERT model as a layer in a Keras model
bert_model = bert(inputs)[0]
dropout = Dropout(config.dropout, name='pooled_output')
pooled_output = dropout(bert_model, training=False)
cls_token = pooled_output[:, 0, :]
# Then build your model output
label = Dense(units=len(df_train['numeric_label'].value_counts()),
              kernel_initializer=TruncatedNormal(
                  stddev=config.initializer_range
                  ), 
              name='label')(cls_token)
outputs = {'label': label}
# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='DistilBERT_MultiClass')
# Take a look at the model
model.summary()

Model: "DistilBERT_MultiClass"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 100)]             0         
_________________________________________________________________
distilbert (TFDistilBertMain TFBaseModelOutput(last_hi 66362880  
_________________________________________________________________
pooled_output (Dropout)      (None, 100, 768)          0         
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 768)               0         
_________________________________________________________________
label (Dense)                (None, 3)                 2307      
Total params: 66,365,187
Trainable params: 66,365,187
Non-trainable params: 0
_________________________________________________________________


### Model Training

In [20]:
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0
    )

loss = {'label': CategoricalCrossentropy(from_logits = True)}
metric = {'label': CategoricalAccuracy('accuracy')}

model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

y_label = to_categorical(df_train['numeric_label'])

x = tokenizer(
    text=df_train['question'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True)

history = model.fit(
    x={'input_ids': x['input_ids']},
    y={'label': y_label},
    validation_split=0.2,
    batch_size=64,
    epochs=5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
from google.colab import drive
drive.mount('/content/drive')
import os
# The path below should point to the directory containing this notebook and the associated utility files
# Change it if necessary
os.chdir('/content/drive/MyDrive/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
#model.save('model_bert_w_other_label.hdf5')
with open('distilbert_model.json', 'w') as f:
    f.write(model.to_json())
model.save_weights('distilbert_weights.hdf5')

### Evalutation

Using the dev partition since the test partition is unlabeled.

In [23]:
test_y = to_categorical(df_valid['numeric_label'])
test_x = tokenizer(
    text=df_valid['question'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding="max_length",
    pad_to_max_length=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True)

model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids']},
    y={'label': test_y}
)



### Verify Accuracy

In [24]:
preds = model.predict(x={'input_ids': test_x['input_ids']})

In [25]:
from tensorflow.math import argmax
correct = 0
for pred, expected in zip(argmax(preds['label'], axis=1),
                          df_valid['numeric_label']):
  if pred == expected:
    correct += 1
print(f"Accuracy: {correct / len(df_valid)}")

Accuracy: 0.9303857008466604


### Confusion Matrix

In [26]:
from tensorflow import argmax
confusion_matrix(df_valid['numeric_label'],
                 argmax(preds['label'], axis=1),
                 num_classes=len(df_valid['label'].unique()))

<tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[304,   0,   0],
       [ 11, 277,  62],
       [  0,   1, 408]], dtype=int32)>

### OOD Evaluation
[Yahoo QA Dataset](https://huggingface.co/datasets/viewer/?dataset=yahoo_answers_qa)

In [27]:
yahoo_dataset = load_dataset(
   'yahoo_answers_qa')

Reusing dataset yahoo_answers_qa (/root/.cache/huggingface/datasets/yahoo_answers_qa/yahoo_answers_qa/1.0.0/62f63c2dc317317049c5a213c97370fe2989ead076488347df250a4b35da10d7)


In [28]:
yahoo_df = pd.DataFrame(yahoo_dataset['train'])

In [29]:
yahoo_df.head()

Unnamed: 0,id,question,answer,nbestanswers,main_category
0,2020338,Why did the U.S Invade Iraq ?,A small group of politicians believed strongly...,[A small group of politicians believed strongl...,News & Events
1,2874684,How to get rid of a beehive?,Call an area apiarist. They should be able to...,[Call an area apiarist. They should be able t...,Education & Reference
2,4193114,Why don't European restaurants serve water?,There's a general belief in Europe (and in fac...,[There's a general belief in Europe (and in fa...,Society & Culture
3,1908421,Why hybrid cars gas mileage is better in city ?,hybrid cars save energy in two ways: 1.by stor...,[hybrid cars save energy in two ways: 1.by sto...,Cars & Transportation
4,3608897,Can someone explain the theory of e=mc2?,In general it means that in a very high speed ...,[In general it means that in a very high speed...,Science & Mathematics


In [30]:
yahoo_df['main_category'].unique()

array(['News & Events', 'Education & Reference', 'Society & Culture',
       'Cars & Transportation', 'Science & Mathematics',
       'Politics & Government', 'Pets', 'Food & Drink',
       'Business & Finance', 'Computers & Internet', 'Games & Recreation',
       'Health', 'Travel', 'Social Science', 'Yahoo! Products',
       'Consumer Electronics', 'Arts & Humanities', 'Dining Out',
       'Local Businesses', 'Asia Pacific', 'Yahoo!7 Products'],
      dtype=object)

In [31]:
yahoo_df['main_category'].value_counts()

Computers & Internet     12229
Health                   10699
Science & Mathematics     8728
Society & Culture         8651
Business & Finance        8298
Education & Reference     7343
Yahoo! Products           5335
Politics & Government     4788
Cars & Transportation     3647
Arts & Humanities         3521
Food & Drink              3174
Pets                      3069
Consumer Electronics      2038
Games & Recreation        1591
Social Science            1407
Travel                    1357
News & Events              843
Dining Out                 325
Local Businesses           273
Yahoo!7 Products            36
Asia Pacific                10
Name: main_category, dtype: int64

In [32]:
yahoo_df['label'] = pd.Categorical([main_cat for main_cat in yahoo_df['main_category']])

In [33]:
domains_to_keep = ['Computers & Internet', 'Health']

In [34]:
yahoo_df_other = yahoo_df[yahoo_df['label'].isin(domains_to_keep) == False]
yahoo_df = yahoo_df[yahoo_df['label'].isin(domains_to_keep)]

In [35]:
domain_map = {
    'Health': 'healthcare',
    'Computers & Internet': 'technology'
}
yahoo_df['label'] = yahoo_df['label'].map(domain_map)

In [36]:
yahoo_df_other['label'] = 'other'
yahoo_df_other_subset = yahoo_df_other.sample(n=11000, random_state=42)

In [37]:
yahoo_df = pd.concat([yahoo_df, yahoo_df_other_subset])

In [38]:
yahoo_df['label'].value_counts()

technology    12229
other         11000
healthcare    10699
Name: label, dtype: int64

In [39]:
yahoo_df['label'] = pd.Categorical([label for label in yahoo_df['label']])
yahoo_df['numeric_label'] = yahoo_df['label'].cat.codes

In [40]:
yahoo_df.head()

Unnamed: 0,id,question,answer,nbestanswers,main_category,label,numeric_label
24,599506,What is CGI?,Common Gateway Interface (CGI) is an important...,[Common Gateway Interface (CGI) is an importan...,Computers & Internet,technology,2
26,2472154,Why are blueberries so good for your health?,"Blueberries are high in anti-oxidants. Also, t...","[Blueberries are high in anti-oxidants. Also, ...",Health,healthcare,0
31,2519839,Why do women get PMS?,Premenstrual syndrome (PMS) is a group of symp...,[Premenstrual syndrome (PMS) is a group of sym...,Health,healthcare,0
33,2802889,Why doesn't Yahoo mail beta support SafarI on ...,Because Firefox and IE comprise the majority o...,[Because Firefox and IE comprise the majority ...,Computers & Internet,technology,2
36,1947336,How should I encrypt/pw protect my backup file...,I recently backed up and restored a bunch of f...,[I recently backed up and restored a bunch of ...,Computers & Internet,technology,2


In [41]:
yahoo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33928 entries, 24 to 14141
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   id             33928 non-null  object  
 1   question       33928 non-null  object  
 2   answer         33928 non-null  object  
 3   nbestanswers   33928 non-null  object  
 4   main_category  33928 non-null  object  
 5   label          33928 non-null  category
 6   numeric_label  33928 non-null  int8    
dtypes: category(1), int8(1), object(5)
memory usage: 1.6+ MB


In [42]:
test_y = to_categorical(yahoo_df['numeric_label'])
test_x = tokenizer(
    text=yahoo_df['question'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding="max_length",
    pad_to_max_length=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True)

model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids']},
    y={'label': test_y}
)



#### Verify Accuracy + Confusion Matrix

In [43]:
yahoo_preds = model.predict(x={'input_ids': test_x['input_ids']})
correct = 0
for pred, expected in zip(argmax(yahoo_preds['label'], axis=1),
                          yahoo_df['numeric_label']):
  if pred == expected:
    correct += 1
print(f"Accuracy: {correct / len(yahoo_df)}")

Accuracy: 0.7158394246639943


In [44]:
confusion_matrix(yahoo_df['numeric_label'],
                 argmax(yahoo_preds['label'], axis=1),
                 num_classes=len(yahoo_df['label'].unique()))

<tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[7783, 2571,  345],
       [1950, 6976, 2074],
       [ 440, 1809, 9980]], dtype=int32)>