In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
DATA_PATH = "drive/My Drive/colab/DeepNLP/data/"
FILE_NAME = "cb_join_gte4_onehot.tsv"

In [3]:
!pip install -q transformers
!pip install -q tensorflow

In [4]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
import tensorflow as tf

In [5]:
from torch import cuda
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

Found GPU at: /device:GPU:0


'Tesla P100-PCIE-16GB'

In [6]:
df = pd.read_csv(DATA_PATH + FILE_NAME, delimiter="\t").drop(labels=["relation"], axis=1)
df.head()

Unnamed: 0,sentence,R-3-50,R-2-50,R-14-50,R-5B-7,R-3-13,R-4-50,R-12-14,R-13-7,R-1-5,R-13-50,R-1-2,R-5B-14,R-7-14,R-5-5B,R-1-7,R-6-50,R-12-13,R-3-1,R-5B-50,R-4-5B,R-7-5B,R-1-3,R-7-50,R-3-4,R-11-50,R-3-6,R-11-12,R-5-7,R-3-14,R-3-5,R-4-14,R-5B-5,R-3-7,R-14-6,R-7-6,R-2-3,R-6-14,R-11-14,R-13-14,R-4-5,R-13-11,R-6-7,R-50-7,R-1-50,R-50-50,R-1-4,R-4-7,R-11-13,R-4-3,R-12-50,R-5-50,R-1-6,R-11-3,R-50-3
0,Coral and zooxanthellae depend an each other i...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"If the coral dies, or gets bleached, then the ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Or the other way around.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"In the text Shifting Trade Winds, it talks abo...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,And another source states how when the water t...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
cols = list(df)
label_cols = cols[1:len(cols)]
num_labels = len(label_cols)
label_cols[:10]

['R-3-50',
 'R-2-50',
 'R-14-50',
 'R-5B-7',
 'R-3-13',
 'R-4-50',
 'R-12-14',
 'R-13-7',
 'R-1-5',
 'R-13-50']

In [8]:
for rel in label_cols:
  df[rel] = df[rel].astype(int)
df.head()

Unnamed: 0,sentence,R-3-50,R-2-50,R-14-50,R-5B-7,R-3-13,R-4-50,R-12-14,R-13-7,R-1-5,R-13-50,R-1-2,R-5B-14,R-7-14,R-5-5B,R-1-7,R-6-50,R-12-13,R-3-1,R-5B-50,R-4-5B,R-7-5B,R-1-3,R-7-50,R-3-4,R-11-50,R-3-6,R-11-12,R-5-7,R-3-14,R-3-5,R-4-14,R-5B-5,R-3-7,R-14-6,R-7-6,R-2-3,R-6-14,R-11-14,R-13-14,R-4-5,R-13-11,R-6-7,R-50-7,R-1-50,R-50-50,R-1-4,R-4-7,R-11-13,R-4-3,R-12-50,R-5-50,R-1-6,R-11-3,R-50-3
0,Coral and zooxanthellae depend an each other i...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"If the coral dies, or gets bleached, then the ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Or the other way around.,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"In the text Shifting Trade Winds, it talks abo...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,And another source states how when the water t...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
df['relations'] = list(df[label_cols].values)

In [10]:
df2 = df.copy(deep=True).filter(['sentence', 'relations'])
df2.head(5)

Unnamed: 0,sentence,relations
0,Coral and zooxanthellae depend an each other i...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"If the coral dies, or gets bleached, then the ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Or the other way around.,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"In the text Shifting Trade Winds, it talks abo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,And another source states how when the water t...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 4
EPOCHS = 4
LEARNING_RATE = 2e-5
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [12]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.sentence = dataframe.sentence
        self.targets = self.data.relations
        self.max_len = max_len

    def __len__(self):
        return len(self.sentence)

    def __getitem__(self, index):
        sentence = str(self.sentence[index])
        sentence = " ".join(sentence.split())

        inputs = self.tokenizer.encode_plus(
            sentence,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [13]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=df2.sample(frac=train_size,random_state=42)
test_dataset=df2.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df2.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (10329, 2)
TRAIN Dataset: (8263, 2)
TEST Dataset: (2066, 2)


In [14]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [15]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, len(label_cols))
  
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [16]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [17]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [18]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids,)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [19]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.7231941223144531
Epoch: 1, Loss:  0.055757276713848114
Epoch: 2, Loss:  0.03594506159424782
Epoch: 3, Loss:  0.041325900703668594


In [20]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [65]:
threshold = 0.06

outputs_global, targets_global = [], []
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs_global.append(outputs)
    targets_global.append(targets)
    outputs = np.array(outputs) >= threshold
    accuracy = metrics.accuracy_score(targets, outputs.round())
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

  average, "true nor predicted", 'F-score is', len(true_sum)


Accuracy Score = 0.5813165537270087
F1 Score (Micro) = 0.2804213585179804
F1 Score (Macro) = 0.0492093328547252
Accuracy Score = 0.5813165537270087
F1 Score (Micro) = 0.2804213585179804
F1 Score (Macro) = 0.0492093328547252
Accuracy Score = 0.5813165537270087
F1 Score (Micro) = 0.2804213585179804
F1 Score (Macro) = 0.0492093328547252
Accuracy Score = 0.5813165537270087
F1 Score (Micro) = 0.2804213585179804
F1 Score (Macro) = 0.0492093328547252


In [66]:
out = []
for arr in outputs_global:
  out += arr
out = [[1 if i >= threshold else 0 for i in j] for j in out]

target = []
for arr in targets_global:
  target += arr

In [67]:
print(metrics.classification_report(target, out))

              precision    recall  f1-score   support

           0       0.17      0.72      0.27       376
           1       0.00      0.00      0.00        12
           2       0.00      0.00      0.00        48
           3       0.00      0.00      0.00        16
           4       0.00      0.00      0.00         0
           5       0.25      0.17      0.20        96
           6       0.00      0.00      0.00        16
           7       0.00      0.00      0.00        12
           8       0.00      0.00      0.00        16
           9       0.00      0.00      0.00        96
          10       0.00      0.00      0.00       132
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00        12
          13       0.00      0.00      0.00        52
          14       0.00      0.00      0.00        16
          15       0.00      0.00      0.00        60
          16       1.00      0.06      0.11        72
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
# outputs_global[0][:5]