In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/9c/34/fb092588df61bf33f113ade030d1cbe74fb73a0353648f8dd938a223dce7/transformers-3.5.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 4.1MB/s 
[?25hCollecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 43.5MB/s 
[?25hCollecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 54.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

In [None]:
from torch import cuda
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [None]:
# Base folder for every file the cells below read or write.
# NOTE(review): the path looks like a Google Drive mount inside Colab, but no
# cell shown here performs the mount — confirm it is mounted before running.
ROOT_PATH = "/content/drive/My Drive/Magíster 2020"

In [None]:
# Read the train/test JSON files of the three sources (puc, puj, aurora) in the
# order: puc train, puc test, puj train, puj test, aurora train, aurora test.
puc_train, puc_test, puj_train, puj_test, aurora_train, aurora_test = (
    pd.read_json(
        f"{ROOT_PATH}/Ordenado/new_train_test_data/{source}_{split}.json",
        orient="records",
    )
    for source in ("puc", "puj", "aurora")
    for split in ("train_70", "test_30")
)

# Stack the per-source frames into one training frame and one test frame,
# renumbering the index so positions are contiguous.
train = pd.concat([puc_train, puj_train, aurora_train], ignore_index=True)
test = pd.concat([puc_test, puj_test, aurora_test], ignore_index=True)

In [None]:
# Number of tokens every abstract and title is truncated or padded to; it is
# passed as `max_length` to the tokenizer calls in the dataset class.
MAX_LENGTH = 512

In [None]:
class MyDataset(Dataset):
    """Torch dataset yielding a tokenized abstract, a tokenized title and a label.

    Parameters
    ----------
    data : pd.DataFrame
        Must provide the columns ``clean_abstract``, ``title`` and ``first_sdg``.
    tokenizer : optional
        Object exposing ``encode_plus`` with the Hugging Face call signature.
        When omitted, the ``roberta-base`` tokenizer is loaded. Passing one in
        lets several datasets share a single tokenizer instance.
    max_length : int, optional
        Length every sequence is truncated/padded to. Defaults to the
        notebook-level ``MAX_LENGTH``.
    """

    def __init__(self, data, tokenizer=None, max_length=None):
        self.data: pd.DataFrame = data
        self.abstracts: np.ndarray = data["clean_abstract"].values
        self.titles: np.ndarray = data["title"].values
        self.labels: np.ndarray = data["first_sdg"].values
        if tokenizer is None:
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=False)
        self.tokenizer = tokenizer
        # Resolve the global lazily so an explicit max_length never needs it.
        self.max_length = MAX_LENGTH if max_length is None else max_length

    def __len__(self):
        """Return the number of rows (one example per abstract)."""
        return len(self.abstracts)

    def _encode(self, text):
        """Collapse whitespace in ``text`` and tokenize it to fixed-length tensors.

        Returns the ``input_ids``, ``attention_mask`` and ``token_type_ids``
        entries of the tokenizer output, exactly as the tokenizer produced them.
        """
        normalized = " ".join(str(text).split())
        encoded = self.tokenizer.encode_plus(
            normalized,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        return encoded['input_ids'], encoded['attention_mask'], encoded["token_type_ids"]

    def __getitem__(self, index):
        """Return the encoded abstract, encoded title and label of row ``index``.

        The result is a dict with the keys ``abstract_ids``, ``abstract_mask``,
        ``abstract_token_type_ids``, ``title_ids``, ``title_mask``,
        ``title_token_type_ids`` and ``label`` (an int64 tensor).
        """
        abstract_ids, abstract_mask, abstract_token_type_ids = self._encode(self.abstracts[index])
        title_ids, title_mask, title_token_type_ids = self._encode(self.titles[index])

        return {
            'abstract_ids': abstract_ids,
            'abstract_mask': abstract_mask,
            'abstract_token_type_ids': abstract_token_type_ids,
            'title_ids': title_ids,
            'title_mask': title_mask,
            'title_token_type_ids': title_token_type_ids,
            'label': torch.tensor(self.labels[index], dtype=torch.long)
        }

In [None]:
# Wrap the training frame first, then the test frame, in MyDataset.
train_dataset, test_dataset = (MyDataset(frame) for frame in (train, test))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [None]:
# Both loaders feed the model one example per step.
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 1

In [None]:
# Training batches are reshuffled on every pass; the test loader keeps the
# dataset order. Both load data in the main process (num_workers=0).
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

training_loader = DataLoader(train_dataset, **train_params)
testing_loader = DataLoader(test_dataset, **test_params)

In [None]:
class Classifier(torch.nn.Module):
    """RoBERTa encoder applied to abstract and title separately, plus an MLP head.

    The same ``roberta-base`` encoder processes both texts. Their pooled outputs
    are concatenated and passed through linear -> ReLU -> dropout -> linear.

    Parameters
    ----------
    num_labels : int, optional
        Number of output logits (default 18).
        NOTE(review): the evaluation report lists labels 1-17, so index 0 is
        presumably never a target — confirm.
    dropout : float, optional
        Dropout probability applied before the final linear layer (default 0.3).
    """

    def __init__(self, num_labels=18, dropout=0.3):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base", output_attentions=True, return_dict=True)
        # Read the width from the encoder config instead of hard-coding 768.
        hidden_size = self.roberta.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.linear = torch.nn.Linear(hidden_size, num_labels)

    @staticmethod
    def _as_batch(tensor, device):
        """Move ``tensor`` to ``device`` as int64 and flatten it to (rows, last_dim).

        The row count is inferred, so any batch size works and inputs that are
        already 2-D pass through unchanged.
        """
        return tensor.to(device, dtype=torch.long).reshape(-1, tensor.size(-1))

    def _encode(self, batch, prefix):
        """Run the encoder on the ``{prefix}_ids/_mask/_token_type_ids`` entries.

        Returns the encoder's ``attentions`` and ``pooler_output``.
        """
        # Follow the module's own parameters so inputs always land where the
        # weights are, without relying on a notebook-level global.
        device = next(self.parameters()).device
        encoded = self.roberta(
            input_ids=self._as_batch(batch[f"{prefix}_ids"], device),
            attention_mask=self._as_batch(batch[f"{prefix}_mask"], device),
            token_type_ids=self._as_batch(batch[f"{prefix}_token_type_ids"], device),
        )
        return encoded.attentions, encoded.pooler_output

    def forward(self, batch):
        """Classify one batch produced by the data loader.

        Parameters
        ----------
        batch : dict
            Must contain ``abstract_ids``, ``abstract_mask``,
            ``abstract_token_type_ids``, ``title_ids``, ``title_mask`` and
            ``title_token_type_ids`` tensors.

        Returns
        -------
        tuple
            ``(abstract_attentions, title_attentions, logits)``.
        """
        abstract_attentions, abstract_pooled = self._encode(batch, "abstract")
        title_attentions, title_pooled = self._encode(batch, "title")
        features = torch.cat([abstract_pooled, title_pooled], dim=1)
        hidden = torch.nn.functional.relu(self.pre_classifier(features))
        output = self.linear(self.dropout(hidden))
        return abstract_attentions, title_attentions, output

In [None]:
model = Classifier()
model = model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




In [None]:
# Fine-tuning set-up: three passes over the training loader, Adam over every
# model parameter, cross-entropy on the raw logits.
EPOCHS = 3
LEARNING_RATE = 1e-5
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

def calcuate_accuracy(preds, targets):
    """Count how many predictions equal their target.

    Both tensors are flattened before comparing. Without that, a ``(B, 1)``
    prediction tensor compared against ``(B,)`` targets broadcasts to a
    ``(B, B)`` matrix and over-counts matches whenever ``B > 1``.

    Parameters
    ----------
    preds : torch.Tensor
        Predicted class ids, one per example (any shape holding B elements).
    targets : torch.Tensor
        True class ids, one per example.

    Returns
    -------
    int
        Number of positions where prediction and target agree.
    """
    n_correct = (preds.reshape(-1) == targets.reshape(-1)).sum().item()
    return n_correct

In [None]:
# Per-step loss values, stored as plain Python floats so the list does not keep
# device tensors alive and can be plotted directly.
loss_list = []
for epoch in range(EPOCHS):
    model.train()
    # Running totals for this epoch only.
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    # Wrapping the loader (not the enumerate) lets tqdm show the total and ETA.
    for index, batch in enumerate(tqdm(training_loader)):
        label = batch["label"].to(device, dtype=torch.long)
        optimizer.zero_grad()
        _, _, output = model(batch)
        loss = criterion(output, label)
        step_loss = loss.item()
        tr_loss += step_loss
        # Softmax is monotonic, so the arg-max of the logits is the top class.
        # The result has shape (batch,), the same as `label`.
        predicted = output.argmax(dim=1)
        n_correct += calcuate_accuracy(predicted, label)
        nb_tr_steps += 1
        nb_tr_examples += label.size(0)
        if index % 500 == 0:
            # Both figures are running averages since the start of the epoch.
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 500 steps: {loss_step}")
            print(f"Training Accuracy per 500 steps: {accu_step}")
        loss.backward()
        optimizer.step()
        loss_list.append(step_loss)
    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

1it [00:00,  2.93it/s]

Training Loss per 500 steps: 2.7877635955810547
Training Accuracy per 500 steps: 100.0


502it [00:57,  8.04it/s]

Training Loss per 500 steps: 2.3376013446234896
Training Accuracy per 500 steps: 35.12974051896207


1002it [01:54,  8.58it/s]

Training Loss per 500 steps: 2.072864635297647
Training Accuracy per 500 steps: 42.05794205794206


1502it [02:51,  8.35it/s]

Training Loss per 500 steps: 1.9009937654577598
Training Accuracy per 500 steps: 46.10259826782145


2002it [03:48,  8.50it/s]

Training Loss per 500 steps: 1.7687338072004197
Training Accuracy per 500 steps: 49.87506246876562


2502it [04:46,  8.83it/s]

Training Loss per 500 steps: 1.686288415881448
Training Accuracy per 500 steps: 52.059176329468215


3002it [05:43,  8.84it/s]

Training Loss per 500 steps: 1.601691590975852
Training Accuracy per 500 steps: 54.115294901699436


3502it [06:41,  8.90it/s]

Training Loss per 500 steps: 1.5386672501469025
Training Accuracy per 500 steps: 55.75549842902028


4002it [07:38,  9.06it/s]

Training Loss per 500 steps: 1.5039984967566093
Training Accuracy per 500 steps: 56.635841039740065


4502it [08:36,  8.47it/s]

Training Loss per 500 steps: 1.4766145197223972
Training Accuracy per 500 steps: 57.25394356809598


5002it [09:34,  8.78it/s]

Training Loss per 500 steps: 1.449434447794083
Training Accuracy per 500 steps: 58.068386322735456


5489it [10:30,  8.71it/s]
1it [00:00,  8.82it/s]

The Total Accuracy for Epoch 0: 58.69921661504828
Training Loss Epoch: 1.423986596341733
Training Accuracy Epoch: 58.69921661504828
Training Loss per 500 steps: 0.6719022989273071
Training Accuracy per 500 steps: 100.0


502it [00:58,  8.84it/s]

Training Loss per 500 steps: 1.0208849784139238
Training Accuracy per 500 steps: 71.25748502994011


1002it [01:55,  8.86it/s]

Training Loss per 500 steps: 0.9691805191939096
Training Accuracy per 500 steps: 72.32767232767233


1502it [02:53,  8.36it/s]

Training Loss per 500 steps: 0.9549945043790825
Training Accuracy per 500 steps: 72.618254497002


2002it [03:51,  8.21it/s]

Training Loss per 500 steps: 0.9487617905254009
Training Accuracy per 500 steps: 72.76361819090455


2502it [04:48,  8.79it/s]

Training Loss per 500 steps: 0.9605420893112175
Training Accuracy per 500 steps: 72.05117952818873


3002it [05:45,  9.02it/s]

Training Loss per 500 steps: 0.9457450070676308
Training Accuracy per 500 steps: 72.14261912695768


3502it [06:43,  8.63it/s]

Training Loss per 500 steps: 0.9562263081514752
Training Accuracy per 500 steps: 71.89374464438731


4002it [07:40,  8.61it/s]

Training Loss per 500 steps: 0.9556686492560916
Training Accuracy per 500 steps: 71.65708572856786


4502it [08:37,  8.91it/s]

Training Loss per 500 steps: 0.9599269283252609
Training Accuracy per 500 steps: 71.49522328371474


5002it [09:34,  9.07it/s]

Training Loss per 500 steps: 0.9576256575901453
Training Accuracy per 500 steps: 71.56568686262747


5489it [10:31,  8.70it/s]
1it [00:00,  8.98it/s]

The Total Accuracy for Epoch 1: 71.8345782474039
Training Loss Epoch: 0.9488254298666744
Training Accuracy Epoch: 71.8345782474039
Training Loss per 500 steps: 0.18894918262958527
Training Accuracy per 500 steps: 100.0


502it [00:57,  8.79it/s]

Training Loss per 500 steps: 0.8213807470880881
Training Accuracy per 500 steps: 76.84630738522954


1002it [01:55,  8.63it/s]

Training Loss per 500 steps: 0.8189492940662669
Training Accuracy per 500 steps: 76.92307692307692


1502it [02:52,  8.98it/s]

Training Loss per 500 steps: 0.7878236593663082
Training Accuracy per 500 steps: 77.28181212524983


2002it [03:50,  8.99it/s]

Training Loss per 500 steps: 0.7895250529667985
Training Accuracy per 500 steps: 77.11144427786107


2502it [04:47,  9.03it/s]

Training Loss per 500 steps: 0.7750543003316458
Training Accuracy per 500 steps: 77.56897241103559


3002it [05:45,  8.71it/s]

Training Loss per 500 steps: 0.7622447488910957
Training Accuracy per 500 steps: 77.70743085638121


3502it [06:43,  9.06it/s]

Training Loss per 500 steps: 0.7599869849682637
Training Accuracy per 500 steps: 77.83490431305341


4002it [07:40,  8.75it/s]

Training Loss per 500 steps: 0.7526955410629912
Training Accuracy per 500 steps: 77.80554861284679


4502it [08:37,  8.17it/s]

Training Loss per 500 steps: 0.7588527450671448
Training Accuracy per 500 steps: 77.44945567651634


5002it [09:35,  8.69it/s]

Training Loss per 500 steps: 0.7651990653764587
Training Accuracy per 500 steps: 77.18456308738253


5489it [10:31,  8.70it/s]

The Total Accuracy for Epoch 2: 77.20896338130807
Training Loss Epoch: 0.7711411210868914
Training Accuracy Epoch: 77.20896338130807





In [None]:
# Persist the fine-tuned model next to the other "with_titles" artifacts.
# NOTE(review): passing the module itself (rather than model.state_dict())
# pickles the whole object, so loading this file later requires the Classifier
# class to be defined/importable under the same name.
torch.save(model, f"{ROOT_PATH}/Ordenado/with_titles/2020-11-09_roberta_original_with_titles.pt")

In [None]:
# Score the test split: collect true labels, top-1 predictions and the ranked
# class ids for every example. Works for any loader batch size.
model.eval()
tr_loss = 0
n_correct = 0
nb_tr_steps = 0
nb_tr_examples = 0
y_pred = []
y_true = []
y_all_probs = []
with torch.no_grad():
    for batch in testing_loader:
        label = batch["label"].to(device, dtype = torch.long)
        _, _, output = model(batch)
        loss = criterion(output, label)
        tr_loss += loss.item()
        nb_tr_steps += 1
        nb_tr_examples += label.size(0)
        prob = torch.nn.functional.softmax(output, dim=1)
        # Class ids of every output, most probable first, one row per example.
        _, all_ordered = prob.topk(prob.size(1), dim=1)
        big_idx = all_ordered[:, 0]
        n_correct += (big_idx == label).sum().item()
        y_true.extend(label.cpu().tolist())
        y_pred.extend(big_idx.cpu().tolist())
        # As before, the last-ranked class is dropped from the stored ranking.
        # NOTE(review): presumably because only 17 labels are real — confirm.
        y_all_probs.extend(all_ordered[:, :-1].cpu().tolist())

In [None]:
# Sanity check: testing_loader is built with shuffle=False, so the labels
# collected during evaluation should line up with the rows of `test`.
y_true == test["first_sdg"].tolist()

True

In [None]:
# A class that is never predicted has undefined precision. Report it as 0
# explicitly instead of relying on the default, which gives the same number
# but emits an UndefinedMetricWarning.
print(classification_report(y_true, y_pred, zero_division=0))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        13
           2       0.62      0.45      0.52        71
           3       0.90      0.89      0.90       771
           4       0.80      0.87      0.83       201
           5       0.73      0.80      0.77       122
           6       0.55      0.79      0.65        91
           7       0.58      0.76      0.66       109
           8       0.50      0.25      0.33        32
           9       0.64      0.32      0.42       136
          10       0.83      0.62      0.71        79
          11       0.70      0.73      0.71       137
          12       0.35      0.38      0.37       106
          13       0.58      0.50      0.54       116
          14       0.58      0.81      0.68        59
          15       0.63      0.66      0.65       149
          16       0.66      0.75      0.70       109
          17       0.34      0.27      0.30        45

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Switch both label lists to arrays so they can be compared element-wise.
y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)

# Positions (row offsets into the test split) where the top-1 prediction is wrong.
misclassified = np.flatnonzero(y_true != y_pred)

In [None]:
# Number of test rows whose predicted label differs from the true label.
len(misclassified)

657

In [None]:
# Inspect the first misclassified test row: its text, both labels and the
# runner-up class from the stored ranking.
first_row = test.iloc[misclassified[0]]
first_ranking = y_all_probs[misclassified[0]]
print(first_row["title"])
print(first_row["clean_abstract"])
print(f"True label: {first_row['first_sdg']}")
print(f"Predicted label: {y_pred[misclassified[0]]}")
print(f"Second most probable label: {first_ranking[1]}")

Fgf10 And Sox9 Are Essential For The
Establishment Of Distal Progenitor Cells During Mouse Salivary Gland Development
Salivary glands are formed by branching morphogenesis with epithelial progenitors forming a network of ducts and acini (secretory cells). During this process, epithelial progenitors specialise into distal (tips of the gland) and proximal (the stalk region) identities that produce the acini and higher order ducts, respectively. Little is known about the factors that regulate progenitor expansion and specialisation in the different parts of the gland. Here, we show that Sox9 is involved in establishing the identity of the distal compartment before the initiation of branching morphogenesis. Sox9 is expressed throughout the gland at the initiation stage before becoming restricted to the distal epithelium from the bud stage and throughout branching morphogenesis. Deletion of Sox9 in the epithelium results in loss of the distal epithelial progenitors, a reduction in prolifera

In [None]:
# Inspect the last misclassified test row. Every field is read through the
# same index, so the abstract belongs to the title printed above it (the
# abstract line previously used misclassified[0] by mistake).
last_index = misclassified[-1]
last_row = test.iloc[last_index]
print(last_row["title"])
print(last_row["clean_abstract"])
print(f"True label: {last_row['first_sdg']}")
print(f"Predicted label: {y_pred[last_index]}")
print(f"Second most probable label: {y_all_probs[last_index][1]}")

Program For The Development Of Attitudes Of Equality Of Gender In Classes Of Physical Education In Schools
Salivary glands are formed by branching morphogenesis with epithelial progenitors forming a network of ducts and acini (secretory cells). During this process, epithelial progenitors specialise into distal (tips of the gland) and proximal (the stalk region) identities that produce the acini and higher order ducts, respectively. Little is known about the factors that regulate progenitor expansion and specialisation in the different parts of the gland. Here, we show that Sox9 is involved in establishing the identity of the distal compartment before the initiation of branching morphogenesis. Sox9 is expressed throughout the gland at the initiation stage before becoming restricted to the distal epithelium from the bud stage and throughout branching morphogenesis. Deletion of Sox9 in the epithelium results in loss of the distal epithelial progenitors, a reduction in proliferation and a 

In [None]:
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py


def apk(actual, predicted, k=10):
    """
    Average precision at k for a single query.

    Only the first ``k`` entries of ``predicted`` are considered. Each entry
    that appears in ``actual`` and has not already occurred earlier in the
    ranking counts as a hit and contributes ``hits_so_far / rank``. The sum is
    divided by ``min(len(actual), k)``.

    Parameters
    ----------
    actual : list
             Relevant elements (order is irrelevant)
    predicted : list
                Ranked predictions (order matters)
    k : int, optional
        Cut-off: maximum number of predictions taken into account

    Returns
    -------
    score : double
            Average precision at k; 0.0 when ``actual`` is empty

    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    total = 0.0
    for rank, item in enumerate(top_k, start=1):
        if item not in actual:
            continue
        # A repeated prediction must not be rewarded twice.
        if item in top_k[:rank - 1]:
            continue
        hits += 1.0
        total += hits / rank

    if not actual:
        return 0.0

    return total / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over several queries.

    ``actual`` and ``predicted`` are walked in parallel; ``apk`` is evaluated
    for each pair and the results are averaged with ``np.mean``.

    Parameters
    ----------
    actual : list
             One list of relevant elements per query
             (order is irrelevant inside each list)
    predicted : list
                One ranked list of predictions per query
                (order matters inside each list)
    k : int, optional
        Cut-off: maximum number of predictions taken into account

    Returns
    -------
    score : double
            Mean of the per-query average precision at k

    """
    per_query = []
    for relevant, ranking in zip(actual, predicted):
        per_query.append(apk(relevant, ranking, k))
    return np.mean(per_query)

In [None]:
# Attach the model outputs to the test frame (this mutates `test` in place).
test["predicted"] = y_pred
# Despite the name, y_all_probs is filled from the *indices* returned by topk in
# the evaluation loop, i.e. ranked class ids rather than probability values.
test["all_probs"] = y_all_probs

In [None]:
# mapk expects one list of relevant items per row; here each row has exactly
# one: its true first_sdg label.
actual = [[true_label] for true_label in test["first_sdg"]]

In [None]:
# One ranked list of class ids per test row, in the same row order as `actual`.
predicted = test["all_probs"].tolist()

In [None]:
# Report mean average precision for the cut-offs k = 2, 3, 4 and 5.
for i in range(2, 6):
    print(f"MAP@{i} = {mapk(actual, predicted, i)}")

MAP@2 = 0.7794117647058824
MAP@3 = 0.795751633986928
MAP@4 = 0.8031045751633986
MAP@5 = 0.8085606706450695


In [None]:
def correct_label_not_in_k_top_predicted(row: pd.Series, k: int) -> bool:
    """Tell whether the true label of ``row`` is among its first ``k`` predictions.

    Note that the function name reads as the opposite of what is returned:
    the result is True when ``row["first_sdg"]`` IS found in the first ``k``
    entries of ``row["all_probs"]``, and False when it is missing.
    """
    top_k = row["all_probs"][:k]
    true_label = row["first_sdg"]
    return true_label in top_k

In [None]:
# For k = 2..5, flag each row whose true label is within its first k ranked
# predictions, then report how many rows are NOT covered.
for i in range(2, 6):
    column = f"sdg_in_{i}"
    test[column] = test.apply(correct_label_not_in_k_top_predicted, axis=1, args=(i,))
    missing = int(test[column].eq(False).sum())
    total = len(test)
    print(f"{missing}/{total} ({missing / total * 100}%) with correct label not into the first {i} labels predicted")

378/2346 (16.11253196930946%) with correct label not into the first 2 labels predicted
263/2346 (11.210571184995738%) with correct label not into the first 3 labels predicted
194/2346 (8.269394714407502%) with correct label not into the first 4 labels predicted
130/2346 (5.541346973572037%) with correct label not into the first 5 labels predicted


In [None]:
# Write the test frame, including the columns added above (predicted,
# all_probs, sdg_in_2 … sdg_in_5), as a list of JSON records; force_ascii=False
# keeps non-ASCII characters unescaped in the output file.
test.to_json(f"{ROOT_PATH}/Ordenado/with_titles/2020-11-11_roberta_test.json", orient="records", force_ascii=False)