In [1]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
import pandas as pd
import numpy as np
from datasets import load_metric
from datasets import Dataset
import torch
import transformers
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoModelForSequenceClassification,AutoTokenizer

2022-06-21 10:39:16.794917: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cluster/apps/gcc-6.3.0/openblas-0.2.20-cot3cawsqf4pkxjwzjexaykbwn2ch3ii/lib:/cluster/apps/nss/gcc-6.3.0/python/3.7.4/x86_64/lib64:/cluster/spack/apps/linux-centos7-x86_64/gcc-4.8.5/gcc-6.3.0-sqhtfh32p5gerbkvi5hih7cfvcpmewvj/lib64:/cluster/spack/apps/linux-centos7-x86_64/gcc-4.8.5/gcc-6.3.0-sqhtfh32p5gerbkvi5hih7cfvcpmewvj/lib:/cluster/apps/lsf/10.1/linux2.6-glibc2.3-x86_64/lib
2022-06-21 10:39:16.794953: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def compute_metrics(eval_pred):
    """Score one evaluation pass and return the results keyed by metric name.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict mapping "accuracy", "precision", "recall", "precision_w",
        "recall_w", "f1", "f1_pos", "f1_micro" and "f1_weighted" to the value
        returned by the matching scikit-learn scorer. The ``*_w`` and
        ``f1_weighted`` entries use ``average='weighted'``; ``f1_micro`` uses
        ``average='micro'``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predictions = np.argmax(logits, axis=-1)

    # Every scorer below is fed the same truth/prediction pair.
    scored_pair = {"y_true": labels, "y_pred": predictions}

    return {
        "accuracy": accuracy_score(**scored_pair),
        "precision": precision_score(**scored_pair),
        "recall": recall_score(**scored_pair),
        "precision_w": precision_score(average='weighted', **scored_pair),
        "recall_w": recall_score(average='weighted', **scored_pair),
        "f1": f1_score(**scored_pair),
        # NOTE(review): "f1_pos" spells out average='binary', pos_label=1; this
        # looks like it duplicates "f1" under scikit-learn's defaults - confirm.
        "f1_pos": f1_score(average='binary', pos_label=1, **scored_pair),
        "f1_micro": f1_score(average='micro', **scored_pair),
        "f1_weighted": f1_score(average='weighted', **scored_pair),
    }


In [3]:
def model_init():
    """Build a fresh "bert-base-cased" sequence classifier with two labels.

    The transformers seed is fixed to 42 before the model is created, so each
    call starts from the same seeded state.

    Returns:
        The object returned by ``AutoModelForSequenceClassification.from_pretrained``.
    """
    transformers.set_seed(42)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-cased",
        num_labels=2,
        return_dict=True,
    )
    return classifier

In [4]:
def train(tokenizer, dataset, output_dir="model2", train_fraction=0.8, seed=42,
          num_train_epochs=5, batch_size=16):
    """Tokenize `dataset`, split it into train/validation, and run a Trainer.

    The "text" column is tokenized (padded to the maximum length, truncated),
    the tokenized rows are randomly divided into a training and a validation
    subset, and a `Trainer` built from `model_init` is trained while being
    evaluated with `compute_metrics` once per epoch.

    Args:
        tokenizer: Callable applied to batches of ``examples["text"]``.
        dataset: Object with a ``.map(fn, batched=True)`` method and a length,
            holding a "text" column.
        output_dir: Directory handed to `TrainingArguments` for checkpoints.
        train_fraction: Share of rows assigned to the training subset; the
            remainder becomes the validation subset. Must lie strictly
            between 0 and 1.
        seed: Seed for the train/validation split and for `TrainingArguments`.
            (`model_init` sets its own seed independently.)
        num_train_epochs: Number of training epochs.
        batch_size: Per-device batch size for both training and evaluation.
            `auto_find_batch_size=True` is also set, so see the
            `TrainingArguments` documentation for how the training batch
            size may be adjusted at run time.

    Returns:
        The `Trainer` instance after `trainer.train()` has completed.

    Raises:
        ValueError: If `train_fraction` is not strictly between 0 and 1, or
            if the dataset is too small to yield any training row.
    """
    if not 0.0 < train_fraction < 1.0:
        raise ValueError(
            "train_fraction must be strictly between 0 and 1, got %r" % (train_fraction,)
        )

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Size the split from the object that is actually split, so the two
    # lengths always add up to what random_split receives.
    total_rows = len(tokenized_datasets)
    train_size = int(train_fraction * total_rows)
    val_size = total_rows - train_size
    if train_size == 0:
        raise ValueError(
            "dataset has %d row(s); too few to form a training subset" % total_rows
        )

    # A dedicated generator keeps the split reproducible without reseeding
    # torch's global RNG as a side effect of calling this function.
    split_generator = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset = random_split(
        tokenized_datasets, [train_size, val_size], generator=split_generator
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        seed=seed,
        num_train_epochs=num_train_epochs,
        auto_find_batch_size=True,
        report_to="none",
    )
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    return trainer

In [None]:
# NOTE(review): a later cell in this notebook defines `main()` again; keep a
# single definition so a re-run cannot silently pick up a stale copy.
def main(outputs_path="../../outputs/", model_dir="model_finetuned/"):
    """Fine-tune the classifier on the labeled training CSV and save the model.

    Reads ``sg_classifier/train_set_labeled.csv`` below `outputs_path`, keeps
    the "ID", "title_abstract" and "label" columns (exposing "title_abstract"
    under the name "text"), trains via `train`, and writes the resulting model
    to `model_dir`.

    Args:
        outputs_path: Directory prefix (with trailing separator) that contains
            the ``sg_classifier`` folder.
        model_dir: Destination passed to ``trainer.save_model``.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

    labeled = pd.read_csv(outputs_path + "sg_classifier/train_set_labeled.csv")
    # Select first, then rename: yields the columns ID, text, label in that
    # order without adding a temporary column to the frame that was read.
    examples = labeled.loc[:, ['ID', 'title_abstract', 'label']].rename(
        columns={'title_abstract': 'text'}
    )
    dataset = Dataset.from_pandas(examples)

    trainer = train(tokenizer, dataset)
    trainer.save_model(model_dir)


if __name__ == '__main__':
    main()

  0%|          | 0/36 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /cluster/home/fgonzalez/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file https://huggingface.co/bert-b

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 28151
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 8800
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, ID. If text, ID are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Precision W,Recall W,F1,F1 Pos,F1 Micro,F1 Weighted
1,0.1257,0.104254,0.974851,0.997636,0.705686,0.975454,0.974851,0.826641,0.826641,0.974851,0.972864


Saving model checkpoint to model2/checkpoint-500
Configuration saved in model2/checkpoint-500/config.json
Model weights saved in model2/checkpoint-500/pytorch_model.bin
Saving model checkpoint to model2/checkpoint-1000
Configuration saved in model2/checkpoint-1000/config.json
Model weights saved in model2/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to model2/checkpoint-1500
Configuration saved in model2/checkpoint-1500/config.json
Model weights saved in model2/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 7038
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, ID. If text, ID are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


In [5]:
# NOTE(review): an earlier cell in this notebook already defines `main()`;
# this definition replaces it when the cell runs. Keep only one of the two.
def main(outputs_path="../../outputs/", model_dir="model_finetuned/"):
    """Fine-tune the classifier on the labeled training CSV and save the model.

    Reads ``sg_classifier/train_set_labeled.csv`` below `outputs_path`, keeps
    the "ID", "title_abstract" and "label" columns (exposing "title_abstract"
    under the name "text"), trains via `train`, and writes the resulting model
    to `model_dir`.

    Args:
        outputs_path: Directory prefix (with trailing separator) that contains
            the ``sg_classifier`` folder.
        model_dir: Destination passed to ``trainer.save_model``.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

    labeled = pd.read_csv(outputs_path + "sg_classifier/train_set_labeled.csv")
    # Select first, then rename: yields the columns ID, text, label in that
    # order without adding a temporary column to the frame that was read.
    examples = labeled.loc[:, ['ID', 'title_abstract', 'label']].rename(
        columns={'title_abstract': 'text'}
    )
    dataset = Dataset.from_pandas(examples)

    trainer = train(tokenizer, dataset)
    trainer.save_model(model_dir)


if __name__ == '__main__':
    main()

  0%|          | 0/36 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /cluster/home/fgonzalez/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file https://huggingface.co/bert-b

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /cluster/home/fgonzalez/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file https://huggingface.co/bert-b

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2022-06-21 03:10:29.261932: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cluster/apps/gcc-6.3.0/openblas-0.2.20-cot3cawsqf4pkxjwzjexaykbwn2ch3ii/lib:/cluster/apps/nss/gcc-6.3.0/python/3.7.4/x86_64/lib64:/cluster/spack/apps/linux-centos7-x86_64/gcc-4.8.5/gcc-6.3.0-sqhtfh32p5gerbkvi5hih7cfvcpmewvj/lib64:/cluster/spack/apps/linux-centos7-x86_64/gcc-4.8.5/gcc-6.3.0-sqhtfh32p5gerbkvi5hih7cfvcpmewvj/lib:/cluster/apps/lsf/10.1/linux2.6-glibc2.3-x86_64/lib
2022-06-21 03:10:29.261974: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, ID. If text, ID are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Precision W,Recall W,F1,F1 Pos,F1 Micro,F1 Weighted
1,0.1272,0.095379,0.980108,0.961694,0.797659,0.979821,0.980108,0.872029,0.872029,0.980108,0.979259
2,0.0753,0.07246,0.983376,0.965184,0.834448,0.98315,0.983376,0.895067,0.895067,0.983376,0.982824
3,0.0645,0.068435,0.985223,0.962547,0.859532,0.985,0.985223,0.908127,0.908127,0.985223,0.984842
4,0.0422,0.069009,0.98636,0.95471,0.881271,0.986135,0.98636,0.916522,0.916522,0.98636,0.986111
5,0.0361,0.064048,0.987781,0.952297,0.901338,0.987605,0.987781,0.926117,0.926117,0.987781,0.987628


Saving model checkpoint to model/checkpoint-500
Configuration saved in model/checkpoint-500/config.json
Model weights saved in model/checkpoint-500/pytorch_model.bin
Saving model checkpoint to model/checkpoint-1000
Configuration saved in model/checkpoint-1000/config.json
Model weights saved in model/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to model/checkpoint-1500
Configuration saved in model/checkpoint-1500/config.json
Model weights saved in model/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 7038
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, ID. If text, ID are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to model/checkpoint-2000
Configuration saved in model/checkpoint-2000/config.json
Model weights saved in model/checkpoint-2000/pytorch_