# Hugging Face model card for DistilBERT uncased text classification model: ORO branch classification

This script uses the model configuration with the hyperparameters determined in the 05_Multilabel_1_oro_branch directory and fits it using the whole dataset. This differs from the model predictions obtained from the nested cross-validation script, which fits models on splits of the data to obtain a distribution of predictions. A single fit is used here because a model card can only hold one model. The purpose is therefore only to provide an approximation/example, knowing that the model will likely be overfit compared to the predictions presented in the paper. 

In [1]:
# Load modules
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import pandas as pd
#import os
#os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"
#os.environ["TF_CPP_VMODULE"]="gpu_process_state=10,gpu_cudamallocasync_allocator=10"
import tensorflow as tf
a = tf.zeros([], tf.float32)
import tensorflow_addons as tfa
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm
2024-03-12 15:39:53.844710: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-12 15:39:53.967765: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-12 15:39:54.493757: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-03-12 15:39:54.493809: W tensorflow/compiler/xla/s

## Get the best model parameters determined from the model selection CV

In [2]:
# Pretrained checkpoint to fine-tune; its tokenizer is loaded from the same name
MODEL_NAME = "distilbert-base-uncased"

tokenizer = DistilBertTokenizer.from_pretrained(
    MODEL_NAME
)


In [3]:
# Gather the model-selection results of every fold into one flat list of records
params = ['batch_size','weight_decay','learning_rate','num_epochs','class_weight']
inner_scores = []

for fold in range(3):
    fold_csv = f'/home/dveytia/ORO-map-relevance/outputs/model_selection/oro_branch_model_selection_{fold}.csv'
    fold_scores = pd.read_csv(fold_csv)
    # Order this fold's candidates by macro F1, best first
    fold_scores = fold_scores.sort_values('F1 macro', ascending=False).reset_index(drop=True)
    inner_scores.extend(fold_scores.to_dict('records'))

In [4]:
# Pool the records of all model-selection folds and pick the best parameter combination.
# Missing values are replaced by -1 (this is why 'class_weight' can come out as -1).
inner_scores = pd.DataFrame.from_dict(inner_scores).fillna(-1)

# 'F1 - tp' = row-wise mean over every column whose name starts with 'F1 -'
f1_label_cols = [col for col in inner_scores.columns if col.startswith('F1 -')]
inner_scores['F1 - tp'] = inner_scores.loc[:, f1_label_cols].mean(axis=1)

# Average the score per parameter combination and rank, best first
# (original author's note: this is the same as grouping through F1-macro)
ranked_params = (
    inner_scores
    .groupby(params)['F1 - tp']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
best_model_params = ranked_params.to_dict('records')[0]

# Keep only the hyperparameters, not the score they were ranked by
del best_model_params['F1 - tp']
print(best_model_params)

#if best_model_params['class_weight']==-1:
#    best_model_params['class_weight']=None
#else:
#    best_model_params['class_weight'] = ast.literal_eval(best_model_params['class_weight'])


{'batch_size': 16, 'weight_decay': 0.0, 'learning_rate': 1e-05, 'num_epochs': 3, 'class_weight': -1}


## Using the best parameters, fit the model on the full seen dataset

In [5]:
## Load the coded oro_branch variables and the screening decisions (the 'seen' data)

codedVariablesTxt = '/home/dveytia/ORO-map-relevance/data/seen/all-coding-format-distilBERT-simplifiedMore.txt'
screenDecisionsTxt = '/home/dveytia/ORO-map-relevance/data/seen/all-screen-results_screenExcl-codeIncl.txt'

# Both files are tab-separated; 'analysis_id' is renamed to 'id' so the two tables share a key
df = pd.read_csv(codedVariablesTxt, sep='\t').rename(columns={'analysis_id': 'id'})

screendf = (
    pd.read_csv(screenDecisionsTxt, sep='\t')
    .query('include_screen==1')  # keep only rows with include_screen equal to 1
    .rename(columns={'include_screen': 'relevant', 'analysis_id': 'id'})
)

# Attach each record's 'sample_screen' value; the left join keeps every coded record
df = df.merge(screendf[['id', 'sample_screen']], how='left', on='id')

def map_values(x):
    """Map a `sample_screen` value to a random-sample indicator.

    Parameters
    ----------
    x : value taken from the 'sample_screen' column.

    Returns
    -------
    1 if `x` is "random"; 0 if `x` is "relevance sort", "test list" or
    "supplemental coding"; float NaN for any other value (including a
    missing `sample_screen`).
    """
    if x == "random":
        return 1
    if x in ("relevance sort", "test list", "supplemental coding"):
        return 0
    # Return a real missing value rather than the string "NaN", so that
    # pd.isna() detects it and the resulting column stays numeric.
    return float("nan")

# Flag which records came from the random sample (1) versus the other sampling strategies (0)
df['random_sample'] = df['sample_screen'].apply(map_values)

# Shuffle the rows reproducibly: sort by id first so the fixed seed always yields the same order
df_sorted = df.sort_values('id')
df = df_sorted.sample(frac=1, random_state=1).reset_index(drop=True)

# Model input text: "<title>. <abstract> Keywords: <keywords>"
title_and_abstract = df['title'] + ". " + df['abstract']
df['text'] = title_and_abstract + " " + "Keywords: " + df["keywords"]


def _text_or_fallback(row):
    """Return the row's text, or "<title>. <abstract>" when the text is missing."""
    if pd.isna(row['text']):
        return row['title'] + ". " + row['abstract']
    return row['text']


# Rows whose concatenation came out missing fall back to title + abstract only
df['text'] = df.apply(_text_or_fallback, axis=1)


In [6]:
## Build the multi-label target: one indicator vector per record
# Target columns are selected by name; replace "oro_branch" (e.g. with "data_type") to model another variable
targets = [col for col in df.columns if "oro_branch" in col]
print(targets)

# Store each row's target values as a single array in the 'labels' column
df['labels'] = [row for row in df[targets].values]
df['labels'].head()

['oro_branch.Mitigation', 'oro_branch.Nature', 'oro_branch.Societal']


0    [0, 0, 1]
1    [1, 0, 0]
2    [1, 0, 0]
3    [0, 1, 0]
4    [0, 0, 1]
Name: labels, dtype: object

In [7]:
## Convert pandas data frame to Dataset
## separate into training (non-randomly sampled) and testing (randomly sampled)

from datasets import Dataset, DatasetDict

def tokenize_function(examples):
    """Tokenize the "text" field of `examples` with the notebook-level `tokenizer`.

    Uses fixed-length padding (`padding="max_length"`) with truncation enabled.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Split on the random-sample flag: 0 -> training rows, 1 -> evaluation rows
dataset_columns = ['text', 'labels', 'random_sample']
train_rows = df.loc[df['random_sample'] == 0, dataset_columns]
eval_rows = df.loc[df['random_sample'] == 1, dataset_columns]

train_datasets = Dataset.from_pandas(train_rows)
eval_datasets = Dataset.from_pandas(eval_rows)

# Tokenize both splits in batches
train_tokenized = train_datasets.map(tokenize_function, batched=True)
eval_tokenized = eval_datasets.map(tokenize_function, batched=True)

Map: 100%|████████████████████████████| 529/529 [00:02<00:00, 199.25 examples/s]
Map: 100%|████████████████████████████| 427/427 [00:02<00:00, 204.02 examples/s]


In [8]:
# Turn the tokenized Datasets into batched tf.data pipelines via tf.data.Dataset.from_tensor_slices
def _as_tf_batches(tokenized, batch_size=2):
    """Return (tensor-formatted dataset, feature dict, shuffled + batched tf.data.Dataset).

    The batch size is deliberately small (2) rather than the value from model
    selection; the original author's note on it is "reduce batch size".
    """
    formatted = tokenized.remove_columns(["text"]).with_format("tensorflow")
    # Keep only the inputs the tokenizer declares as model inputs
    features = {name: formatted[name] for name in tokenizer.model_input_names}
    slices = tf.data.Dataset.from_tensor_slices((features, formatted['labels']))
    # Shuffle buffer spans the whole dataset
    batches = slices.shuffle(len(formatted)).batch(batch_size)
    return formatted, features, batches


full_train_dataset = train_tokenized
full_eval_dataset = eval_tokenized

tf_train_dataset, train_features, train_tf_dataset = _as_tf_batches(full_train_dataset)
tf_eval_dataset, eval_features, eval_tf_dataset = _as_tf_batches(full_eval_dataset)


2024-03-12 15:40:02.979120: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:306] New Peak memory usage of 2167820 bytes.
2024-03-12 15:40:02.979149: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:315] gpu_async_0 Allocated 2166784 at 0x302000a00
2024-03-12 15:40:02.981729: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:306] New Peak memory usage of 4334604 bytes.
2024-03-12 15:40:02.981748: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:315] gpu_async_0 Allocated 2166784 at 0x302211a00
2024-03-12 15:40:02.983327: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:306] New Peak memory usage of 4347300 bytes.
2024-03-12 15:40:02.983343: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:315] gpu_async_0 Allocated 12696 at 0x302422a00
2024-03-12 15:40:02.994264: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:306] New Peak memory usage of 4

In [9]:
# Build, compile and train the model with the hyperparameters chosen during model selection

# Label mappings stored in the model config. label2id is derived from id2label so the
# two mappings published with the model cannot disagree.
id2label = {0: 'Mitigation', 1: 'Natural', 2: 'Societal'}
label2id = {label: idx for idx, label in id2label.items()}
num_labels = len(id2label)  # three oro branch labels

# Reuse MODEL_NAME so the model and the tokenizer always come from the same checkpoint
model = TFDistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

optimizer = tfa.optimizers.AdamW(
    learning_rate=best_model_params['learning_rate'],
    weight_decay=best_model_params['weight_decay'],
)
# Multi-label targets: binary cross-entropy computed on the raw logits
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics
)

# Fit model using training and evaluation datasets.
# The result is kept in `history` (the original line ended in a stray comma, which
# wrapped the return value in a 1-tuple).
history = model.fit(
    train_tf_dataset,
    validation_data=eval_tf_dataset,
    epochs=best_model_params['num_epochs'],
)

2024-03-12 15:40:03.205609: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:306] New Peak memory usage of 7855573 bytes.
2024-03-12 15:40:03.205628: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:315] gpu_async_0 Allocated 8 at 0x30277f000
2024-03-12 15:40:03.205946: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:306] New Peak memory usage of 7855581 bytes.
2024-03-12 15:40:03.205960: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:315] gpu_async_0 Allocated 8 at 0x30277f200
2024-03-12 15:40:03.206915: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:306] New Peak memory usage of 7855585 bytes.
2024-03-12 15:40:03.206930: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:315] gpu_async_0 Allocated 4 at 0x30277f400
2024-03-12 15:40:03.207056: I tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:306] New Peak memory usage of 7855589 bytes.
20

ResourceExhaustedError: Exception encountered when calling layer 'ffn' (type TFFFN).

{{function_node __wrapped__AddV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} failed to allocate memory [Op:AddV2]

Call arguments received by layer 'ffn' (type TFFFN):
  • input=tf.Tensor(shape=(1, 2, 768), dtype=float32)
  • training=False

In [None]:
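#model.push_to_hub("distilbert_ORO_Branch", use_auth_token=os.environ["HF_TOKEN"])  # token redacted: read it from the environment, never commit it; the token previously hard-coded here should be revoked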