# Train deep learning sentiment model

## Check setup

In [1]:
import torch

# At least 1 gpu is needed for this to run reasonably quickly.
print("PyTorch version:", torch.__version__)
print("GPU available:", torch.cuda.is_available())
print("GPU device count:", torch.cuda.device_count())
print("CUDA version:", torch.version.cuda)

PyTorch version: 1.10.1
GPU available: True
GPU device count: 2
CUDA version: 11.3


## Import labels

In [2]:
import pyprojroot
data_file = pyprojroot.here() / "data-raw/sentence-labels.csv"
import os
print("Found labels file:", os.path.exists(data_file))

import pandas as pd
df = pd.read_csv(data_file)
df.info()

from collections import Counter
print(Counter(df['overall-sentiment']))

# We have a few nan values here.
df.dropna(subset = ['overall-sentiment'], inplace = True)

print(Counter(df['overall-sentiment']))

Found labels file: True
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1499 entries, 0 to 1498
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               1499 non-null   object
 1   aspect             1387 non-null   object
 2   overall-sentiment  1493 non-null   object
 3   aspect1-sentiment  1386 non-null   object
 4   aspect2-sentiment  556 non-null    object
 5   aspect3-sentiment  207 non-null    object
 6   aspect4-sentiment  96 non-null     object
 7   aspect5-sentiment  45 non-null     object
dtypes: object(8)
memory usage: 93.8+ KB
Counter({'Positive': 649, 'Negative': 540, 'Neutral': 232, 'Very Positive': 45, 'Very Negative': 27, nan: 6})
Counter({'Positive': 649, 'Negative': 540, 'Neutral': 232, 'Very Positive': 45, 'Very Negative': 27})


## Create training/test splits

In [3]:
# Create training and test splits.

import numpy as np
np.random.seed(1)

shuffle = np.random.permutation(df.index)
n_train = int(len(df) * 0.8)
df['split'] = ""
# use 'n_train' samples for training and the rest for testing
train_ids = shuffle[:n_train]
test_ids = shuffle[n_train:]

df.loc[train_ids, "split"] = "train"
df.loc[test_ids, "split"] = "test"

df['split'] = df['split'].astype("category")

from collections import Counter
print(Counter(df['split']))

Counter({'train': 1194, 'test': 299})


## Export training/test files

In [4]:
# Export into files.
data_dir = pyprojroot.here() / "data"
train_file = data_dir / "labels-train.csv"
test_file = data_dir / "labels-test.csv"

df2 = df.rename(columns = {'overall-sentiment': 'labels'})
# We only need to export these two columns.
columns = ['text', 'labels']

df2.loc[df2['split'] == 'train', columns].to_csv(train_file, index = False)
df2.loc[df2['split'] == 'test', columns].to_csv(test_file, index = False)

## Load dataset

In [5]:
from datasets import load_dataset
import datasets

# Via https://huggingface.co/docs/datasets/loading_datasets.html#from-local-or-remote-files
features = datasets.Features({
                          'text': datasets.Value('string'),
                          'labels': datasets.Value('string')
                      })

dataset = load_dataset('csv',
                       data_files = {
                           'train': str(train_file),
                           'test': str(test_file)
                       },
                       skiprows = 1, # Otherwise it will treat the header as an observation.
                       column_names = ['text', 'labels'], # Not needed, but in case we have extra columns.
                       features = features)
print(dataset)

Using custom data configuration default-0ff93346614de202


Downloading and preparing dataset csv/default to /home/ck37/.cache/huggingface/datasets/csv/default-0ff93346614de202/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/ck37/.cache/huggingface/datasets/csv/default-0ff93346614de202/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 1194
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 299
    })
})


## Specify model and associated tokenizer

In [6]:
model_name = "microsoft/deberta-v3-large"

import transformers
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Transformers version:", transformers.__version__)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Transformers version: 4.16.2


## Tokenize dataset

In [7]:
%%time

# Via https://discuss.huggingface.co/t/converting-string-label-to-int/2816/2

class_names = ['Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive']
labels = datasets.ClassLabel(names = class_names)
labels

def tokenize(batch):
    tokens = tokenizer(batch['text'], truncation = True, padding = True, max_length = 256)
    result = labels.str2int(batch['labels'])
    tokens['labels'] = result
    return tokens

tokenized_datasets = dataset.map(tokenize, batched = True, num_proc = 2)

# Remove any extra columns to avoid a warning when training, not essential though.
tokenized_datasets = tokenized_datasets.remove_columns(['text'])
tokenized_datasets.set_format('torch')

CPU times: user 1.79 s, sys: 32.2 ms, total: 1.82 s
Wall time: 3.03 s


## Load pretrained model

In [8]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# TODO: make this quieter.
# num_labels appears to really mean num_classes
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           problem_type = "single_label_classification",
                                                           num_labels = len(class_names))

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

## Train model

This will automatically use any GPUs, provided that CUDA and pytorch are installed correctly.

In [9]:
%%time

from transformers import TrainingArguments

# Via https://www.thepythoncode.com/code/finetuning-bert-using-huggingface-transformers-python
training_args = \
    TrainingArguments(
                      evaluation_strategy = "steps",
                      warmup_steps = 100,
                      num_train_epochs = 5, # Only need ~350 steps to converge using default LR, warmup of 100, etc.
                      output_dir = './models/results', 
                      logging_dir = './models/logs',
                      #weight_decay = 0.0000001,
                      #learning_rate = 0.01,
                      #learning_rate = 1e-2,
                      load_best_model_at_end = True, # load the best model when finished training (default metric is loss)
                      logging_steps = 25)

import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions = predictions, references = labels)

from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['test'],
    compute_metrics = compute_metrics,
)

trainer.train()

***** Running training *****
  Num examples = 1194
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 375


Step,Training Loss,Validation Loss,Accuracy
25,1.4617,1.218407,0.434783
50,1.2825,1.144751,0.458194
75,1.1684,1.131506,0.48495
100,1.1173,0.944533,0.598662
125,0.919,0.842348,0.712375
150,0.9109,0.79078,0.732441
175,0.8319,0.815294,0.742475
200,0.7038,0.728635,0.769231
225,0.7711,0.676031,0.752508
250,0.5614,0.694619,0.769231


***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evaluation *****
  Num examples = 299
  Batch size = 16
***** Running Evalua

CPU times: user 4min 59s, sys: 35.4 s, total: 5min 35s
Wall time: 3min 28s


TrainOutput(global_step=375, training_loss=0.8194707260131836, metrics={'train_runtime': 205.1923, 'train_samples_per_second': 29.095, 'train_steps_per_second': 1.828, 'total_flos': 2781862097034240.0, 'train_loss': 0.8194707260131836, 'epoch': 5.0})

In [10]:
# Evaluate the current model after training
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 299
  Batch size = 16


{'eval_loss': 0.7691320776939392,
 'eval_accuracy': 0.782608695652174,
 'eval_runtime': 2.7505,
 'eval_samples_per_second': 108.708,
 'eval_steps_per_second': 6.908,
 'epoch': 5.0}

## Save model

In [11]:
# Save model and tokenizer
model_path = str(pyprojroot.here() / "models/deberta-v3")
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# Also save trainer for future usage.
#trainer.save_model(model_path)
#trainer.save_state()

Configuration saved in /home/ck37/projects/clinical-sentiment-keywords/models/deberta-v3/config.json
Model weights saved in /home/ck37/projects/clinical-sentiment-keywords/models/deberta-v3/pytorch_model.bin
tokenizer config file saved in /home/ck37/projects/clinical-sentiment-keywords/models/deberta-v3/tokenizer_config.json
Special tokens file saved in /home/ck37/projects/clinical-sentiment-keywords/models/deberta-v3/special_tokens_map.json
added tokens file saved in /home/ck37/projects/clinical-sentiment-keywords/models/deberta-v3/added_tokens.json


('/home/ck37/projects/clinical-sentiment-keywords/models/deberta-v3/tokenizer_config.json',
 '/home/ck37/projects/clinical-sentiment-keywords/models/deberta-v3/special_tokens_map.json',
 '/home/ck37/projects/clinical-sentiment-keywords/models/deberta-v3/spm.model',
 '/home/ck37/projects/clinical-sentiment-keywords/models/deberta-v3/added_tokens.json')

## Predict on new data

In [12]:
# Text should be a single string, not a vector currently.
def predict_sentiment(text, max_length = 512):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return class_names[probs.argmax()]

In [13]:
print(predict_sentiment("This is a test sentence."))
print(predict_sentiment("I'm worried that the patient is doing poorly."))
print(predict_sentiment("I'm extremely worried that the patient is doing terribly and will certainly die soon."))
print(predict_sentiment("Patient's bp is normalizing, and kidney function appears to be improving."))

Neutral
Negative
Negative
Positive


In [14]:
%%time

# Apply to full dataframe - takes 39 seconds, uses 1 GPU.
preds = df.text.apply(predict_sentiment)

CPU times: user 1min 5s, sys: 92.9 ms, total: 1min 5s
Wall time: 39.3 s


## Save predictions

In [15]:
df_preds = df

In [16]:
df_preds['predicted'] = preds

In [17]:
df_preds.to_excel("data/predicted-sentiment.xlsx")