## Imports and configs

In [3]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from torch.utils.data import Dataset, DataLoader
import torch

In [4]:
# Central place for every tunable used in this notebook.
config = dict(
    model_name='google/bigbird-roberta-base',  # From Huggingface's ModelHub.
    model_save_path='./model/',
    model_chkpt_path='./model/model_chkpt/',
    max_length=1024,
    train_batch_size=4,
    valid_batch_size=4,
    epochs=10,
    learning_rates=[2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
    max_grad_norm=10,
)

# Pick the compute device in order of preference: CUDA, then Apple Metal (MPS), then CPU.
if torch.cuda.is_available():
    config['device'] = 'cuda'
elif torch.backends.mps.is_available():
    config['device'] = 'mps'
else:
    config['device'] = 'cpu'

print(f"Training on {config['device']}")

Training on cuda


In [3]:
# Load the preprocessed essays (columns `content` and `labels`, as shown in the preview below).
# NOTE(review): the file has a .csv extension but is read with pd.read_pickle, so it is
# presumably a pickled DataFrame despite its name -- confirm against preprocessing.ipynb.
# Unpickling can execute arbitrary code; only load files produced by a trusted source.
full_df = pd.read_pickle('../dataset.csv')
full_df.head()

Unnamed: 0_level_0,content,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1
73DC1D49FAD5,eletoral college can be a very good thing caus...,"[3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
D840AC3957E5,"STUDENT_NAME\n\nADDRESS_NAME\n\nFebruary 22, 2...","[0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, ..."
753E320B186B,In my opinion as a student: I don't agree at t...,"[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
C2ABDAC2BC2C,When it comes to at home learning and attendin...,"[3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
B2DDBAAC084C,Y\n\nou can ask many different people for advi...,"[3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."


## Labels

Because we have 15k+ essays, each with hundreds of labeled words, we store the labels as integers rather than strings to drastically decrease disk and memory usage (see `preprocessing.ipynb` for how this was done). Additionally, we would have needed to do this for training regardless to build the one-hot arrays. These can be easily converted to/from their original string labels with the dictionaries below.

In [4]:
# Integer id -> tag name. The position in this sequence *is* the integer id, so the
# order must not change: id 0 is the "no annotation" tag, then a B-/I- pair per
# discourse element type.
id2label = dict(enumerate((
    'Unnanotated',
    'B-Lead',
    'I-Lead',
    'B-Position',
    'I-Position',
    'B-Evidence',
    'I-Evidence',
    'B-Claim',
    'I-Claim',
    'B-Concluding_Statement',
    'I-Concluding_Statement',
    'B-Counterclaim',
    'I-Counterclaim',
    'B-Rebuttal',
    'I-Rebuttal',
)))

# Tag name -> integer id: the exact inverse of `id2label`.
label2id = {label_name: label_id for label_id, label_name in id2label.items()}

## Dataset

Since we're using the PyTorch backend, it is convenient to define a torch `Dataset` (and later a `DataLoader`), so that the model can easily consume the data without depending on how we chose to store it.

In [5]:
class EssayDataset(Dataset):
    """Torch dataset that tokenizes one essay per item.

    Each item is a dict of tensors built from the tokenizer output for the essay
    (for example `input_ids` and `attention_mask`), padded/truncated to `max_len`,
    plus exactly one of:

    * `labels`   (when `get_word_ids` is False): one label id per token, copied from
      the label of the word the token belongs to; tokens that belong to no word
      (special tokens, padding) get -100.
    * `word_ids` (when `get_word_ids` is True): for each token the index of the word
      it belongs to, or -1 for tokens that belong to no word.

    Args:
        df: DataFrame with a `content` column (essay text) and, when
            `get_word_ids` is False, a `labels` column holding one label id per
            whitespace-separated word of `content`.
        tokenizer: callable tokenizer whose result exposes `word_ids()` and
            dict-style item access (a Hugging Face fast tokenizer does).
        max_len: length every encoded sequence is padded/truncated to.
        get_word_ids: False for training items, True for validation/inference items.
    """

    def __init__(self, df, tokenizer, max_len, get_word_ids):
        self.len = len(df)
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.get_word_ids = get_word_ids  # For validation.

    def __getitem__(self, index):
        # `index` is a position in [0, len): use .iloc so the lookup does not depend
        # on the DataFrame's index labels (e.g. a frame that was split but not re-indexed).
        essay_words = self.df['content'].iloc[index].split()  # Split essay by words before tokenizing.

        # Makes a dict-like encoding with keys such as input_ids and attention_mask.
        encoding = self.tokenizer(essay_words,
                                  is_split_into_words=True,  # Keeps the correspondence between words and their labels.
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        # word_ids() gives, for every token, the index of the word it came from
        # (several tokens share an index when a word is split into pieces) and None
        # for tokens added by the tokenizer. This is what lets word-level labels be
        # mapped onto tokens, which are not 1:1 with the original words.
        word_ids = encoding.word_ids()

        if self.get_word_ids:
            # Predicting: labels are not needed, only the token -> word mapping used
            # later to attribute predictions to words. None becomes -1 so the list
            # can be stored in an integer tensor.
            encoding['word_ids'] = [-1 if word_idx is None else word_idx for word_idx in word_ids]
        else:
            # Training: give every token the label of its word. -100 marks tokens
            # without a word; it is the default ignore_index of CrossEntropyLoss.
            word_labels = self.df['labels'].iloc[index]
            encoding['labels'] = [-100 if word_idx is None else word_labels[word_idx]
                                  for word_idx in word_ids]

        return {key: torch.as_tensor(value) for key, value in encoding.items()}

    def __len__(self):
        return self.len

### Create training and validation datasets + DataLoaders

We will use an 85%/15% split between the training and validation sets.

In [6]:
validation_split_size = 0.15 # Fraction of the essays held out for validation; the rest is used for training.
dataset_split_seed = 33 # Any seed, here for inter-run consistency (same split on every run).

#### Split dataset into training and validation

In [7]:
from sklearn.model_selection import train_test_split

# Hold out `validation_split_size` of the essays; the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(
    full_df,
    test_size=validation_split_size,
    random_state=dataset_split_seed,
)

# Keep only the text and label columns for training, and renumber both frames
# from 0 (the previous index is discarded).
train_df = train_df[['content', 'labels']].reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

print("train_df shape: ", train_df.shape)
print("val_df shape: ", val_df.shape)

train_df shape:  (13254, 2)
val_df shape:  (2340, 2)


#### Download/cache all models necessary

In [8]:
# Create the local model directory (including missing parents); no error if it already exists.
os.makedirs(config['model_save_path'], exist_ok=True)

# Download the tokenizer and store a local copy next to the model.
# The label maps are not tokenizer settings, so they are attached to the model config below.
AutoTokenizer.from_pretrained(config['model_name'], add_prefix_space=True).save_pretrained(config['model_save_path'])

# Size the classification head to our tag set and record the id <-> name maps in the
# saved config, so every checkpoint derived from it carries readable label names.
config_model = AutoConfig.from_pretrained(config['model_name'],
                                          num_labels=len(label2id),
                                          id2label=id2label,
                                          label2id=label2id)
config_model.save_pretrained(config['model_save_path'])

# Download the pretrained weights with the adjusted config and store them locally.
AutoModelForTokenClassification.from_pretrained(config['model_name'],
                                                config=config_model).save_pretrained(config['model_save_path'])

# The config is reloaded from disk when the model is initialized later.
del config_model

Some weights of BigBirdForTokenClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Initialize Dataset, Dataloader, and tokenizer

In [9]:
# Keyword arguments for the training DataLoader: batches are drawn in shuffled order.
train_params = dict(
    batch_size=config['train_batch_size'],
    shuffle=True,
    # num_workers=2,
)

# Keyword arguments for the validation DataLoader: original order is kept.
validation_params = dict(
    batch_size=config['valid_batch_size'],
    shuffle=False,
    # num_workers=2,
)

In [10]:
# Load the tokenizer from the local copy saved in the caching step.
tokenizer = AutoTokenizer.from_pretrained(config['model_save_path'])

# Training items carry per-token `labels` (get_word_ids=False).
training_set = EssayDataset(df=train_df, tokenizer=tokenizer, max_len=config['max_length'], get_word_ids=False)
training_loader = DataLoader(training_set, **train_params)

# Despite its name, `testing_set` wraps the validation split. Its items carry `word_ids`
# instead of `labels` (get_word_ids=True), so predictions can be mapped back to words.
testing_set = EssayDataset(df=val_df, tokenizer=tokenizer, max_len=config['max_length'], get_word_ids=True)
validation_loader = DataLoader(testing_set, **validation_params)

In [11]:
# Sanity check: per-token labels of the first training essay; -100 marks tokens that are ignored by the loss.
training_set[0]['labels'].view(-1)

tensor([-100,    1,    2,  ..., -100, -100, -100])

Optionally, save the DataLoaders so they can be reloaded quickly later. The cell below is commented out; uncomment it to enable saving.

In [12]:
# torch.save(training_loader, 'training_loader.pth')
# torch.save(validation_loader, 'validation_loader.pth')

### Initialize all necessary models/objects

In [13]:
# Load config and weights from the local model directory written by the caching step.
# Passing the directory (rather than paths to individual files inside it) is the
# documented form and lets the library pick the right config/weights files itself.
config_model = AutoConfig.from_pretrained(config['model_save_path'])
model = AutoModelForTokenClassification.from_pretrained(
                   config['model_save_path'], config=config_model).to(config['device'])

# NOTE(review): only the first entry of config['learning_rates'] is used here; nothing
# in this cell applies the remaining entries.
optimizer = torch.optim.Adam(params=model.parameters(), lr=config['learning_rates'][0])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

### Training loop

In [14]:
from sklearn.metrics import accuracy_score

def train_one_epoch(epoch, verbose=False):
    """Run one pass over `training_loader`, updating the global `model` in place.

    Relies on the notebook-level globals `model`, `optimizer`, `training_loader`
    and `config`. Prints the mean per-batch loss and the mean per-batch token
    accuracy when the pass is complete.

    Args:
        epoch: index of the current epoch. Accepted for the caller's convenience;
            it is not used inside this function.
        verbose: when True, print the running mean loss every 100 batches.

    Returns:
        None.
    """
    # Batches can differ in size (the last one may be smaller), so count steps
    # explicitly and average the per-batch metrics over the number of steps.
    epoch_loss = 0
    epoch_accuracy = 0
    num_train_steps = 0

    # Set model to training mode (torch).
    model.train()

    for idx, batch in tqdm(enumerate(training_loader), total=len(training_loader)): # TODO: Customize tqdm for better displaying.

        # Each batch has 3 keys: input_ids and attention_mask (made by the tokenizer),
        # and labels (added by the Dataset, since this is a training run).
        input_ids = batch['input_ids'].to(config['device'], dtype=torch.long)
        attention_mask = batch['attention_mask'].to(config['device'], dtype=torch.long)
        labels = batch['labels'].to(config['device'], dtype=torch.long)

        # Run batch through model; with return_dict=False and labels given, the
        # first two outputs are the loss and the logits.
        loss, train_logits = model(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   labels=labels,
                                   return_dict=False)

        # Increment epoch loss by this batch's loss.
        epoch_loss += loss.item()
        num_train_steps += 1

        # Debugging.
        if verbose and (idx % 100 == 0):
            print(f"Idx: {idx:04d}, step loss: {epoch_loss/num_train_steps}")

        # Compute training accuracy. This is bookkeeping only, so keep it out of autograd.
        with torch.no_grad():
            # (batch_size, seq_length, num_labels) -> (batch_size * seq_length, num_labels)
            flattened_logits = train_logits.view(-1, model.num_labels)
            flattened_predictions = torch.argmax(flattened_logits, axis=1) # Find predicted label.
            flattened_labels = labels.view(-1)
            # Tokens labelled -100 are special/padding tokens and must not count.
            label_mask = flattened_labels != -100
            active_labels = torch.masked_select(input=flattened_labels, mask=label_mask)
            active_predictions = torch.masked_select(input=flattened_predictions, mask=label_mask)

        # Use accuracy function from sklearn.
        epoch_accuracy += accuracy_score(active_labels.cpu().numpy(), active_predictions.cpu().numpy())

        # Optimize. Order matters: clear old gradients, compute this batch's
        # gradients, clip *those* gradients, then take the step. Clipping before
        # backward() would act on stale gradients that zero_grad() then discards.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=config['max_grad_norm'])
        optimizer.step()

        # Model and dataset are pretty big, so release unused cached GPU memory at every batch.
        if config['device'] == 'cuda':
            torch.cuda.empty_cache()

    # Normalize loss and accuracy by num steps.
    epoch_loss /= num_train_steps
    epoch_accuracy /= num_train_steps

    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {epoch_accuracy}")

#### Finally, train model

In [15]:
import time
from safetensors.torch import save_model

# Create the checkpoint directory (including missing parents); no error if it already exists.
os.makedirs(config['model_chkpt_path'], exist_ok=True)

# One timestamp per run, shared by all of its checkpoints.
# NOTE(review): the ':' characters end up in directory names, which some file systems
# (e.g. on Windows) reject; the format is kept so existing checkpoint paths stay valid.
run_id = time.strftime("%Y%m%d-%H:%M:%S")

for epoch in range(config['epochs']):
    train_one_epoch(epoch)
    torch.cuda.empty_cache()

    # The checkpoint name records the hyperparameters of the run plus the epoch index.
    chkpt_filename = f"{config['model_chkpt_path']}{config['model_name'].replace('/', '-')}-time={run_id}-maxlen={config['max_length']}-batchsize={config['train_batch_size']}-lr={str(config['learning_rates'])}-maxgrad={config['max_grad_norm']}-epoch={epoch}"

    # Save a full checkpoint after every epoch so any of them can be reloaded with from_pretrained.
    model.save_pretrained(chkpt_filename)

  0%|          | 0/3314 [00:00<?, ?it/s]

100%|██████████| 3314/3314 [37:39<00:00,  1.47it/s]


Training loss epoch: 0.765188396206338
Training accuracy epoch: 0.7502558406782374


100%|██████████| 3314/3314 [36:58<00:00,  1.49it/s]


Training loss epoch: 0.5831361591240406
Training accuracy epoch: 0.7997284147194624


100%|██████████| 3314/3314 [37:06<00:00,  1.49it/s]


Training loss epoch: 0.4913203148783944
Training accuracy epoch: 0.8287917455815275


100%|██████████| 3314/3314 [37:08<00:00,  1.49it/s]


Training loss epoch: 0.4055954908944248
Training accuracy epoch: 0.8577272066393199


100%|██████████| 3314/3314 [38:09<00:00,  1.45it/s]


Training loss epoch: 0.33581863592817757
Training accuracy epoch: 0.8820413205741073


100%|██████████| 3314/3314 [40:01<00:00,  1.38it/s]


Training loss epoch: 0.26749340302154073
Training accuracy epoch: 0.9059907922725198


100%|██████████| 3314/3314 [37:31<00:00,  1.47it/s]


Training loss epoch: 0.22602375974124392
Training accuracy epoch: 0.9208606919560415


100%|██████████| 3314/3314 [37:30<00:00,  1.47it/s]


Training loss epoch: 0.18980322166148733
Training accuracy epoch: 0.9347972470033142


100%|██████████| 3314/3314 [38:08<00:00,  1.45it/s]


Training loss epoch: 0.16474826169955079
Training accuracy epoch: 0.9436050707252219


100%|██████████| 3314/3314 [37:33<00:00,  1.47it/s]


Training loss epoch: 0.1467643977034956
Training accuracy epoch: 0.950168642721515


In [1]:
# Release cached GPU memory. This needs `torch` to be imported already: the recorded
# NameError below comes from running this cell in a fresh kernel before any import cell.
torch.cuda.empty_cache()

NameError: name 'torch' is not defined

In [19]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Same settings as the training section, repeated so this part runs in a fresh kernel.
config = {'model_name': 'google/bigbird-roberta-base', # From Huggingface's ModelHub.
          'model_save_path': './model/',
          'model_chkpt_path': './model/model_chkpt/',
          'max_length': 1024,
          'train_batch_size': 4,
          'valid_batch_size': 4,
          'epochs':10,
          'learning_rates': [2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
          'max_grad_norm': 10,
          'device': 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'}

# Directory name of the checkpoint to load (last epoch of the 2023-12-14 run).
# Change this to the name of a checkpoint produced by your own training run.
checkpoint_name = 'google-bigbird-roberta-base-time=20231214-16:41:38-maxlen=1024-batchsize=4-lr=[2.5e-05, 2.5e-05, 2.5e-06, 2.5e-06, 2.5e-07]-maxgrad=10-epoch=9'

tokenizer = AutoTokenizer.from_pretrained(config['model_save_path'])

# Resolve the checkpoint relative to the configured checkpoint directory instead of a
# machine-specific absolute path, so the notebook works from any checkout location.
model = AutoModelForTokenClassification.from_pretrained(f"{config['model_chkpt_path']}{checkpoint_name}").to(config['device'])

In [20]:
def infer(essay):
    """Predict one label id per whitespace-separated word of `essay`.

    Uses the notebook-level globals `tokenizer`, `model` and `config`. The essay is
    split on whitespace, tokenized to `config['max_length']` tokens (longer essays
    are truncated), and run through the model. A word's prediction is the prediction
    of its first token.

    Args:
        essay: the essay text.

    Returns:
        List of predicted label ids, one per word that survived truncation, in
        essay order. Also prints the number of predictions and the number of words
        so truncation is visible.
    """
    words = essay.split()

    # Makes a dict-like encoding with keys such as input_ids and attention_mask.
    encoding = tokenizer(words,
                         is_split_into_words=True, # Keeps the correspondence between words and tokens.
                         padding='max_length',
                         truncation=True,
                         max_length=config['max_length'])

    # word_ids() gives, for every token, the index of the word it came from (several
    # tokens share an index when a word is split into pieces) and None for tokens
    # added by the tokenizer. It is only needed on the Python side, so keep it a list.
    word_ids = encoding.word_ids()

    # Add a leading batch dimension of size 1 and move the inputs to the model's device.
    input_ids = torch.as_tensor(encoding['input_ids']).unsqueeze(0).to(config['device'])
    attention_mask = torch.as_tensor(encoding['attention_mask']).unsqueeze(0).to(config['device'])

    model.eval()
    # No gradients are needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        output_dict = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            return_dict=True)

    # One predicted label id per token, pulled back to plain Python ints in one transfer.
    token_predictions = torch.argmax(output_dict['logits'].view(-1, model.num_labels), axis=-1).tolist()

    # Keep the prediction of the first token of every word; skip tokens without a word.
    words_predictions = []
    prev_word_idx = None
    for token_idx, word_idx in enumerate(word_ids):
        if word_idx is None or word_idx == prev_word_idx:
            continue
        prev_word_idx = word_idx
        words_predictions.append(token_predictions[token_idx])

    print(len(words_predictions), len(words))

    return words_predictions

In [21]:
test_essay = """
During a group project, have you ever asked a group member about adding or replacing something? Or, when you were studying for a math test, did you ever ask your parents or sibling about different ways to tackle a certain problem? Asking for other's opinions is especially beneficial as it allows for an individual to receive a variety of different views towards a given topic. Likewise, being diverse and asking many people for their opinions allows one to understand how most people percieve something. This is especially important as knowing multiple opinions can allow someone to take those views into account and sway themseleves to the general audience. Knowing different people's opinion can be beneficial in a variety of situations.

First and foremost, a great example about how knowing other's opinions is helpful is when someone is making the choice between smoking or refraining from smoking. A student can watch on a TV channel that smoking is bad, and can damage their internal organs. However, on another channel, the student can find advertisements about the most addicting smoking device that can release the most dopomine in the brain, all the while not severly harming people's lungs. This student will receive a variety of different views and opinions on a certain topic, which allows them to make the best educated choice or decision based on how they interpret what they saw. Similarily, a student can be told from his fellow classmates that smoking is fun, joyful, and makes them happy. However, if the student asks a local doctor, they will be informed differently. A doctor will most likely tell them that smoking, although seeming harmless at first, can lead to serious long term consequences.

If the student asks both his friends and his doctors, he is able to use his judgemental skills to determing which choice will be best for him in the long run.

Furthermore, asking for multiple opinions can benifit during competitions for a position slot, as cadidates needs to make decisions on what they need to say or do. For example, it can be helpful in situations like elections, both for the U.S. or simply in school. If a student is running for a position in office to represent his/her school, he/she can ask a widespread and diverse audience. First, asking other students is their best bet to obtaining information. Other students can inform him/her about what they want, like better water fountains, recess, or healthier food. Then, the student running can make changes to the way they run for the election, and on his/her speech, take a different approach. In addition, if the student running asks an adult, they will get to know a more realistic way the school can be improved. Since a student, even as a student officer, isn't able to make a significant change to a school, they can inform the school board about ways to make the school better. If someone is running for the president of the United States, a similar approach can be taken. First, they can ask the people, on social media or in speeches, about positive ways to reform our country. After the candidate receives the opinion of general audiences, they can campaign differently to match the view of those voting. All in all, asking for the opinion of multiple different people can set the candidate apart from others.

Many people only ask one type of audience for their opinion. Having only one opinion can lead to negative consequences, such as making the wrong choices related to health or education, as only one audience is adressed into making a decision. Therefore, asking multiple different people who have different backgrounds is essential to making the best choices in life. Conclusively, knowing multiple opinions on a certain matter can evidently lead to better results for individuals. 
"""
# Predict one label id per word of the sample essay. `infer` also prints the number of
# predictions next to the number of words; equal numbers mean nothing was truncated.
preds = infer(test_essay)

635 635


In [1]:
from IPython.display import HTML
from math import ceil
import numpy as np

# Background color used to highlight each segment type (keyed by segment id).
segment_colors = {
  0: 'rgba(0,0,0, 0.0)',
  1: 'rgba(206,95,20, 0.8)',
  2: 'rgba(114,174,146, 0.8)',
  3: 'rgba(251,174,28, 0.8)',
  4: 'rgba(81,53,51, 0.8)',
  5: 'rgba(43,112,133, 0.8)',
  6: 'rgba(200,109,142, 0.8)',
  7: 'rgba(243,218,179, 0.7)',
}

# Display name of each segment type (same keys, same order as `segment_colors`).
segment_names = {
  0: 'Unnanotated',
  1: 'Lead',
  2: 'Position',
  3: 'Evidence',
  4: 'Claim',
  5: 'Concluding Statement',
  6: 'Counterclaim',
  7: 'Rebuttal'
}

def generate_html_with_highlight(original_text, segment_types):
    """Build an HTML snippet that shows `original_text` with colored segments.

    The snippet contains a small stylesheet for hover tooltips, a color legend, and
    the text itself where each run of consecutive words with the same segment type
    is wrapped in one highlighted `<span>`.

    Args:
        original_text: the essay text; it is split on whitespace, so the original
            line breaks are not preserved in the output.
        segment_types: one segment id (a key of `segment_colors` /
            `segment_names`) per word. If the lengths differ, the extra words or
            ids are ignored.

    Returns:
        The HTML as a string.

    Raises:
        KeyError: if a segment id is not a key of `segment_colors`.
    """
    from html import escape  # Local import so this cell has no extra top-level dependency.

    parts = []

    # Hover stuff
    parts.append('<style>')
    parts.append('.segment-highlight:hover:before { content: attr(data-label); position: absolute; background: #111; color: #fff; padding: 4px 8px; border-radius: 4px; z-index: 2; font-size: 14px}')
    parts.append('.segment-highlight:hover:after { content: attr(data-label); position: absolute; background: #111; color: #fff; padding: 4px 8px; border-radius: 4px; z-index: 2; font-size: 14px}')
    parts.append('</style>')

    # Generate a legend showing what segment type each color represents
    parts.append('<p style="font-size: 18px; line-height: 1.6;"><b>Legend:</b><br/>')
    for segment_color, segment_name in zip(segment_colors.values(), segment_names.values()):
        parts.append(f'<span style="background-color: {segment_color};">{segment_name}</span><br/>')
    parts.append('<br/></p>')

    parts.append('<p style="font-size: 18px; line-height: 1.6;">You can also hover on top of each segment to see their type<br/><br/></p>')

    # Highlight original text
    parts.append('<p style="font-size: 18px; line-height: 1.6; background-color: white; color: black;"><b>Segmented Essay:</b><br/>')
    current_segment_type = None
    for word, segment_type in zip(original_text.split(), segment_types):
        if segment_type != current_segment_type:
            # The segment type changed: close the previous span (if any) and open a new one.
            if current_segment_type is not None:
                parts.append('</span>')
            parts.append(f'<span class="segment-highlight" style="background-color: {segment_colors[segment_type]};" data-label="{segment_names[segment_type]}">')
            current_segment_type = segment_type

        # Escape &, < and > so essay text can never be interpreted as markup.
        parts.append(f'{escape(word, quote=False)} ')

    if current_segment_type is not None:
        parts.append('</span>')

    parts.append('</p>')

    return ''.join(parts)


# Fold each B-/I- label id pair onto a single segment id by halving and rounding up:
# 0 -> 0, (1, 2) -> 1, (3, 4) -> 2, ... (integer form of ceil(label_id / 2)).
segments = [(label_id + 1) // 2 for label_id in preds]

# Render the highlighted essay as rich HTML output of this cell.
html_code = generate_html_with_highlight(test_essay, segments)
HTML(html_code)

NameError: name 'preds' is not defined