In [14]:
from datasets import Dataset
from datasets import DatasetDict
from torch.utils.data import DataLoader
from tqdm import tqdm 

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import numpy as np

In [2]:
# Read in the data
ds = Dataset.from_csv('data/mental_health.csv')
# Select the amount of data to drop
to_keep = 0.0
if to_keep:
    ds = ds.train_test_split(test_size=to_keep)['train']
# Split the data into train, test, and validation
all_parts = ds.train_test_split(test_size=0.2,shuffle=True)
test_valid = all_parts['test'].train_test_split(test_size=0.5)
data = DatasetDict({
    'train': all_parts['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

Found cached dataset csv (C:/Users/drago/.cache/huggingface/datasets/csv/default-e9382578803c537b/0.0.0)


In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=2).to(device)

freeze_weights = True

if freeze_weights:
    for name, param in model.named_parameters():
        if 'classifier' not in name:
            param.requires_grad = False

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

In [11]:
# Tokenize the data
def tokenize(batch):
    return tokenizer(batch['text'],padding=True,truncation=True)

data_tokenized = data.map(tokenize,batched=True)

# Remove the columns we don't need
new_dataset=data_tokenized.remove_columns('text')
new_dataset=new_dataset.rename_column('label','labels')
new_dataset.set_format("torch")


Loading cached processed dataset at C:\Users\drago\.cache\huggingface\datasets\csv\default-e9382578803c537b\0.0.0\cache-cf49302386046d32.arrow
Loading cached processed dataset at C:\Users\drago\.cache\huggingface\datasets\csv\default-e9382578803c537b\0.0.0\cache-937c65105e3adce5.arrow


In [12]:
# Create the dataloader to specify how the data is batched and passed into the model for training
train_loader = DataLoader(new_dataset['train'], batch_size=16, shuffle=True)
val_loader = DataLoader(new_dataset['valid'], batch_size=16)

# Create the optimizer, which specifies the parameters to update and how to update them
optim = AdamW(model.parameters(), lr=5e-5)



In [13]:
lowest_loss = np.inf
# One epoch is one pass through the entire dataset
for epoch in range(3):
    model.train()
    # Iterate over the batches of data. The dataloader will return the batches
    for batch in tqdm(train_loader, leave=False):
        # Clear the gradients from the previous iteration
        optim.zero_grad()
        # Move the batch of inputs to the GPU/CPU
        input_ids = batch['input_ids'].to(device)
        # the attention mask is used to ignore the padding tokens, i.e. tokens that are not part of the sentence
        attention_mask = batch['attention_mask'].to(device)
        # Move the labels to the GPU/CPU
        labels = batch['labels'].to(device)
        # Forward pass the inputs through the model
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # Get the loss from the first element of the outputs tuple
        loss = outputs[0]
        # Backpropagate the loss to update the model's parameters
        loss.backward()
        # Update the parameters
        optim.step()

    # Print the training and validation loss
    model.eval()  # handle drop-out/batch norm layers
    val_loss = 0
    train_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, leave=False):
            input_ids = batch['input_ids'].to(device)
            # the attention mask is used to ignore the padding tokens, i.e. tokens that are not part of the sentence
            attention_mask = batch['attention_mask'].to(device)
            # Move the labels to the GPU/CPU
            labels = batch['labels'].to(device)
            # Forward pass the inputs through the model
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            # Get the loss from the first element of the outputs tuple
            val_loss += outputs[0]
        # total loss - divide by number of batches
        val_loss = loss / len(val_loader)
    
    if val_loss < lowest_loss:
        lowest_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optim.state_dict(),
            'val_loss': lowest_loss,
            }, "model.pt")
    
    print("Epoch: {}, Val Loss: {}".format(epoch+1, val_loss))

                                                   

Epoch: 1, Train Loss: 0.0002048851310973987, Val Loss: 0.001637910259887576


                                                   

Epoch: 2, Train Loss: 0.0003730198659468442, Val Loss: 0.002982027130201459


                                                   

Epoch: 3, Train Loss: 0.0001945940311998129, Val Loss: 0.0015556401340290904


In [15]:
# Set the model to evaluation mode. So we don't update the weights
model.eval()
predictions = np.array([])

# For simplicity we'll evaluate passing all the samples at once
test_loader = DataLoader(new_dataset['test'], batch_size=100, shuffle=False)

# Get the scores on the test set
with torch.no_grad():
    # Pass through training set
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Get the logits from the model
        outputs = model(input_ids, attention_mask=attention_mask)
        # Get the predictions by taking the argmax of the logits
        predictions = np.append(predictions, torch.argmax(outputs[0], dim=1).tolist())


# Print accuracy
test_label = new_dataset['test']['labels'].tolist()
acc = accuracy_score(test_label, predictions.tolist())
f1 = f1_score(test_label, predictions.tolist())
prec = precision_score(test_label, predictions.tolist())
rec = recall_score(test_label, predictions.tolist())

print("Accuracy: {}, F1: {}, Precision: {}, Recall: {}".format(acc, f1, prec, rec))


Accuracy: 0.8934953538241601, F1: 0.8934953538241601, Precision: 0.8668515950069348, Recall: 0.9218289085545722
