In [6]:
!pip install transformers
# install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
# required libraries to run model

import torch
from transformers import DistilBertTokenizerFast
from transformers import Trainer, TrainingArguments
from transformers import DistilBertForSequenceClassification, AdamW

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import numpy as np
import pandas as pd
import re


In [8]:
# credits to @yashj302 on medium
# https://medium.com/@yashj302/text-cleaning-using-regex-python-f1dded1ac5bd
# function used to clean junk out of language. We don't need numbers in the tokenizer
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'scuse', 'excuse', text)  # lol
    text = re.sub(r'http[s]?\://\S+', "", text)  # remove http://medium.com
    text = re.sub(r'\s+', ' ', text)  # remove 'VERY   EXTRA   SPACE S       '
    text = re.sub(r'[0-9]', "", text)  # remove numbers
    text = re.sub(r'[^\w]', ' ', text)  # remove characters
    text = re.sub(r' +', ' ', text)
    text = text.strip(' ')
    return text


In [5]:
# the base model is this
model_name = "distilbert-base-uncased"

# read the csv and clean the text
df = pd.read_csv('train.csv')
df['comment_text'] = df['comment_text'].apply(clean_text)

# set what the texts and labels are
train_texts = df["comment_text"].values
train_labels = df[df.columns[2:]].values

# split the text and values into training and validation splits
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=.2)


In [9]:
# Since there are multiple labels we need to find and the trainer class only uses binary classifiers,
# we have to write our own Dataset format that takes multiple labels
# it needs an init, getitem, and len function to work correctly with the dataset and training modules
class MultiClass(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    # get item will return the tensor where the value is being found
    def __getitem__(self, idx):
        encodings = tokenizer(
            self.texts[idx], truncation=True, padding="max_length")
        item = {key: torch.tensor(val) for key, val in encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        del encodings
        return item

    # return the length of the labels as those are the values needed
    def __len__(self):
        return len(self.labels)


In [10]:

# we initialize the tokenizer from the same model name
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# call the init function on our texts and labels to store
train_dataset = MultiClass(train_texts, train_labels)
val_dataset = MultiClass(val_texts, val_labels)

# check if we're using cuda enabled gpus
device = torch.device(
    'cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

cuda


In [11]:
# the model needs to be mapped to the device we're using and called. We pass in num_labels to establish
# the length of the MultiClass
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=6, problem_type="multi_label_classification")
model.to(device)
model.train()

# this is the dataloader that takes 32 samples at a time and puts them in a batch to pass into the trainer
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# set the optimizer, basically "better" sgd
optim = AdamW(model.parameters(), lr=5e-5)

# one epoch
for epoch in range(1):
    # for every batch of 32 samples
    for batch in train_loader:
        # forward prop
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # backwards prop
        outputs = model(
            input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        print(loss)
        loss.backward()
        optim.step()

# reset our model back to evaluation mode
model.eval()

# output the model and tokenizer to be used on huggingface
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("new_tokenizer")


Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

tensor(0.7034, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6715, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6360, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6187, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.5740, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.5272, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.4835, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.4485, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.4398, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.3988, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.4079, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.3177, device='cuda:0',


KeyboardInterrupt: ignored