# Model training

In [1]:
from trainer import *

  from .autonotebook import tqdm as notebook_tqdm


## Dataset preparation

In [12]:
from generator import *

add_entries("./prompts", "dataset/output.csv")
clean_entries("dataset/output.csv")
no_tone_csv("dataset/output.csv")
target_to_nb("dataset/no_tone_output.csv")
pd.read_csv("dataset/no_tone_output.csv").sample(20).to_csv("dataset/no_tone_output_test.csv") # manage test set, should be improved
print("Files were created:\n - " +"\n - ".join([file for file in os.listdir("dataset/") if file[-4:] == ".csv"]))

Files were created:
 - output.csv
 - no_tone_output_test.csv
 - no_tone_nbtarget_output.csv
 - no_tone_output.csv


In [13]:
train_texts, train_labels = read_target_split('dataset/no_tone_output.csv')
test_texts, test_labels = read_target_split('dataset/no_tone_output_test.csv')

# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.05)

# Define tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode the data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Create instances of HATEDataset (gives all the attributes)
train_dataset = HATEDataset(train_encodings, train_labels)
val_dataset = HATEDataset(val_encodings, val_labels)
test_dataset = HATEDataset(test_encodings, test_labels)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## Model training

In [4]:
# Model parameters
criterion = nn.CrossEntropyLoss()
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(MAPPING))
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

metrics = {'ACC': acc, 'F1-weighted': f1}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Train the model
loss, metric = model_training(model, train_dataset, 5, optim, criterion, metrics)

Epoch 1 / 5


100%|██████████| 40/40 [01:35<00:00,  2.39s/it]


train Loss: 0.9998,  ACC: 0.8641, F1-weighted: 0.7589
Epoch 2 / 5


100%|██████████| 40/40 [01:34<00:00,  2.37s/it]


train Loss: 0.3457,  ACC: 0.9688, F1-weighted: 0.9360
Epoch 3 / 5


100%|██████████| 40/40 [01:39<00:00,  2.48s/it]


train Loss: 0.1428,  ACC: 0.9984, F1-weighted: 0.9967
Epoch 4 / 5


100%|██████████| 40/40 [01:34<00:00,  2.35s/it]


train Loss: 0.0769,  ACC: 0.9969, F1-weighted: 0.9937
Epoch 5 / 5


100%|██████████| 40/40 [01:36<00:00,  2.41s/it]

train Loss: 0.0483,  ACC: 0.9975, F1-weighted: 0.9958





In [15]:
# Save the weights
torch.save(model.state_dict(), "model/weights")

# Or load weights
# model.load_state_dict(torch.load("model/weights"))

## Model responses

In [23]:
text = "Kitchen is the place they should be"
inputs = tokenizer(text, return_tensors="pt",  padding = True, truncation = True)
inputs = inputs.to(DEVICE)

with torch.no_grad():
    logits = model(**inputs.to(DEVICE)).logits

predicted_class_id = logits.argmax().item()
print("Prompt: ", text)
print(" - Predicted class id: ", predicted_class_id)
print(" - Predicted category: ", MAPPING_INV[predicted_class_id])

Prompt:  Kitchen is the place they should be
 - Predicted class id:  10
 - Predicted category:  women
