In [1]:
# !pip install -q datasets
# !pip install transformers[sentencepiece]
# !pip install nb_black

In [2]:
# Load the nb_black IPython extension (the package named in the commented-out
# pip line of the first cell). NOTE(review): it appears to be what emits the
# "Javascript object" display after every cell — confirm.
%load_ext nb_black

<IPython.core.display.Javascript object>

In [3]:
import pandas as pd
from pathlib import Path

# Directory holding the raw CSV export (absolute path on the SageMaker host).
path = Path("/home/ec2-user/SageMaker/")

# Read the raw data and discard every row that has a missing value in any column.
df = pd.read_csv(path / "df_raw.csv").dropna()

<IPython.core.display.Javascript object>

In [4]:
# Summarise the object-dtype (string) columns: row count, number of distinct
# values, the most frequent value and how often it occurs.
df.describe(include="object")

Unnamed: 0,gut_codes,transcript_text
count,6268,6268
unique,505,2927
top,K12.MA-MA7-RPS-A.03,"Explore the Relationship Among Fractions, Deci..."
freq,82,60


<IPython.core.display.Javascript object>

In [5]:
# Give every distinct gut_code a contiguous integer id, numbered in the order
# the codes first appear in the frame.
gutcode_to_code = {
    gut_code: label_id for label_id, gut_code in enumerate(df["gut_codes"].unique())
}

<IPython.core.display.Javascript object>

In [6]:
# Integer class label for each row, looked up from the gut_code -> id mapping.
# Every gut_code in the frame is a key of the mapping, so no lookup can miss.
df["labels"] = df["gut_codes"].map(gutcode_to_code)

<IPython.core.display.Javascript object>

In [7]:
# df = df.sample(1000)

<IPython.core.display.Javascript object>

In [8]:
# Inspect the frame now that the integer `labels` column has been added.
df

Unnamed: 0.1,Unnamed: 0,gut_codes,transcript_text,labels
0,0,K12.MA-MA6-RPS-A.02,Ratio Worksheets | Simple Ratio Worksheets Thi...,0
1,1,K12.MA-A1-A-REI.C.01,Systems of Linear Equations: Solving by Additi...,1
2,2,K12.MA-A1-F-IF.B.01,"Increasing and Decreasing Functions math, math...",2
3,3,K12.MA-A2-F-IF.B.01,"Increasing and Decreasing Functions math, math...",3
4,4,K12.MA-A1-F-LE.A.01.03,"Increasing and Decreasing Functions math, math...",4
...,...,...,...,...
8472,8472,K12.MA-A1-A-SSE.B.01.03,Illustrative Mathematics Providing instruction...,26
8473,8473,K12.MA-A1-A-REI.B.02.02,3 Ways to Solve Quadratic Equations - wikiHow ...,85
8474,8474,K12.MA-GEO-SP-CP.A.02,Illustrative Mathematics Providing instruction...,115
8475,8475,K12.MA-GEO-SP-CP.A.03,Illustrative Mathematics Providing instruction...,113


<IPython.core.display.Javascript object>

In [9]:
# Model input text: the transcript with a fixed "TEXT1: " prefix prepended.
df["text"] = "TEXT1: " + df["transcript_text"]

<IPython.core.display.Javascript object>

In [10]:
from datasets import Dataset, DatasetDict

# Convert the pandas frame into a Hugging Face Dataset. The frame's index is
# carried over as an extra `__index_level_0__` column (see the displayed
# features in the next cell); it is removed again further down.
dataset = Dataset.from_pandas(df)

<IPython.core.display.Javascript object>

In [11]:
# Show the columns and row count of the freshly converted dataset.
dataset

Dataset({
    features: ['Unnamed: 0', 'gut_codes', 'transcript_text', 'labels', 'text', '__index_level_0__'],
    num_rows: 6268
})

<IPython.core.display.Javascript object>

In [12]:
# Drop the columns the model does not need: the leftover CSV index, the raw
# transcript (superseded by `text`) and the pandas index added by from_pandas.
# Only columns that are still present are removed, so re-running this cell
# after it has already succeeded is a no-op instead of an error.
columns_to_drop = ["Unnamed: 0", "transcript_text", "__index_level_0__"]
dataset = dataset.remove_columns(
    [name for name in columns_to_drop if name in dataset.column_names]
)

<IPython.core.display.Javascript object>

In [13]:
# Confirm that only `gut_codes`, `labels` and `text` remain.
dataset

Dataset({
    features: ['gut_codes', 'labels', 'text'],
    num_rows: 6268
})

<IPython.core.display.Javascript object>

In [14]:
# Hold out 10% of the rows as the "test" split; the fixed seed keeps the
# split reproducible across runs.
dataset = dataset.train_test_split(test_size=0.10, seed=42)

<IPython.core.display.Javascript object>

In [15]:
# Show the resulting train/test splits and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['gut_codes', 'labels', 'text'],
        num_rows: 5641
    })
    test: Dataset({
        features: ['gut_codes', 'labels', 'text'],
        num_rows: 627
    })
})

<IPython.core.display.Javascript object>

In [16]:
from transformers import DistilBertTokenizerFast

# Load the fast tokenizer for the "distilbert-base-uncased" checkpoint — the
# same checkpoint name the classification model is loaded from further down.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

<IPython.core.display.Javascript object>

In [17]:
def _encode_split(split_name):
    """Tokenize the ``text`` column of one split into truncated, padded PyTorch tensors."""
    return tokenizer(
        dataset[split_name]["text"],
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    )


# Both splits go through exactly the same tokenizer settings.
train_encodings = _encode_split("train")
val_encodings = _encode_split("test")

<IPython.core.display.Javascript object>

In [18]:
import torch


class BankingDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name (e.g. ``"input_ids"``) to an
            indexable batch of values — either tensors or nested lists.
        labels: Indexable sequence with one class label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as an independent tensor.

        Calling ``torch.tensor`` on something that is already a tensor emits a
        UserWarning for every field of every sample; ``clone().detach()`` is
        the equivalent copy without the warning.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding field plus a ``labels`` tensor."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

<IPython.core.display.Javascript object>

In [19]:
# Wrap the tokenized splits and their labels for the Trainer.
train_dataset = BankingDataset(train_encodings, dataset["train"]["labels"])
# The "test" split produced by train_test_split is used as the validation set.
val_dataset = BankingDataset(val_encodings, dataset["test"]["labels"])
# NOTE: no separate held-out test dataset is built in this notebook.

<IPython.core.display.Javascript object>

In [20]:
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

<IPython.core.display.Javascript object>

In [21]:
# Training hyperparameters consumed by the TrainingArguments cell below.
bs = 16  # per-device training batch size (evaluation uses bs * 2)
epochs = 4  # number of training epochs
lr = 8e-5  # learning rate passed to the Trainer

<IPython.core.display.Javascript object>

In [22]:
# Trainer configuration. Checkpoints and configs are written under "outputs"
# (the recorded run saved outputs/checkpoint-500 and outputs/checkpoint-1000).
args = TrainingArguments(
    "outputs",
    learning_rate=lr,
    # NOTE(review): presumably warms the learning rate up over the first 10%
    # of steps before the cosine schedule takes over — confirm.
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    # NOTE(review): mixed-precision training; presumably needs a CUDA GPU — confirm.
    fp16=True,
    # Evaluate once per epoch (one metrics row per epoch in the recorded run).
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs * 2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    # NOTE(review): presumably turns off reporting to external experiment
    # trackers — confirm.
    report_to="none",
)

<IPython.core.display.Javascript object>

In [23]:
def compute_metrics(eval_pred):
    """Compute evaluation metrics for single-label multiclass predictions.

    Args:
        eval_pred: Pair ``(logits, labels)`` where ``logits`` holds one row of
            per-class scores per example and ``labels`` the true class ids.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``.

    Note:
        With exactly one predicted and one true class per example,
        micro-averaged precision, recall and F1 are all equal to accuracy
        (every wrong prediction is one false positive and one false negative).
        The value is therefore computed once with numpy instead of loading
        four metric scripts via ``load_metric`` on every evaluation pass. All
        four keys are kept so existing logs and dashboards stay unchanged;
        switch to a macro average if a per-class view is wanted.
    """
    import numpy as np

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Guard the empty evaluation set so the mean is never taken over nothing.
    if labels.size == 0:
        accuracy = 0.0
    else:
        accuracy = float(np.mean(predictions == labels))

    return {
        "precision": accuracy,
        "recall": accuracy,
        "f1": accuracy,
        "accuracy": accuracy,
    }

<IPython.core.display.Javascript object>

In [24]:
# Size the classification head from the label mapping rather than a literal,
# so it always matches the number of distinct gut_codes in the loaded data.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(gutcode_to_code)
)

# Fine-tuning driver: trains on the train split, evaluates on the held-out
# split and reports the metrics produced by compute_metrics.
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

<IPython.core.display.Javascript object>

In [25]:
# Run fine-tuning. The returned TrainOutput (displayed below) carries the
# final global step, the average training loss and runtime statistics.
trainer.train()

***** Running training *****
  Num examples = 5641
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1412
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,5.445369,0.039872,0.039872,0.039872,0.039872
2,5.716600,4.626223,0.07496,0.07496,0.07496,0.07496
3,4.386700,4.218055,0.105263,0.105263,0.105263,0.105263
4,4.386700,4.148781,0.116427,0.116427,0.116427,0.116427


***** Running Evaluation *****
  Num examples = 627
  Batch size = 32
  metric1 = load_metric("precision")
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 627
  Batch size = 32
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json
Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1000/tokenizer_config.json
Special token

TrainOutput(global_step=1412, training_loss=4.686880657422981, metrics={'train_runtime': 440.6226, 'train_samples_per_second': 51.209, 'train_steps_per_second': 3.205, 'total_flos': 3015806532464640.0, 'train_loss': 4.686880657422981, 'epoch': 4.0})

<IPython.core.display.Javascript object>

In [30]:
import numpy as np

# Raw per-class scores for every validation example, as a float array.
prediction_output = trainer.predict(val_dataset)
preds = prediction_output.predictions.astype(float)

# True labels and the single highest-scoring class per example.
actual = val_dataset.labels
predictions = preds.argmax(axis=-1)
y_pred = predictions

***** Running Prediction *****
  Num examples = 627
  Batch size = 32
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


<IPython.core.display.Javascript object>

In [31]:
def check_top_5(idx):
    """Return True when example ``idx``'s true label is among its five highest-scoring classes.

    Reads the module-level ``preds`` (per-class scores) and ``actual`` (true labels).
    """
    # argsort is ascending, so the last five positions hold the best classes.
    top_five = preds[idx].argsort()[-5:]
    return bool(actual[idx] in top_five)

<IPython.core.display.Javascript object>

In [32]:
# Tally validation examples whose true label is / is not in the top 5 predictions.
true_count = sum(1 for i in range(len(y_pred)) if check_top_5(i))
false_count = len(y_pred) - true_count

print(true_count, false_count)

230 397


<IPython.core.display.Javascript object>

In [34]:
# Top-5 accuracy in percent, taken from the counts computed in the previous
# cell instead of numbers copied by hand from its printed output.
true_count / (true_count + false_count) * 100

36.68261562998405

<IPython.core.display.Javascript object>

In [35]:
def check_top_10(idx):
    """Return True when example ``idx``'s true label is among its ten highest-scoring classes.

    Reads the module-level ``preds`` (per-class scores) and ``actual`` (true labels).
    """
    # argsort is ascending, so the last ten positions hold the best classes.
    top_ten = preds[idx].argsort()[-10:]
    return bool(actual[idx] in top_ten)

<IPython.core.display.Javascript object>

In [36]:
# Tally validation examples whose true label is / is not in the top 10 predictions.
true_count = sum(1 for i in range(len(y_pred)) if check_top_10(i))
false_count = len(y_pred) - true_count

print(true_count, false_count)

319 308


<IPython.core.display.Javascript object>

In [38]:
# Top-10 accuracy in percent, taken from the counts computed in the previous
# cell instead of numbers copied by hand from its printed output.
true_count / (true_count + false_count) * 100

50.877192982456144

<IPython.core.display.Javascript object>