# Load the dataset

In [12]:
import pandas as pd
data = pd.read_csv('data/feedback_prize_input.csv')
data.head()

Unnamed: 0,text,cat_label
0,Modern humans today are always on their phone....,Lead
1,They are some really bad consequences when stu...,Position
2,Some certain areas in the United States ban ph...,Evidence
3,"When people have phones, they know about certa...",Evidence
4,Driving is one of the way how to get around. P...,Claim


In [13]:
data['label'] = data.cat_label.astype('category').cat.codes
data.head()

Unnamed: 0,text,cat_label,label
0,Modern humans today are always on their phone....,Lead,4
1,They are some really bad consequences when stu...,Position,5
2,Some certain areas in the United States ban ph...,Evidence,3
3,"When people have phones, they know about certa...",Evidence,3
4,Driving is one of the way how to get around. P...,Claim,0


In [14]:
#number of labels
"""
{ 'Claim': 0,
    'Concluding Statement':1,
    'Counterclaim':2,
    'Evidence':3,
    'Lead':4,
    'Position':5,
    'Rebuttal':6}
"""
sorted(data['label'].unique())


[0, 1, 2, 3, 4, 5, 6]

In [15]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
"""
split train and test set
"""
data.drop('cat_label', axis=1, inplace=True)
train, test = train_test_split(data, test_size=0.2, random_state=42)
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(test)

ds = DatasetDict()

ds['train'] = tds
ds['test'] = vds

In [16]:
ds["test"][0]

{'text': 'There is also te question of what if the computer gets something wrong or has a glitch? what if it accedentily misinterpruts sadness as anger for example. Not only will this cause a plethera of complications, it may also lead to the computer making other mistakes in the process. ',
 'label': 3,
 '__index_level_0__': 53637}

# Preprocess

In [17]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

loading configuration file config.json from cache at /home/fatenghali/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/fatenghali/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /home/fatenghali/.cache/huggingface/hub/model

In [18]:
"""
 a preprocessing function to tokenize text and truncate sequences to be no longer than DistilBERT’s maximum input length
"""
def preprocess_function(data):
    return tokenizer(data['text'], truncation=True)

In [19]:
tokenized_data = ds.map(preprocess_function, batched=True)

100%|██████████████████████████████████████████████████████████████| 116/116 [00:02<00:00, 40.95ba/s]
100%|████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 40.84ba/s]


Use DataCollatorWithPadding to create a batch of examples. It will also dynamically pad your text to the length of the longest element in its batch, so they are a uniform length. While it is possible to pad your text in the tokenizer function by setting padding=True, dynamic padding is more efficient.

In [20]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [25]:
import torch
  
print(f"Is CUDA supported by this system?{torch.cuda.is_available()}")
print(f"CUDA version:{torch.version.cuda}")
  
# Storing ID of current CUDA device
cuda_id = torch.cuda.current_device()
print(f"ID of current CUDA device:{torch.cuda.current_device()}")
        
print(f"Name of current CUDA device:{torch.cuda.get_device_name(cuda_id)}")

Is CUDA supported by this system?False
CUDA version:11.6


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [27]:
torch.cuda.is_available()

False

# Train

Load DistilBERT with AutoModelForSequenceClassification along with the number of expected labels

In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=7)

Downloading: 100%|████████████████████████████████████████████████| 268M/268M [00:14<00:00, 17.9MB/s]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at disti

At this point, only three steps remain:

1. Define your training hyperparameters in TrainingArguments.
2. Pass the training arguments to Trainer along with the model, dataset, tokenizer, and data collator.
3. Call train() to fine-tune your model.

In [11]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 115434
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 36075
  Number of trainable parameters = 66958855
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


KeyboardInterrupt: 