<a href="https://colab.research.google.com/github/dwjuston/NLP/blob/main/FineTuneWSBLongShortNorec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 2022 Fine Tuning With Custom Datasets


# HF Example Code
https://huggingface.co/transformers/custom_datasets.html

In [None]:
%%capture
# `%pip` (unlike `!pip`) installs into the environment of the running kernel.
%pip install transformers

## Load Training Data

In [None]:
from pathlib import Path
import json

import pandas as pd

In [None]:
# Folder holding the instructor-tagged CSV files, relative to the working directory.
# Alternative path kept for reference (presumably the 2022 course share):
#training_fldr = Path('drive/MyDrive/baruch-nlp/math-9796-2022/share-2022-mth-9796/2022-homework-4/hw4-2022-training-data/instructor/')
training_fldr = Path('drive/MyDrive/baruch-nlp/math-9796-2021/MTH9796-Share/Homework-4/training-data/instructor/')


def iter_tagged_files(folder=None, n_files=5):
  """Yield the paths `team-0-1-done.csv` .. `team-0-<n_files>-done.csv`.

  Args:
    folder: Directory that contains the files. Defaults to `training_fldr`.
    n_files: How many consecutively numbered files to yield (default 5).

  Yields:
    One `Path` per file, in ascending file-number order. The paths are only
    constructed here; nothing checks that the files exist.
  """
  base = training_fldr if folder is None else Path(folder)
  for i in range(1, n_files + 1):
    yield base / f"team-0-{i}-done.csv"

### Confirm files are tagged correctly

In [None]:
# Sanity check: list the distinct label values present in each tagged file.
for fn in iter_tagged_files():
  # Hand pandas the path itself so it opens *and closes* the file; a handle
  # created with `fn.open('r')` would never be closed.
  df = pd.read_csv(fn)
  print(fn,sorted(df['label'].unique()))

drive/MyDrive/baruch-nlp/math-9796-2021/MTH9796-Share/Homework-4/training-data/instructor/team-0-1-done.csv ['long', 'norec', 'short']
drive/MyDrive/baruch-nlp/math-9796-2021/MTH9796-Share/Homework-4/training-data/instructor/team-0-2-done.csv ['long', 'norec', 'short']
drive/MyDrive/baruch-nlp/math-9796-2021/MTH9796-Share/Homework-4/training-data/instructor/team-0-3-done.csv ['long', 'norec', 'short']
drive/MyDrive/baruch-nlp/math-9796-2021/MTH9796-Share/Homework-4/training-data/instructor/team-0-4-done.csv ['long', 'norec', 'short']
drive/MyDrive/baruch-nlp/math-9796-2021/MTH9796-Share/Homework-4/training-data/instructor/team-0-5-done.csv ['long', 'norec', 'short']


### Load all tagged data

In [None]:
# Read every tagged CSV by path (pandas opens and closes each file itself, so
# no file handle is leaked) and stack the frames into one.
# Each file's own row index is kept, so index values repeat across files.
tagged_data = pd.concat(
    [pd.read_csv(path) for path in iter_tagged_files()]
)
tagged_data.shape

(1497, 4)

In [None]:
# Preview the first five rows of the combined frame.
tagged_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,fname,text,label
0,0,file-0000.csv,We made it to Vice News you dirty apes ü¶ç,norec
1,1,file-0000.csv,So what happened end of day on the DOW...hell ...,norec
2,2,file-0000.csv,HOFV Mega squeeze Opportunity,long
3,3,file-0000.csv,Miley Cryus giving away $50 of free stock via ...,norec
4,4,file-0000.csv,"Soon , fellow ü¶çü¶çü¶ç",norec


In [None]:
# Integer class id assigned to each tag that appears in the `label` column.
labels = dict(long=0, short=1, norec=2)

In [None]:
# Raw post texts, plus each row's tag translated to its integer class id
# (an unknown tag raises KeyError).
tagged_texts = tagged_data['text'].tolist()
tagged_labels = [labels[tag] for tag in tagged_data['label']]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tagged posts for validation.
# A fixed seed makes the split repeatable across kernel restarts; stratifying
# on the label keeps the class mix the same in the train and validation parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    tagged_texts,
    tagged_labels,
    test_size=.2,
    random_state=42,
    stratify=tagged_labels,
)

In [None]:
from transformers import DistilBertTokenizerFast
# Tokenizer loaded from the same checkpoint name the model cell uses.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased',
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Peek at the first training example and confirm the container type.
(train_texts[0], type(train_texts))

('what is the best app for stock trading', list)

In [None]:
# Both splits are tokenized with the same truncation/padding options.
tokenize_kwargs = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)
# test_encodings = tokenizer(test_texts, **tokenize_kwargs)

In [None]:
import torch

class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    `encodings` must expose `.items()` yielding `(field_name, per_example_values)`
    pairs; `labels` is an indexable sequence with one entry per example.
    (The class name is presumably inherited from the Hugging Face example this
    notebook follows; it holds whatever encodings and labels it is given.)
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # Dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field for example `idx`, plus its label
        # under the key 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a dataset object.
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_labels)
val_dataset = IMDbDataset(encodings=val_encodings, labels=val_labels)
# test_dataset = IMDbDataset(encodings=test_encodings, labels=test_labels)

## Train Model

### Native PyTorch Training Loop

In [None]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification

# Seed PyTorch so the newly initialised classifier head and the batch shuffling
# are repeatable across kernel restarts (GPU kernels may still add small noise).
torch.manual_seed(42)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# We set num_labels to match our training data
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
model.to(device)
model.train() # Put the model into "training-mode."

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# `transformers.AdamW` is deprecated and missing from recent transformers
# releases, so use PyTorch's own implementation. weight_decay=0.0 keeps the
# regularisation of the optimizer this replaces (torch's default is 0.01).
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

In [None]:
# Fine-tune for five passes over the training set, printing one dot per batch.
model.train()  # Re-assert training mode in case this cell is re-run after evaluation.
for epoch in range(5):
    print("\nEpoch", epoch)
    for batch in train_loader:
        print(".", end='')
        optim.zero_grad() # Zero-out gradients.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Deliberately not named `labels`: that name holds the tag -> class-id
        # dict, and overwriting it here breaks re-running the earlier cells.
        batch_labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs[0]  # The first element of the output is read as the loss.
        loss.backward() # calculate gradient: dloss/dweights
        optim.step()
print("Done")


Epoch 0
...........................................................................
Epoch 1
...........................................................................
Epoch 2
...........................................................................
Epoch 3
...........................................................................
Epoch 4
...........................................................................Done


In [None]:
# Put the model into "inference-mode". The trailing semicolon hides the module
# repr without rebinding `_`, which IPython uses for the last cell output.
model.eval();

## Validate

In [None]:
# A single batch holding the whole validation set. Shuffling is off so that
# row i of the model output corresponds to val_texts[i] / val_labels[i], which
# the spot-check cells rely on.
val_loader = DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False)
print(len(val_dataset))

300


In [None]:
# Score the validation set. Gradients are not needed for evaluation, so the
# forward pass runs under no_grad() to avoid building an autograd graph for
# the whole validation batch.
# `outputs` and `labels` are read by the accuracy cell, so the names are kept;
# only the last batch's values survive the loop.
model.eval()
with torch.no_grad():
  for batch in val_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs[0]
    print(loss)

tensor(0.7381, device='cuda:0', grad_fn=<NllLossBackward0>)


### Calculate Accuracy

In [None]:
# Predicted class = index of the largest logit in each row, computed in one
# vectorised call instead of one Python-level argmax per example.
yhat = outputs['logits'].argmax(dim=1).tolist()
accuracy = pd.DataFrame({
    'y':labels.tolist(),
    'yhat': yhat})
# Flag the rows where the prediction matches the true class id.
accuracy['correct'] = accuracy['y']==accuracy['yhat']
accuracy.tail()

Unnamed: 0,y,yhat,correct
295,2,2,True
296,2,2,True
297,0,0,True
298,2,2,True
299,2,2,True


In [None]:
# Fraction of validation rows predicted correctly (mean of the boolean column).
accuracy['correct'].mean()

0.8233333333333334

### Calculate Confusion Matrix

**Exercise for the student...**

### Calculate Additional Metrics

**Exercise for the student:**

* Precision;
* Recall;
* F1

### Spot Check

In [None]:
# Logits for row 1 next to validation text 1; the two only refer to the same
# example when val_loader iterates in dataset order (shuffle=False).
(outputs['logits'][1], val_texts[1])

In [None]:
# Logits for row 9 next to validation text 9; the two only refer to the same
# example when val_loader iterates in dataset order (shuffle=False).
(outputs['logits'][9], val_texts[9])

In [None]:
# Logits for row 15 next to validation text 15; the two only refer to the same
# example when val_loader iterates in dataset order (shuffle=False).
(outputs['logits'][15], val_texts[15])