<a href="https://colab.research.google.com/github/dibsondivya/ai-health-event/blob/main/Divya_Finetuning_ERNIE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Source
[Fine Tuning](https://medium.com/nlplanet/fine-tuning-distilbert-on-senator-tweets-a6f2425ca50e)


## Data and Package Installation
Manually import data files

In [None]:
import pandas as pd
df = pd.read_csv('train_textcleaned.csv', index_col=0)
df_test = pd.read_csv('test_textcleaned.csv', index_col=0)

In [None]:
!pip install datasets --quiet

[K     |████████████████████████████████| 362 kB 21.8 MB/s 
[K     |████████████████████████████████| 140 kB 56.2 MB/s 
[K     |████████████████████████████████| 212 kB 55.8 MB/s 
[K     |████████████████████████████████| 1.1 MB 49.5 MB/s 
[K     |████████████████████████████████| 127 kB 46.3 MB/s 
[K     |████████████████████████████████| 271 kB 55.7 MB/s 
[K     |████████████████████████████████| 144 kB 47.0 MB/s 
[K     |████████████████████████████████| 94 kB 2.7 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
[?25h

## Prepare datasets

In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Put clean data in a dataset split into train and test sets
dataset = Dataset.from_pandas(df).train_test_split(train_size=0.8, seed=123)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label', '__index_level_0__'],
        num_rows: 2540
    })
    test: Dataset({
        features: ['tweet', 'label', '__index_level_0__'],
        num_rows: 635
    })
})


In [None]:
testdataset = Dataset.from_pandas(df_test)
print(testdataset)

Dataset({
    features: ['tweet', 'label', '__index_level_0__'],
    num_rows: 1361
})


## Tokenize data

In [None]:
!pip install transformers --quiet
from transformers import AutoTokenizer

# Instantiate ERNIE tokenizer...
tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-en")

Downloading:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Tokenize and encode the dataset
def tokenize(batch):
    tokenized_batch = tokenizer(batch['tweet'], padding=True, truncation=True, max_length=128)
    return tokenized_batch

dataset_enc = dataset.map(tokenize, batched=True, num_proc=4)

# Set dataset format for PyTorch
dataset_enc.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Check the output
print(dataset_enc["train"].column_names)



      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

['tweet', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask']


## Prepare dataloaders

In [None]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# Instantiate a data collator with dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create data loaders for to reshape data for PyTorch model
train_dataloader = DataLoader(
    dataset_enc["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    dataset_enc["test"], batch_size=8, collate_fn=data_collator
)

## Load Model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Set number of class labels based on dataset since 0,1,2,3
num_labels = 4
print(f"Number of labels: {num_labels}")

# Load model from checkpoint
model = AutoModelForSequenceClassification.from_pretrained("nghuyong/ernie-2.0-en", num_labels=num_labels)

Number of labels: 4


Some weights of the model checkpoint at nghuyong/ernie-2.0-en were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-en a

In [None]:
from transformers import AdamW
from transformers import get_scheduler

# Model parameters
learning_rate = 5e-5
num_epochs = 5

# Create the optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Further define learning rate scheduler
num_training_batches = len(train_dataloader)
num_training_steps = num_epochs * num_training_batches
lr_scheduler = get_scheduler(
    "linear",                   # linear decay
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)



## Device settings

In [None]:
# Set the device automatically (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Move model to device
model.to(device)

cpu


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(4, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, element

## Train Model

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

# Train the model with PyTorch training loop
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/1590 [00:00<?, ?it/s]

## Save model

In [None]:
# Save model to disk
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define root dir in Google Drive
root_dir = "/content/drive/MyDrive/colab_data"

model.save_pretrained(f"{root_dir}/models/ernie-ai-health")

Mounted at /content/drive


## Validate model

In [None]:
from datasets import load_metric

# Load metric
metric = load_metric("accuracy","f1")

# Iteratively evaluate the model and compute metrics
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Get model accuracy and F1 score
metric.compute()

{'accuracy': 0.8881889763779528}

## Prepare test set

In [None]:
testdataset_enc = testdataset.map(tokenize, batched=True, num_proc=4)

# Set dataset format for PyTorch
testdataset_enc.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataloader = DataLoader(
    testdataset_enc, batch_size=8, collate_fn=data_collator
)

       

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

## Evaluate with test set

In [None]:
# Iteratively evaluate the model and compute metrics
model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Get model accuracy and F1 score
metric.compute()

{'accuracy': 0.8743570903747244}

## Save model

In [30]:
from google.colab import files 

torch.save(model.state_dict(), 'ernie.pth') # saves model parameters

# download ernie file
files.download('ernie.pth')

## to load
#the_model = TheModelClass(*args, **kwargs)
#the_model.load_state_dict(torch.load(PATH))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
torch.save(model, 'erniemodel.pth')
files.download('erniemodel.pth')

## to load
# model = torch.load(PATH)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>