Finetune encoder with classifier head:
*     roberta-large
Finetune decoder with lm head:
*     distilgpt2
Finetune encoder-decoder with seq2seqlm head (using lora):
*     flan-t5-large
Train classifier on top of embeddings:
*     gpt2-xl (decoder)
*     roberta-large (encoder)
*     openai-ada (decoder; calling api)
Finetune using OpenAI API:
*     curie
Ask instruction-tuned LM with no fine-tuning:
*     PaLM


In [1]:
#basic setup
!pip install --upgrade transformers  
!pip install --upgrade accelerate
!pip install datasets
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification #classification head atop base model
from datasets import Dataset,DatasetDict
from transformers import DataCollatorWithPadding
from transformers import create_optimizer
from transformers import TrainingArguments, Trainer
import torch
from torch.utils.data import DataLoader
import re
import os
import gc

def load_data(sep="[SEP]", aug=True, dset='train', path=None):
    """Load the tweet CSV and optionally fold keyword/location into the text.

    Args:
        sep: Separator string inserted between the tweet and each appended field.
        aug: When true, missing ``keyword``/``location`` values are replaced by
            ``'unknown'`` and both fields are appended to the ``text`` column.
        dset: ``'train'`` selects the cleaned training CSV; any other value
            selects the competition test CSV.
        path: Optional CSV location that overrides the default Kaggle input
            path chosen by ``dset`` (allows running outside Kaggle).

    Returns:
        pandas.DataFrame with the CSV's columns; ``text`` is rewritten when
        ``aug`` is true.
    """
    if path is None:
        if dset == 'train':
            path = "/kaggle/input/nlpgs-train-cln/train_cln.csv"
        else:
            path = "/kaggle/input/nlp-getting-started/test.csv"
    df = pd.read_csv(path)
    #add keyword and location
    if aug:
        df['keyword'] = df['keyword'].fillna('unknown')
        df['location'] = df['location'].fillna('unknown')
        # A list comprehension formats each row exactly like an f-string per
        # row would, and stays well-defined when the frame has no rows.
        df['text'] = [
            f"{text} {sep} keyword: {keyword} {sep} location: {location}"
            for text, keyword, location in zip(df['text'], df['keyword'], df['location'])
        ]
    return df

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.30.1
    Uninstalling transformers-4.30.1:
      Successfully uninstalled transformers-4.30.1
Successfully installed transformers-4.30.2
Collecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.12.0
    Uninstalling accelerate-0.12.0:
      Successfully uninstalled accelerate-0.12.0
Successfully installed accelerate-0.20.3
[0m

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [5]:
#This finetunes encoders with a classification head (using pytorch)
# Checkpoint to fine-tune. Earlier candidates: "gpt2-medium",
# "cardiffnlp/twitter-roberta-base", "roberta-base", "distilbert-base-uncased".
model_name="roberta-large"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The collator pads every batch to that batch's longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
def tok_func(x):
    """Tokenize the ``text`` column; padding is left to ``data_collator``.

    Padding here as well would pad each map batch to its longest row and make
    the per-batch padding in the collator pointless.
    """
    return tokenizer(x["text"], truncation=True)

train_df = load_data(aug=True, dset='train')
ds = Dataset.from_pandas(train_df)
# Hold out 5% of the training data as a validation split (fixed seed).
ds = ds.train_test_split(test_size=0.05, seed=42)
tok_ds = ds.map(tok_func, batched=True, remove_columns=('keyword','id','location','text'))
# The raw CSV column is called 'target'; it is renamed to 'label' for training.
tok_ds = tok_ds.rename_columns({'target':'label'})

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device);

training_args = TrainingArguments(
    output_dir = "test",
    overwrite_output_dir=True,
    report_to='none',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_ds["train"],
    eval_dataset=tok_ds["test"],
    data_collator=data_collator,
)

trainer.train()

###try it on validation set
prediction_output = trainer.predict(tok_ds["test"])

# getting predictions and converting to classes
predictions = prediction_output.predictions
predicted_classes = np.argmax(predictions, axis=1)

# Prepare the data as a pandas DataFrame (ids/text/targets come from the
# untokenized split, which has the same row order as tok_ds["test"]).
data = {"id": ds["test"]["id"], "input_text": ds["test"]["text"], "predicted_label": predicted_classes, "true_label": ds["test"]["target"]}
output_df = pd.DataFrame(data)

# Save the DataFrame to a CSV file named after the checkpoint's short name
output_df.to_csv(model_name.rsplit('/', 1)[-1]+'_aug_finetuned_classhead_cln_eval_short.csv', index=False)

####now on test set (no true labels available, so only predictions are saved)
test_df = load_data(aug=True, dset='test')
ds = Dataset.from_pandas(test_df)
tok_ds = ds.map(tok_func, batched=True, remove_columns=('keyword','id','location','text'))
prediction_output = trainer.predict(tok_ds)
predictions = prediction_output.predictions
predicted_classes = np.argmax(predictions, axis=1)
data = {"id": ds["id"], "input_text": ds["text"], "predicted_label": predicted_classes}
output_df = pd.DataFrame(data)
output_df.to_csv(model_name.rsplit('/', 1)[-1]+'_aug_finetuned_classhead_cln_test.csv', index=False)


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should 

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,0.262497
2,0.390100,0.240739
3,0.223900,0.314278


  0%|          | 0/4 [00:00<?, ?ba/s]

In [6]:
#Causal lm (decoder) fine-tuning
from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from tqdm import tqdm
# Checkpoint for causal-LM fine-tuning; the trailing comment lists alternatives tried.
model_name="distilgpt2"#"gpt2-xl"#"gpt2-medium"#

#load model
model = AutoModelForCausalLM.from_pretrained(model_name)

#load and set up tokenizer
# Left padding is requested here; presumably so that batched generation
# continues directly after each prompt -- TODO confirm intent.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side = 'left')
# Make sure a separator token exists; a newly added token enlarges the
# vocabulary, so the model's embedding matrix is resized to match.
if tokenizer.sep_token is None:
    sep = '[SEP]'
    tokenizer.add_special_tokens({'sep_token': sep})
    model.resize_token_embeddings(len(tokenizer))
else:
    sep = tokenizer.sep_token
# Same for the padding token: add a dedicated '[PAD]' and resize again.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

#load and split the data
# The tokenizer's separator is used between tweet, keyword and location.
train_df = load_data(sep,aug=True,dset='train')
ds = Dataset.from_pandas(train_df)
# 5% validation hold-out with a fixed seed.
ds = ds.train_test_split(test_size=0.05, seed=42)

#append labels to training input
def format_text(example, train):
    """Wrap a tweet in the ``Tweet: ... Label: ...`` prompt used for causal-LM tuning.

    Args:
        example: Mapping with a ``text`` entry and, when ``train`` is true, a
            ``target`` entry (1 means real disaster).
        train: If true, the gold label text is appended after ``Label:``;
            otherwise the prompt stops at ``Label:`` so the model can
            complete it.

    Returns:
        dict with a single ``text`` key holding the formatted prompt.
    """
    prompt = f"Tweet: {example['text']} Label:"
    if not train:
        # No trailing space: in the training text the space after "Label:"
        # is followed by the label word, so an inference prompt ending in a
        # bare space would tokenize differently from what the model saw.
        return {'text': prompt}
    label = "Real disaster" if example['target'] == 1 else "Not a real disaster"
    return {'text': f"{prompt} {label}"}
# Training rows get the gold label appended; validation rows get an open
# "Label:" prompt for the model to complete.
ds["train"] = ds["train"].map(lambda example: format_text(example, train=True))
ds["test"] = ds["test"].map(lambda example: format_text(example, train=False))

#tokenize the data
# No padding/truncation here; batches are padded by the collator below.
def tok_func(x): return tokenizer(x["text"])
tok_ds = ds.map(tok_func)
# Keep only the tokenizer outputs; every original column is dropped.
tok_ds = tok_ds.remove_columns(ds['train'].column_names)

#set params
batch_size = 8
learning_rate=3e-4#2e-5
num_train_epochs=5
weight_decay=0.01

#set up the data collator and loaders
# mlm=False selects causal (next-token) language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
train_dataloader = DataLoader(tok_ds['train'], shuffle=True, collate_fn=data_collator, batch_size=batch_size, pin_memory=True)
eval_dataloader = DataLoader(tok_ds['test'], collate_fn=data_collator, batch_size=batch_size, pin_memory=True)

#load optimizer and lr_scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
# Linear decay over all optimizer steps, with no warmup.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_train_epochs),
)

#do training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference! 
for epoch in range(0,num_train_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # Detach before accumulating so the running total keeps no graph.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
    # Mean per-batch loss for the epoch; perplexity is its exponential.
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"epoch: {epoch}, train_ppl: {train_ppl}, train_epoch_loss: {train_epoch_loss}")

#do eval
# Re-enable the cache that was switched off for training.
model.config.use_cache = True 
def label_match(input_str):
    """Extract the predicted class from decoded ``Tweet: ... Label: ...`` text.

    The first ``label: <class>`` occurrence decides the result, so any further
    labels the model may go on to generate after its answer are ignored.
    Matching is case-insensitive and tolerant of whitespace around the colon.

    Args:
        input_str: Decoded prompt plus generated continuation.

    Returns:
        0 for "not a real disaster", 1 for "real disaster", or ``None`` when
        no recognizable label follows a ``label:`` marker.
    """
    match = re.search(
        r"\blabel\s*:\s*(not a real disaster|real disaster)",
        input_str.lower(),
    )
    if match is None:
        return None
    return 0 if match.group(1) == "not a real disaster" else 1

###run it on validation set
def _generate_labels(dataloader):
    """Generate a continuation for every prompt in ``dataloader`` and parse it.

    Returns a list with one entry per prompt: 0, 1, or ``None`` as produced
    by ``label_match`` on the decoded prompt-plus-continuation text.
    """
    parsed = []
    for batch in tqdm(dataloader):
        # Only the prompt tensors are needed for generation; the collator's
        # language-modeling labels are not forwarded.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                pad_token_id=tokenizer.pad_token_id,
                max_new_tokens=20,
            )

        outtexts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        parsed.extend(label_match(outtext) for outtext in outtexts)
    return parsed


model.eval()
decoded_outputs = _generate_labels(eval_dataloader)

# prepare the data as a pandas DataFrame and output to csv
data = {"id": ds["test"]["id"], "input_text": ds["test"]["text"], "predicted_label": decoded_outputs, "true_label": ds["test"]["target"]}
output_df = pd.DataFrame(data)
output_df.to_csv(model_name.rsplit('/', 1)[-1]+'_aug_finetuned_cln_eval_short.csv', index=False)

###now do test
test_df = load_data(sep,aug=True,dset='test')
ds = Dataset.from_pandas(test_df)
# Test rows carry no target, so they always get the open "Label:" prompt.
ds = ds.map(lambda example: format_text(example, train=False))
tok_ds = ds.map(tok_func, batched=True, remove_columns=('keyword','id','location','text'))
test_dataloader = DataLoader(tok_ds, collate_fn=data_collator, batch_size=batch_size, pin_memory=True)
decoded_outputs = _generate_labels(test_dataloader)

data = {"id": ds["id"], "input_text": ds["text"], "predicted_label": decoded_outputs}
output_df = pd.DataFrame(data)
output_df.to_csv(model_name.rsplit('/', 1)[-1]+'_aug_finetuned_cln_test.csv', index=False)

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using sep_token, but it is not set yet.
Using pad_token, but it is not set yet.


  0%|          | 0/6593 [00:00<?, ?ex/s]

  0%|          | 0/347 [00:00<?, ?ex/s]

  0%|          | 0/6593 [00:00<?, ?ex/s]

  0%|          | 0/347 [00:00<?, ?ex/s]

  0%|          | 0/825 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 825/825 [00:49<00:00, 16.82it/s]


epoch: 0, train_ppl: 33.99455261230469, train_epoch_loss: 3.526200294494629


100%|██████████| 825/825 [00:48<00:00, 16.84it/s]


epoch: 1, train_ppl: 12.650788307189941, train_epoch_loss: 2.537719488143921


100%|██████████| 825/825 [00:49<00:00, 16.82it/s]


epoch: 2, train_ppl: 8.9413423538208, train_epoch_loss: 2.190685749053955


100%|██████████| 825/825 [00:48<00:00, 16.84it/s]


epoch: 3, train_ppl: 6.6375017166137695, train_epoch_loss: 1.8927356004714966


100%|██████████| 825/825 [00:49<00:00, 16.83it/s]


epoch: 4, train_ppl: 5.212894916534424, train_epoch_loss: 1.6511353254318237


100%|██████████| 44/44 [00:08<00:00,  5.07it/s]


  0%|          | 0/3263 [00:00<?, ?ex/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

100%|██████████| 408/408 [00:58<00:00,  7.01it/s]


In [2]:
#Embeddings + logreg classifier 
def get_model_embeddings(texts, model, tokenizer, batch_size=8, encoder=False):
    """Embed ``texts`` with ``model`` and return one (mean, anchor) pair per text.

    Padding positions are excluded: the mean is taken over real tokens only,
    and the anchor token is located through the attention mask, so the result
    does not depend on the padding side or on how much padding a row carries.

    Args:
        texts: List of input strings.
        model: Model whose output exposes ``last_hidden_state``.
        tokenizer: Matching tokenizer; if it has no pad token, its EOS token
            is assigned as pad token (the tokenizer object is modified).
        batch_size: Number of texts per forward pass.
        encoder: If true the anchor is the first real token (the CLS position
            for encoder models); otherwise it is the last real token.

    Returns:
        List of ``(mean_embedding, anchor_embedding)`` tuples of numpy arrays,
        in the order of ``texts``.
    """
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    dataset = torch.utils.data.TensorDataset(inputs['input_ids'], inputs['attention_mask'])
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # Put the model in eval mode

    embeddings = []
    with torch.no_grad():
        for i, (input_ids, attention_mask) in enumerate(data_loader):
            print("i=",i)
            # Feed our sequences to the model
            outputs = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
            hidden = outputs.last_hidden_state
            attention_mask = attention_mask.to(hidden.device)

            # Mean over real tokens only; accumulate in float32 so that
            # half-precision hidden states cannot overflow the sum.
            weights = attention_mask.unsqueeze(-1).to(torch.float32)
            token_counts = weights.sum(dim=1).clamp(min=1.0)
            mean_embeddings = ((hidden.float() * weights).sum(dim=1) / token_counts).to(hidden.dtype)

            # argmax returns the first maximal index, i.e. the first 1 in the
            # mask; on the flipped mask that locates the last 1.
            if encoder:
                anchor = attention_mask.argmax(dim=1)
            else:
                anchor = attention_mask.size(1) - 1 - attention_mask.flip(dims=[1]).argmax(dim=1)
            rows = torch.arange(hidden.size(0), device=hidden.device)
            cls_embeddings = hidden[rows, anchor]

            # Store both mean and anchor-token embeddings
            embeddings.extend(zip(mean_embeddings.cpu().numpy(), cls_embeddings.cpu().numpy()))

    return embeddings

def ClassifyEmbeddings(targets, embeddings_train, embeddings_eval, embeddings_test=None):
    """Fit logistic regression on training embeddings and label the other splits.

    Args:
        targets: Class labels aligned with ``embeddings_train``.
        embeddings_train: Feature vectors used to fit the classifier.
        embeddings_eval: Feature vectors of the validation split.
        embeddings_test: Optional feature vectors of the test split.

    Returns:
        ``(y_val_pred, y_test_pred)``; the second item is ``None`` when no
        test embeddings are supplied.
    """
    from sklearn.linear_model import LogisticRegression

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(embeddings_train, np.array(targets))

    eval_predictions = classifier.predict(embeddings_eval)
    test_predictions = None if embeddings_test is None else classifier.predict(embeddings_test)
    return eval_predictions, test_predictions

#load and split the data
# Augmented train data (95/5 split, fixed seed) and the augmented test data.
train_df = load_data(aug=True,dset='train')
ds = Dataset.from_pandas(train_df)
ds = ds.train_test_split(test_size=0.05, seed=42)
test_df = load_data(aug=True,dset='test')
test_ds = Dataset.from_pandas(test_df)

# --- Experiment 1: frozen roberta-large (encoder) embeddings + logistic regression ---
model_name="roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
embeddings = {}
embeddings['train'] = get_model_embeddings(ds['train']['text'], model, tokenizer, batch_size=16, encoder=True)
embeddings['eval'] = get_model_embeddings(ds['test']['text'], model, tokenizer, batch_size=16, encoder=True)
embeddings['test'] = get_model_embeddings(test_ds['text'], model, tokenizer, batch_size=16, encoder=True)
# Each entry is a (mean, cls) pair; unzip into two parallel tuples per split.
mean_embeddings_train, cls_embeddings_train = zip(*embeddings['train'])
mean_embeddings_eval, cls_embeddings_eval = zip(*embeddings['eval'])
mean_embeddings_test, cls_embeddings_test = zip(*embeddings['test'])
# Only the cls_* vectors feed the classifier; the mean_* vectors are unused here.
y_val_pred, y_test_pred = ClassifyEmbeddings(ds["train"]["target"], cls_embeddings_train, cls_embeddings_eval, cls_embeddings_test)
output_df = pd.DataFrame({'id': ds['test']['id'],'input_text': ds['test']["text"],'predicted_label': y_val_pred,
        'true_label': ds["test"]["target"]})
output_df.to_csv(model_name.rsplit('/', 1)[-1]+"_embeddings_aug_cls_logreg_cln_eval_short.csv", index=False)
output_df = pd.DataFrame({'id': test_ds['id'],'input_text': test_ds["text"],'predicted_label': y_test_pred})
output_df.to_csv(model_name.rsplit('/', 1)[-1]+"_embeddings_aug_cls_logreg_cln_test.csv", index=False)

# Drop the references to the first model and tokenizer, then collect garbage
# and empty the CUDA cache before the next model is loaded.
model = None
tokenizer=None
gc.collect()
torch.cuda.empty_cache()

# --- Experiment 2: frozen gpt2-xl (decoder) embeddings + logistic regression ---
model_name="gpt2-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Loaded in float16 with automatic device placement.
model = AutoModel.from_pretrained(model_name, device_map='auto', torch_dtype=torch.float16)
embeddings = {}
embeddings['train'] = get_model_embeddings(ds['train']['text'], model, tokenizer, batch_size=16, encoder=False)
embeddings['eval'] = get_model_embeddings(ds['test']['text'], model, tokenizer, batch_size=16, encoder=False)
embeddings['test'] = get_model_embeddings(test_ds['text'], model, tokenizer, batch_size=16, encoder=False)
mean_embeddings_train, cls_embeddings_train = zip(*embeddings['train'])
mean_embeddings_eval, cls_embeddings_eval = zip(*embeddings['eval'])
mean_embeddings_test, cls_embeddings_test = zip(*embeddings['test'])
# With encoder=False the second vector of each pair is taken from the end of
# the sequence rather than the start (hence "last" in the file names below).
y_val_pred, y_test_pred = ClassifyEmbeddings(ds["train"]["target"], cls_embeddings_train, cls_embeddings_eval, cls_embeddings_test)
output_df = pd.DataFrame({'id': ds['test']['id'],'input_text': ds['test']["text"],'predicted_label': y_val_pred,
        'true_label': ds["test"]["target"]})
output_df.to_csv(model_name.rsplit('/', 1)[-1]+"_embeddings_aug_last_logreg_cln_eval_short.csv", index=False)
output_df = pd.DataFrame({'id': test_ds['id'],'input_text': test_ds["text"],'predicted_label': y_test_pred})
output_df.to_csv(model_name.rsplit('/', 1)[-1]+"_embeddings_aug_last_logreg_cln_test.csv", index=False)



Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


i= 0
i= 1
i= 2
i= 3
i= 4
i= 5
i= 6
i= 7
i= 8
i= 9
i= 10
i= 11
i= 12
i= 13
i= 14
i= 15
i= 16
i= 17
i= 18
i= 19
i= 20
i= 21
i= 22
i= 23
i= 24
i= 25
i= 26
i= 27
i= 28
i= 29
i= 30
i= 31
i= 32
i= 33
i= 34
i= 35
i= 36
i= 37
i= 38
i= 39
i= 40
i= 41
i= 42
i= 43
i= 44
i= 45
i= 46
i= 47
i= 48
i= 49
i= 50
i= 51
i= 52
i= 53
i= 54
i= 55
i= 56
i= 57
i= 58
i= 59
i= 60
i= 61
i= 62
i= 63
i= 64
i= 65
i= 66
i= 67
i= 68
i= 69
i= 70
i= 71
i= 72
i= 73
i= 74
i= 75
i= 76
i= 77
i= 78
i= 79
i= 80
i= 81
i= 82
i= 83
i= 84
i= 85
i= 86
i= 87
i= 88
i= 89
i= 90
i= 91
i= 92
i= 93
i= 94
i= 95
i= 96
i= 97
i= 98
i= 99
i= 100
i= 101
i= 102
i= 103
i= 104
i= 105
i= 106
i= 107
i= 108
i= 109
i= 110
i= 111
i= 112
i= 113
i= 114
i= 115
i= 116
i= 117
i= 118
i= 119
i= 120
i= 121
i= 122
i= 123
i= 124
i= 125
i= 126
i= 127
i= 128
i= 129
i= 130
i= 131
i= 132
i= 133
i= 134
i= 135
i= 136
i= 137
i= 138
i= 139
i= 140
i= 141
i= 142
i= 143
i= 144
i= 145
i= 146
i= 147
i= 148
i= 149
i= 150
i= 151
i= 152
i= 153
i= 154
i= 155
i= 156
i= 157
i= 1

Downloading (…)lve/main/config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


i= 0
i= 1
i= 2
i= 3
i= 4
i= 5
i= 6
i= 7
i= 8
i= 9
i= 10
i= 11
i= 12
i= 13
i= 14
i= 15
i= 16
i= 17
i= 18
i= 19
i= 20
i= 21
i= 22
i= 23
i= 24
i= 25
i= 26
i= 27
i= 28
i= 29
i= 30
i= 31
i= 32
i= 33
i= 34
i= 35
i= 36
i= 37
i= 38
i= 39
i= 40
i= 41
i= 42
i= 43
i= 44
i= 45
i= 46
i= 47
i= 48
i= 49
i= 50
i= 51
i= 52
i= 53
i= 54
i= 55
i= 56
i= 57
i= 58
i= 59
i= 60
i= 61
i= 62
i= 63
i= 64
i= 65
i= 66
i= 67
i= 68
i= 69
i= 70
i= 71
i= 72
i= 73
i= 74
i= 75
i= 76
i= 77
i= 78
i= 79
i= 80
i= 81
i= 82
i= 83
i= 84
i= 85
i= 86
i= 87
i= 88
i= 89
i= 90
i= 91
i= 92
i= 93
i= 94
i= 95
i= 96
i= 97
i= 98
i= 99
i= 100
i= 101
i= 102
i= 103
i= 104
i= 105
i= 106
i= 107
i= 108
i= 109
i= 110
i= 111
i= 112
i= 113
i= 114
i= 115
i= 116
i= 117
i= 118
i= 119
i= 120
i= 121
i= 122
i= 123
i= 124
i= 125
i= 126
i= 127
i= 128
i= 129
i= 130
i= 131
i= 132
i= 133
i= 134
i= 135
i= 136
i= 137
i= 138
i= 139
i= 140
i= 141
i= 142
i= 143
i= 144
i= 145
i= 146
i= 147
i= 148
i= 149
i= 150
i= 151
i= 152
i= 153
i= 154
i= 155
i= 156
i= 157
i= 1

In [None]:
#finetune encoder-decoder with lora - SEQ_2_SEQ_LM
!pip install peft
# Base checkpoint for LoRA fine-tuning; the trailing comment lists other checkpoints tried.
model_name="google/flan-t5-large"#"distilbert-base-uncased"#"cardiffnlp/twitter-roberta-base"#"roberta-base"#"bigscience/bloom-560m"#"roberta-large"#
# Identifier built from the checkpoint's short name; not used further in this cell.
peft_model_id = "cackerman/"+model_name.rsplit('/', 1)[-1]+"_aug_LORA_SEQ_2_SEQ_LM"
from peft import get_peft_model, LoraConfig, TaskType, get_peft_config, get_peft_model_state_dict
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM, default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
import os

# LoRA adapter settings for a seq2seq LM: rank 8, alpha 32, dropout 0.1,
# applied to the modules named "q" and "v".
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, #SEQ_CLS
                         lora_dropout=0.1,target_modules=["q", "v"])#for t5#)#, target_modules=["q_lin", "v_lin"])#for distilbert (https://github.com/huggingface/peft/blob/main/src/peft/utils/other.py#L202)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)#, device_map='auto', torch_dtype=torch.float16)
# Wrap the base model with the LoRA adapter described by peft_config.
model = get_peft_model(model, peft_config)
###model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the EOS token wherever the tokenizer lacks a separator or pad token.
if tokenizer.sep_token is None:
    tokenizer.sep_token = tokenizer.eos_token
    # resize model embedding to match new tokenizer
    model.resize_token_embeddings(len(tokenizer))
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id
    # resize model embedding to match new tokenizer
    model.resize_token_embeddings(len(tokenizer))
# Only relevant when a GPT-style checkpoint is selected above.
if "gpt" in model_name:
    tokenizer.padding_side = 'left'

# Column names and hyperparameters used by the preprocessing and training code.
text_column = "text"
label_column = "text_label"
max_length = 128#64
lr = 3e-4
num_epochs = 3
batch_size = 8

# Positional arguments are (sep, aug, dset).
train_df = load_data(tokenizer.sep_token,True,'train')
ds = Dataset.from_pandas(train_df)
ds = ds.train_test_split(test_size=0.05, seed=42)
# Index in this list equals the numeric target (0 / 1).
classes = ['Not a real disaster','Real disaster']
# Add a 'text_label' column holding the label text for each numeric target.
ds = ds.map(
    lambda x: {"text_label": [classes[label] for label in x["target"]]},
    batched=True,
    num_proc=1,
)
# Token length of the longest class name; used as the label padding length.
target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes])

def preprocess_function(examples):
    """Tokenize one batch of inputs and label texts for seq2seq training.

    Inputs are padded/truncated to ``max_length`` and labels to
    ``target_max_length``. Pad positions in the labels are replaced by -100.
    All tensors are converted to numpy arrays before being returned.
    """
    encoded = tokenizer(
        examples[text_column],
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    target_ids = tokenizer(
        examples[label_column],
        max_length=target_max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]
    target_ids[target_ids == tokenizer.pad_token_id] = -100

    # Snapshot the key list first, then swap each tensor for a numpy array.
    for name in list(encoded.keys()):
        encoded[name] = encoded[name].numpy()
    encoded["labels"] = target_ids.numpy()
    return encoded

# Tokenize both splits; all original columns are dropped so only the
# tokenizer outputs and 'labels' remain.
processed_datasets = ds.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=ds["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["test"]

train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

###checkpoint = torch.load(save_path)
###model.load_state_dict(checkpoint["model_state_dict"])
# NOTE(review): if print_trainable_parameters() prints its summary itself,
# the outer print() only adds its return value to the output -- confirm.
print(model.print_trainable_parameters())

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Linear decay over all optimizer steps, with no warmup.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

for epoch in range(0,num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
#        if epoch==4 and step <= 704:
#          continue
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # Detach before accumulating so the running total keeps no graph.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        # save model every save_steps
#        if step % save_steps == 0 and step > 0:
#          torch.save({
#              "model_state_dict": model.state_dict(),
#              "optimizer_state_dict": optimizer.state_dict(),
#              "scheduler_state_dict": lr_scheduler.state_dict()
#          }, save_path)

    # Per-epoch evaluation. eval_preds is rebuilt every epoch, so after the
    # loop it holds the predictions of the final epoch only.
    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Predictions are the argmax over the logits of this forward pass
        # (the batch includes the labels), not the result of generate().
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Mean per-batch losses for the epoch; perplexity is the exponential.
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"epoch: {epoch}, train_ppl: {train_ppl}, train_epoch_loss: {train_epoch_loss}, eval_ppl: {eval_ppl}, eval_epoch_loss: {eval_epoch_loss}")

# Exact-match accuracy (after stripping whitespace) of the last epoch's
# decoded predictions against the label texts of the validation split.
correct = 0
total = 0
for pred, true in zip(eval_preds, ds["test"]["text_label"]):
    if pred.strip() == true.strip():
        correct += 1
    total += 1
accuracy = correct / total * 100
print(f"{accuracy} % on the evaluation dataset")
print(f"{eval_preds[:10]}")
print(f"{ds['test']['text_label'][:10]}")

import re

###run on eval set
decoded_outputs = []
def label_mapping(input_str):
    """Map generated label text to its class id.

    Surrounding whitespace and letter case are ignored, so near-exact decodes
    such as ``" Real disaster"`` still map to a class.

    Args:
        input_str: Decoded model output.

    Returns:
        0 for "Not a real disaster", 1 for "Real disaster", ``None`` for
        anything else (including non-string input).
    """
    if not isinstance(input_str, str):
        return None
    class_ids = {"not a real disaster": 0, "real disaster": 1}
    return class_ids.get(input_str.strip().lower())
def _generate_label_ids(dataloader):
    """Generate label text for every batch of ``dataloader`` and map it to 0/1/None.

    The inputs and outputs of the first batch are printed as a sanity check.
    """
    label_ids = []
    for i, batch in enumerate(dataloader):
        input_ids = batch['input_ids'].to(device)
        # Pass the mask explicitly so padded positions are ignored without
        # relying on generate() inferring it from the pad token id.
        attention_mask = batch['attention_mask'].to(device)
        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=10)
        outtext = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
        if i == 0:
            print("inputs=",tokenizer.batch_decode(input_ids.detach().cpu().numpy(), skip_special_tokens=True))
            print("outtext=",outtext)
        label_ids.extend(label_mapping(text) for text in outtext)
    return label_ids


model.eval()
decoded_outputs = _generate_label_ids(eval_dataloader)

targets = ds["test"]["target"]
inputs = ds["test"]["text"]

data = {"id": ds["test"]["id"], "input_text": inputs, "predicted_label": decoded_outputs, "true_label": targets}
output_df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
output_df.to_csv(model_name.rsplit('/', 1)[-1]+'_aug_finetunedlora_classhead_cln_eval_short.csv', index=False)


###now do test set
test_df = load_data(tokenizer.sep_token,True,'test')
test_ds = Dataset.from_pandas(test_df)
def tok_func(x):
    """Tokenize a batch of test texts to fixed-length plain lists.

    No ``return_tensors`` here: a per-example tensor would carry a leading
    batch axis of size 1 into the stored rows and make the collated
    ``input_ids`` three-dimensional.
    """
    return tokenizer(x["text"], max_length=max_length, padding="max_length", truncation=True)
tok_ds = test_ds.map(tok_func, batched=True)
tok_ds = tok_ds.remove_columns(test_ds.column_names)
test_dataloader = DataLoader(tok_ds, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

decoded_outputs = _generate_label_ids(test_dataloader)

data = {"id": test_ds["id"], "input_text": test_ds['text'], "predicted_label": decoded_outputs}
output_df = pd.DataFrame(data)
output_df.to_csv(model_name.rsplit('/', 1)[-1]+'_aug_finetunedlora_classhead_cln_test.csv', index=False)
