## NOTE

The structure of this notebook is nearly identical to the notebook where I fine-tuned the model on persuasive essays. Please look at that notebook for quite detailed commentary. I add further commentary here sparingly.

In [None]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install transformers
!pip install accelerate
!pip install peft
!pip install datasets
!pip install bitsandbytes



In [None]:
import torch
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          BitsAndBytesConfig,
                          Trainer,
                          TrainingArguments)
from datasets import load_dataset
from peft import (LoraConfig,
                  PeftConfig,
                  PeftModel,
                  get_peft_model,
                  prepare_model_for_kbit_training)

In [None]:
base_model = "meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= 'nf4',
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= True,
)

In [1]:
# enable AWS functionalities
!pip install boto3
!pip install s3fs


Collecting boto3
  Downloading boto3-1.34.102-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.35.0,>=1.34.102 (from boto3)
  Downloading botocore-1.34.102-py3-none-any.whl (12.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Downloading s3transfer-0.10.1-py3-none-any.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.2/82.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.34.102 botocore-1.34.102 jmespath-1.0.1 s3transfer-0.10.1
Collecting s3fs
  Downloading s3fs-2024.3.1-py3-none-any.whl (29 kB)
Collect

In [2]:
import pandas as pd
train_source_dependent_model_input = pd.read_csv('s3://698modeldata/train_source_dependent_model_input.csv')
eval_source_dependent_model_input = pd.read_csv('s3://698modeldata/eval_source_dependent_model_input.csv')
test_source_dependent_model_input = pd.read_csv('s3://698modeldata/test_source_dependent_model_input.csv')


In [None]:
train_source_dependent_model_input['target_score'].describe() # as I recalled, range is 0 to 4, so we need 5 labels

count    9944.000000
mean        1.956255
std         1.019964
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max         4.000000
Name: target_score, dtype: float64

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
        base_model,
        num_labels=5,
        quantization_config=bnb_config,
        device_map='auto',
        trust_remote_code=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=2,
    bias='lora_only',
    task_type='SEQ_CLS',
    target_modules=['q_proj', 'v_proj', 'k_proj']
)

model = get_peft_model(model, peft_config)


In [None]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
import torch
from torch.utils.data import Dataset
import torch.nn.functional as F

class CustomDataset(Dataset):
    def __init__(self, df, tokenizer, feature_columns, max_length=2000, max_score=5):  # max_length 2000
        self.tokenizer = tokenizer
        self.data = df
        self.feature_columns = feature_columns
        self.max_length = max_length

        # Truncate from the beginning if necessary (less critical)
        self.encodings = tokenizer(list(df['final_input']), truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt", truncation_strategy='only_first')

        self.features = torch.tensor(df[self.feature_columns].values, dtype=torch.float32)

        self.labels = torch.tensor(df['target_score'].tolist(), dtype=torch.long)
        self.labels = F.one_hot(self.labels, num_classes=max_score).float()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['features'] = self.features[idx]
        item['labels'] = self.labels[idx]
        return item

# List of encoded
feature_columns = ['score_type_rater1_domain1', 'score_type_rater2_domain1']


In [None]:
test_source_dependent_model_input = test_source_dependent_model_input.dropna(subset=['target_score']) #it's only 2 rows


In [None]:
from torch.utils.data import DataLoader

# instances of the dataset for training, evaluation, and testing
train_dataset = CustomDataset(train_source_dependent_model_input, tokenizer, feature_columns)
eval_dataset = CustomDataset(eval_source_dependent_model_input, tokenizer, feature_columns)
test_dataset = CustomDataset(test_source_dependent_model_input, tokenizer, feature_columns)

# DataLoaders for each dataset
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, pin_memory=True)
eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False, pin_memory=True)

test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)





In [None]:
model_path = '/content/drive/MyDrive/DATA698/models/best_sd_model'


In [None]:
from transformers import Trainer, TrainingArguments

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model.config.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

# Same arguments as persuasive model - staying consistent

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    report_to="none",
    fp16=True,  # mixed precision
)

# Initialize  Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Train
trainer.train()

# Save model + tokenizer
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
0,0.3646,0.339361
2,0.2867,0.300355




Epoch,Training Loss,Validation Loss
0,0.3646,0.339361
2,0.2338,0.284898




('/content/drive/MyDrive/DATA698/models/best_sd_model/tokenizer_config.json',
 '/content/drive/MyDrive/DATA698/models/best_sd_model/special_tokens_map.json',
 '/content/drive/MyDrive/DATA698/models/best_sd_model/tokenizer.json')

In [None]:
class InferenceDataset(Dataset):
    def __init__(self, df, tokenizer, feature_columns, max_length=2000):
        self.tokenizer = tokenizer
        self.data = df
        self.feature_columns = feature_columns
        self.max_length = max_length
        self.essay_ids = df['essay_id'].tolist()

        # Tokenization
        self.encodings = tokenizer(list(df['final_input']), truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt", truncation_strategy='only_first')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['essay_id'] = self.essay_ids[idx]
        # Determine  active feature
        active_feature = next((f for f in self.feature_columns if self.data.iloc[idx][f] == 1), None)
        item['feature'] = active_feature
        return item

test_dataset = InferenceDataset(df=test_source_dependent_model_input, tokenizer=tokenizer, feature_columns=feature_columns)
test_item = test_dataset[0]
print(test_item.keys())  # Check keys again


dict_keys(['input_ids', 'attention_mask', 'essay_id', 'feature'])


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [None]:
def custom_collate(batch):
    collated_batch = {key: torch.stack([item[key] for item in batch]) for key in batch[0] if key in ['input_ids', 'attention_mask', 'labels']}
    # Non-tensor data
    for key in ['essay_id', 'feature']:
        collated_batch[key] = [item[key] for item in batch]
    return collated_batch

# Update DataLoader
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=custom_collate)


In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=2, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=2, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
           

In [None]:
# go to eval mode
model.eval()
predictions_info = []

with torch.no_grad():
    for batch in test_loader:
        inputs = batch['input_ids'].to(device)
        outputs = model(inputs)
        predictions = outputs.logits.argmax(dim=-1)

        for i in range(len(predictions)):
            pred_info = {
                'essay_id': batch['essay_id'][i],
                'feature': batch['feature'][i],
                'prediction': predictions[i].item()
            }
            predictions_info.append(pred_info)

#convert to df
import pandas as pd
sd_predictions_df = pd.DataFrame(predictions_info)
print(sd_predictions_df)



  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


      essay_id                    feature  prediction
0        15783  score_type_rater1_domain1           4
1        15783  score_type_rater2_domain1           4
2         9977  score_type_rater1_domain1           0
3         9977  score_type_rater2_domain1           0
4        13140  score_type_rater1_domain1           2
...        ...                        ...         ...
2125      7116  score_type_rater2_domain1           2
2126     14948  score_type_rater1_domain1           1
2127     14948  score_type_rater2_domain1           1
2128     10474  score_type_rater1_domain1           2
2129     10474  score_type_rater2_domain1           2

[2130 rows x 3 columns]


In [None]:
save_path = '/content/drive/MyDrive/DATA698/models/predictions/sd_predictions_df.csv'


In [None]:
sd_predictions_df.to_csv(save_path, index=False)


In [None]:
test_data = pd.read_csv('s3://698modeldata/test_source_dependent_model_input.csv')

In [None]:
def get_active_feature(row, feature_columns):
    for feature in feature_columns:
        if row[feature] == 1:
            return feature
    return None

test_data['feature'] = test_data.apply(lambda row: get_active_feature(row, feature_columns), axis=1)


In [None]:
merged_test = pd.merge(sd_predictions_df, test_data, on=['essay_id', 'feature'], how='left')


In [None]:
merged_test.columns

Index(['essay_id', 'feature', 'prediction', 'essay_set', 'essay',
       'rater1_domain1', 'rater2_domain1', 'domain1_score', 'grade_level',
       'dale_chall_score', 'complexity_difference', 'pos_proportions',
       'dependency_proportions', 'character_count_scaled', 'word_count_scaled',
       'error_count_scaled', 'error_to_word_ratio_scaled', 'target_score',
       'actual_criteria', 'actual_range', 'final_input',
       'score_type_rater1_domain1', 'score_type_rater2_domain1'],
      dtype='object')

In [None]:
final_df = merged_test.loc[:, ['essay_id', 'feature', 'prediction', 'target_score']]
final_df.head()

Unnamed: 0,essay_id,feature,prediction,target_score
0,15783,score_type_rater1_domain1,4,4.0
1,15783,score_type_rater2_domain1,4,4.0
2,9977,score_type_rater1_domain1,0,0.0
3,9977,score_type_rater2_domain1,0,0.0
4,13140,score_type_rater1_domain1,2,1.0


In [None]:
save_path = '/content/drive/MyDrive/DATA698/models/predictions/final_sd_predictions_df.csv'
final_df.to_csv(save_path, index=False)

