In [None]:
# Deletes everything inside the `.Trash-1000` directory under /home/ec2-user/SageMaker.
# NOTE(review): destructive and tied to an absolute, instance-specific path;
# presumably run by hand to reclaim disk space — confirm before re-running.
!rm -rf /home/ec2-user/SageMaker/.Trash-1000/*

In [1]:
import pandas as pd
from pandarallel import pandarallel
import multiprocessing
import json
import random
from transformers import AutoTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset



import os, torch, random
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_from_disk

from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split


# Use all but one CPU core for pandarallel, with a floor of one worker.
MAX_CPUs = max(multiprocessing.cpu_count() - 1, 1)
pandarallel.initialize(nb_workers=MAX_CPUs, progress_bar=False)

# Tokenizer sequence budget: 99th percentile with prompt (1656 for CAUSAL).
config = dict(max_length=1488)

# Zero-based class id (0-4) <-> rubric score (1-5).
id2label = {class_id: class_id + 1 for class_id in range(5)}
label2id = {score: class_id for class_id, score in reversed(list(id2label.items()))}

SEED = 13

def seed_everything(seed=13):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and every CUDA device), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to every generator. Defaults to 13.
    """
    random.seed(seed)
    # Only affects interpreters launched after this point (e.g. subprocesses);
    # the hash seed of the already-running kernel is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not just the current one; silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Disable cuDNN auto-tuning explicitly so algorithm selection cannot
    # vary between runs if it was enabled elsewhere.
    torch.backends.cudnn.benchmark = False
    
# Apply the notebook-wide seed defined in the configuration above.
seed_everything(seed=SEED)

  from .autonotebook import tqdm as notebook_tqdm


INFO: Pandarallel will run on 191 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Load the slow (non-fast) tokenizer for the Phi-3 mini instruct checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    use_fast=False,
)

# Fall back to the end-of-sequence token for padding when the checkpoint
# does not define a pad token of its own.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Read the source parquet file and preview one random row.
parquet_path = "../prometheus_with_contexts.parquet"
data = pd.read_parquet(parquet_path)
data.sample(n=1)

Unnamed: 0,instruction,output,orig_instruction,orig_criteria,orig_score1_description,orig_score2_description,orig_score3_description,orig_score4_description,orig_score5_description,orig_response,orig_reference_answer,orig_feedback,orig_score,input,context_inst,context_resp,context_inst_resp,split
30609,###Task Description:\nAn instruction (might in...,The response demonstrates a general understand...,I'm working on a project that involves creatin...,How well does the model handle ambiguity and v...,The model cannot handle ambiguous or vague inp...,The model struggles with ambiguous or vague in...,The model generally handles ambiguous or vague...,The model handles ambiguous or vague inputs we...,The model expertly handles ambiguous or vague ...,"To handle ambiguous or vague inputs, the chatb...","To handle ambiguous or vague inputs, the chatb...",The response demonstrates a general understand...,3,,[Chatbots can be used to provide users with a ...,"[To minimize the need for human feedback, a he...","[Unlike word processors, NLP considers the str...",Test


In [4]:
def create_text(row, is_test = False):
    """Convert a row's ``instruction`` prompt into the chat-tagged format.

    The template markers are rewritten as follows:

    * the leading ``###Task Description:`` header becomes ``<|system|>``;
    * the first ``###The`` section closes the system turn and opens
      ``<|user|>``;
    * the trailing ``###Feedback: `` marker closes the user turn and opens
      ``<|assistant|>``.

    Each marker is replaced only at its template position (first occurrence
    for the two headers, last occurrence for the feedback marker) so that
    identical text appearing inside the user-supplied content does not get
    extra role tokens injected into it.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing an
            ``"instruction"`` string.
        is_test: Unused; kept for backward compatibility with callers.

    Returns:
        The reformatted prompt string. Markers that are absent are skipped.
    """
    inst = row["instruction"]
    inst = inst.replace("###Task Description:\n", "<|system|>\n", 1)
    inst = inst.replace("\n\n###The", "<|end|>\n\n<|user|>\n###The", 1)

    # The template's feedback marker is the last one in the prompt; leave any
    # earlier look-alikes inside the content untouched.
    head, marker, tail = inst.rpartition("\n\n###Feedback: ")
    if marker:
        inst = head + "<|end|>\n\n<|assistant|>" + tail

    return inst
 
    
def generate_and_tokenize_prompt(prompt):
    """Tokenize ``prompt`` to a fixed length with the module-level tokenizer.

    The text is truncated and padded to ``config["max_length"]`` tokens.

    Args:
        prompt: Text passed straight to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for ``prompt``.
    """
    tokenizer_kwargs = {
        "truncation": True,
        "max_length": config["max_length"],
        "padding": "max_length",
    }
    return tokenizer(prompt, **tokenizer_kwargs)

In [5]:
# Build the chat-formatted prompt for every row (row-wise, hence parallel).
data["text"] = data.parallel_apply(create_text, axis = 1)

# Shift the 1-5 rubric scores to zero-based class ids. A vectorized cast is
# used instead of a parallel apply: spawning workers for a subtraction costs
# far more than the work itself.
data["label"] = data["orig_score"].astype(int) - 1
assert data["label"].isin(list(id2label)).all(), "label outside the id2label range"

# Print (rather than display) so the newlines in the prompt are rendered.
print(data["text"].sample(1).values[0])

<|system|>
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
4. Please do not generate any other opening, closing, and explanations.<|end|>

<|user|>
###The instruction to evaluate:
I am working on a research project that involves predicting the impacts of climate change on the agricultural sector. I have data from various sources including meteorological observations, historical crop yields, farmer interviews, and satellite imagery. I am struggling to integrate this divers

In [6]:
# Partition rows by the pre-assigned "split" column.
is_train = data["split"] == "Train"
is_val = data["split"] == "Test"
train_df = data[is_train]
val_df = data[is_val]

# Balanced per-class weights: n_samples / (class_count * n_classes).
class_counts = train_df["label"].value_counts().sort_index()
class_weights = len(train_df) / (class_counts * train_df["label"].nunique())
print(class_weights, "\n") # Weights for Loss balance

# Spot-check the score -> label mapping on a few random rows.
train_df[["orig_score", "label"]].sample(10)

label
0    1.000042
1    0.998937
2    1.003000
3    1.001149
4    0.996892
Name: count, dtype: float64 



Unnamed: 0,orig_score,label
7940,5,4
63636,2,1
24096,3,2
9014,2,1
81223,4,3
25796,2,1
91060,4,3
28140,1,0
64938,1,0
32704,5,4


In [7]:
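# Kept for reference (disabled): token-length quantiles of the formatted prompts,
# presumably the source of the percentile noted next to config["max_length"].
# leng = data["text"].parallel_apply(lambda x : len(tokenizer(x).input_ids))
# np.quantile(leng, 0.975), np.quantile(leng, 0.99), np.quantile(leng, 0.999)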

In [8]:
def _frame_to_dataset(frame):
    """Wrap the ``text`` and ``label`` columns of ``frame`` in a ``Dataset``."""
    columns = {"text": frame["text"].values, "label": frame["label"].values}
    return Dataset.from_dict(columns)


train_dataset = _frame_to_dataset(train_df)
val_dataset = _frame_to_dataset(val_df)

# Identical tokenization settings for both splits.
map_kwargs = {"input_columns": "text", "num_proc": 24}
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt, **map_kwargs)
tokenized_val_dataset = val_dataset.map(generate_and_tokenize_prompt, **map_kwargs)

# Persist the tokenized splits to disk.
tokenized_train_dataset.save_to_disk("./tokenized_train_dataset")
tokenized_val_dataset.save_to_disk("./tokenized_val_dataset")

Map (num_proc=24): 100%|██████████| 94954/94954 [01:34<00:00, 1007.53 examples/s]
Map (num_proc=24): 100%|██████████| 4998/4998 [00:07<00:00, 678.86 examples/s]
Saving the dataset (3/3 shards): 100%|██████████| 94954/94954 [00:11<00:00, 8622.21 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 4998/4998 [00:00<00:00, 20809.68 examples/s]
