In [1]:
# Housekeeping: force-delete whatever the glob matches inside the SageMaker
# trash folder.
# NOTE(review): destructive and tied to an absolute, machine-specific path —
# confirm this is still wanted before re-running the notebook elsewhere.
!rm -rf /home/ec2-user/SageMaker/.Trash-1000/*

In [1]:
import pandas as pd
from pandarallel import pandarallel
import multiprocessing
import json
import random
from transformers import AutoTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset



import os, torch, random
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_from_disk

from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split


# Use all CPU cores but one, with a floor of a single worker.
MAX_CPUs = max(multiprocessing.cpu_count() - 1, 1)

# Start the pandarallel worker pool (progress bars disabled).
pandarallel.initialize(progress_bar=False, nb_workers=MAX_CPUs)

# Tokenization settings: the length every example is padded/truncated to.
config = dict(max_length=1656)

# Seed value for reproducibility.
SEED = 13

def seed_everything(seed=13):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, the
            ``PYTHONHASHSEED`` environment variable, NumPy's global RNG and
            PyTorch (CPU and every CUDA device).

    Returns:
        None. Global RNG state, the environment and the cuDNN flags are
        modified in place.
    """
    random.seed(seed)
    # Read by Python at interpreter start-up, so this affects processes
    # launched afterwards rather than hashing in the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not only the current device; this call is a
    # silent no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, so
    # pin it off explicitly alongside the deterministic flag.
    torch.backends.cudnn.benchmark = False
    
# Use the SEED constant from the configuration lines instead of repeating the literal.
seed_everything(seed=SEED)

  from .autonotebook import tqdm as notebook_tqdm


INFO: Pandarallel will run on 191 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
# Load the slow (non-fast) tokenizer for the Phi-3-mini instruct checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    use_fast=False,
)

# Fall back to the EOS token for padding when the checkpoint defines no pad token.
# NOTE(review): `pad_id` is not read anywhere in the visible notebook;
# presumably `pad_token_id` was intended — confirm before relying on it.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
def create_text(row, is_test = False):
    """Turn one record into a chat-formatted string with role markers.

    The ``###Task Description:``, ``###The ...`` and ``###Feedback: `` section
    headers inside ``row["instruction"]`` are rewritten, in that order, into
    ``<|system|>``, ``<|user|>`` and ``<|assistant|>`` turns separated by
    ``<|end|>``.

    Args:
        row: Mapping-like record with an ``"instruction"`` entry and, unless
            ``is_test`` is true, an ``"output"`` entry.
        is_test: When true, return only the prompt (no target, no EOS).

    Returns:
        The prompt alone for test rows; otherwise the prompt followed by the
        stripped ``row["output"]`` and the tokenizer's EOS token.
    """
    # Applied sequentially; the order matters because each replacement
    # operates on the result of the previous one.
    marker_swaps = (
        ("###Task Description:\n", "<|system|>\n"),
        ("\n\n###The", "<|end|>\n\n<|user|>\n###The"),
        ("\n\n###Feedback: ", "<|end|>\n\n<|assistant|>\n"),
    )

    prompt = row["instruction"]
    for old_marker, new_marker in marker_swaps:
        prompt = prompt.replace(old_marker, new_marker)

    if is_test:
        return prompt

    completion = row["output"].strip() + tokenizer.eos_token
    return prompt + completion
 
    
def generate_and_tokenize_prompt(prompt):
    """Tokenize ``prompt`` to a fixed length and attach causal-LM labels.

    The text is truncated/padded to ``config["max_length"]`` tokens. Labels
    mirror ``input_ids`` on real tokens and are set to ``-100`` on padding
    positions (where ``attention_mask`` is 0), so that padding is excluded
    from the language-modelling loss instead of being learned as a target.
    Masking is driven by the attention mask rather than the pad token id, so
    a genuine EOS token stays a training target even when pad == EOS.

    Args:
        prompt: A single string, or a list of strings when called from a
            batched ``map``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry shaped like
        ``"input_ids"``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=config["max_length"],
        padding="max_length",
        # Requested explicitly because the label masking below depends on it.
        return_attention_mask=True,
    )

    def _mask_padding(ids, mask):
        # -100 is the index ignored by the cross-entropy loss.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = result["input_ids"]
    attention_mask = result["attention_mask"]

    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one id list per example.
        result["labels"] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        result["labels"] = _mask_padding(input_ids, attention_mask)
    return result

In [13]:
# Read the source records from the parquet file one directory up.
data = pd.read_parquet("../prometheus_with_contexts.parquet")

# Build the chat-formatted string for every row (row-wise apply via pandarallel).
data["text"] = data.parallel_apply(create_text, axis=1)

# Spot-check a single randomly drawn example.
print(data["text"].sample(n=1).iloc[0])

<|system|>
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
4. Please do not generate any other opening, closing, and explanations.<|end|>

<|user|>
###The instruction to evaluate:
I am working on a research project that involves predicting the impacts of climate change on the agricultural sector. I have data from various sources including meteorological observations, historical crop yields, farmer interviews, and satellite imagery. I am struggling to integrate this divers

In [14]:
# Partition rows on the "split" column; the "Test" rows are used as the
# validation set in this notebook.
train_df = data.loc[data["split"] == "Train"]
val_df = data.loc[data["split"] == "Test"]

In [15]:
# Wrap the formatted text of each split in a `datasets.Dataset` with a single "text" column.
train_dataset = Dataset.from_dict(dict(text=train_df["text"].values))
val_dataset = Dataset.from_dict(dict(text=val_df["text"].values))

# Tokenize example by example (no `batched=True`), feeding only the "text"
# column to the function and spreading the work over 8 processes.
tokenized_train_dataset = train_dataset.map(
    generate_and_tokenize_prompt,
    input_columns="text",
    num_proc=8,
)
tokenized_val_dataset = val_dataset.map(
    generate_and_tokenize_prompt,
    input_columns="text",
    num_proc=8,
)

# Persist both tokenized splits next to the notebook.
tokenized_train_dataset.save_to_disk("./tokenized_train_dataset")
tokenized_val_dataset.save_to_disk("./tokenized_val_dataset")

Map (num_proc=8): 100%|██████████| 94954/94954 [02:02<00:00, 772.88 examples/s] 
Map (num_proc=8): 100%|██████████| 4998/4998 [00:07<00:00, 708.79 examples/s] 
Saving the dataset (6/6 shards): 100%|██████████| 94954/94954 [00:09<00:00, 10219.01 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4998/4998 [00:00<00:00, 9779.40 examples/s] 


In [16]:
# Sanity check: length of the first tokenized example next to the configured max length.
(len(tokenized_train_dataset[0]["input_ids"]), config["max_length"])

(1656, 1656)

In [19]:
# Decode the first training example with special tokens kept, so padding and
# chat markers are visible in the printout.
print(
    tokenizer.decode(
        tokenized_train_dataset[0]["input_ids"],
        skip_special_tokens=False,
    )
)

<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|