In [1]:
from tqdm.auto import tqdm
from datasets import Dataset, load_dataset
import numpy as np
import pandas as pd
import os
from pathlib import Path
import gc

Functions

In [2]:
def get_first_message_content(example):
    # Access the first message's content from the conversation
    if example['conversation'] and len(example['conversation']) > 0:
        return example['conversation'][0]['content']
    return None

def process_judgment_scores(input_string, weights=None):
    """
    Convert a string representation of scores into a weighted sum.

    Args:
        input_string (str): String representation of numpy array (e.g. '[0.1 0.2 0.3]')
        weights (array-like, optional): Weights for each position. Defaults to [1-10]

    Returns:
        float: Rounded weighted sum of scores

    Raises:
        ValueError: If input string format is invalid or dimensions don't match
        TypeError: If input is not a string
    """
    try:
        # Check input type
        if not isinstance(input_string, str):
            return -1

        # Set default weights if none provided
        if weights is None:
            weights = np.array([1,2,3,4,5,6,7,8,9,10], dtype=float)
        else:
            weights = np.array(weights, dtype=float)

        # Convert string to numpy array
        try:
            scores = np.fromstring(input_string.strip('[]'), sep=' ').astype(float)
        except:
            return -1

        # Check dimensions match
        if len(scores) != len(weights):
            return -1
            # raise ValueError(f"Score array length ({len(scores)}) does not match weights length ({len(weights)})")

        # Calculate weighted sum and round to 2 decimal places
        mean = np.round(np.sum(scores * weights), 2)

        # Variance
        second_moment = np.sum(np.power(weights - mean, 2) * scores)

        # Calculate fourth moment
        fourth_moment = np.sum(np.power(weights - mean, 4) * scores)

        # Calculate entropy (avoiding log(0))
        entropy = -np.sum(np.where(scores > 0, scores * np.log2(scores), 0))
        max_entropy = np.log2(10)  # log2(10) for 10 possible weights
        normalized_entropy = entropy / max_entropy

        return mean, np.round(second_moment, 3), round(float(fourth_moment * (1 - normalized_entropy)), 3)

    except (ValueError, TypeError) as e:
        raise type(e)(f"Error processing judgment scores: {str(e)}")

def is_unique(example):
    content = get_first_message_content(example)
    if content is None:
        return False
    if content in seen_contents:
        return False
    seen_contents.add(content)
    return True

Write token

In [None]:
from huggingface_hub import login
login(token ="")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Load datasets

In [4]:
# jdgfct="Readability"

# merged_df=pd.read_csv("/scratch/bf996/sc_data/merged_df.csv")

jdgfct="Completeness"

a = f"nyu-dice-lab/meta-llama_Llama-3.1-70B-Instruct-jdgfct-{jdgfct}"
b = f"nyu-dice-lab/nvidia_NVLM-D-72B-jdgfct-{jdgfct}"
c = f"nyu-dice-lab/Qwen_Qwen2.5-72B-Instruct-jdgfct-{jdgfct}"
d = f"nyu-dice-lab/neuralmagic_Llama-3.1-Nemotron-70B-Instruct-HF-FP8-dynamic-jdgfct-{jdgfct}"
e = f"nyu-dice-lab/Nexusflow_Athene-70B-jdgfct-{jdgfct}"

ds1 = load_dataset(a, split='train')
ds2 = load_dataset(b, split='train')
ds3 = load_dataset(c, split='train')
ds4 = load_dataset(d, split='train')
ds5 = load_dataset(e, split='train')

Resolving data files:   0%|          | 0/71 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/71 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/69 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/69 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/69 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/67 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/58 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/58 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/57 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/67 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/67 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/66 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/59 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/59 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/58 [00:00<?, ?it/s]

Process and merge datasets

In [5]:
# Create a set of unique first message contents
seen_contents = set()

# Filter the dataset to keep only rows with unique first messages
ds1 = ds1.filter(is_unique)

indices = (set(ds1['conversation_hash'])
.intersection(ds2['conversation_hash'])
.intersection(ds3['conversation_hash'])
.intersection(ds4['conversation_hash'])
.intersection(ds5['conversation_hash']))

pd_df = ds1.to_pandas()
selected_ds1 = pd_df[pd_df['conversation_hash'].isin(indices)].sort_values('conversation_hash')
print("selected ds1")
pd_df = ds2.to_pandas()
selected_ds2 = pd_df[pd_df['conversation_hash'].isin(indices)].sort_values('conversation_hash')
print("selected ds2")
pd_df = ds3.to_pandas()
selected_ds3 = pd_df[pd_df['conversation_hash'].isin(indices)].sort_values('conversation_hash')
print("selected ds3")
pd_df = ds4.to_pandas()
selected_ds4 = pd_df[pd_df['conversation_hash'].isin(indices)].sort_values('conversation_hash')
print("selected ds4")
pd_df = ds5.to_pandas()
selected_ds5 = pd_df[pd_df['conversation_hash'].isin(indices)].sort_values('conversation_hash')
print("selected ds5")
del ds1
del ds2
del ds3
del ds4
del ds5
del pd_df
gc.collect()
print("Done building selected ds1 to ds5 in Pandas")

#remove unused data for space
# def clean(df):
#     keys = ["content", f"judgment_meta-llama_Llama-3.1-8B-Instruct_conversation_{jdgfct}_logprob", "role"]
#     for conv in df['conversation']:
#         for conv_dict in conv:
#             toremove = []
#             for key in conv_dict:
#                 if key not in keys:
#                     toremove.append(key)
#             for k in toremove:
#                 if k in conv_dict:
#                     del conv_dict[k]

# clean(selected_ds1)
# print("cleaned ds1")
# clean(selected_ds2)
# print("cleaned ds2")
# clean(selected_ds3)
# print("cleaned ds3")
# clean(selected_ds4)
# print("cleaned ds4")
# clean(selected_ds5)
# print("cleaned ds5")

Filter:   0%|          | 0/984521 [00:00<?, ? examples/s]

selected ds1
selected ds2
selected ds3
selected ds4
selected ds5
Done building selected ds1 to ds5 in Pandas


Create DPO

In [8]:
from collections import Counter
# Create an empty dictionary with the specified fields
dpo_dataset = {
    'system': [],
    'prompt': [],
    'chosen': [],
    'rejected': [],
    'chosen_score': [],
    'rejected_score': [],
}

# for row1, row2 in tqdm(zip(ds1, ds2)):
t_idx = 0

for row1, row2, row3, row4, row5 in tqdm(zip(selected_ds1.iterrows(), selected_ds2.iterrows(), selected_ds3.iterrows(),
                                                selected_ds4.iterrows(), selected_ds5.iterrows())):
    l1, l2, l3, l4 = len(dpo_dataset['system']), len(dpo_dataset['prompt']), len(dpo_dataset['chosen']), len(dpo_dataset['rejected'])
    assert l1 == l2 == l3 == l4, f"Lengths do not match. {l1}:{l2}:{l3}:{l4}"
    t_idx += 1
    #conversation should be a list of dicts.
    conv1 = row1[1]['conversation']
    conv2 = row2[1]['conversation']
    conv3 = row3[1]['conversation']
    conv4 = row4[1]['conversation']
    conv5 = row5[1]['conversation']
    convs = [conv1, conv2, conv3, conv4, conv5]
    #get most common length thats not 0
    include = []
    for i in range(len(convs)):
        if len(convs[i])%2 == 0 and len(convs[i]) != 0:
            include.append(i)
    lengths = []
    for i in include:
        lengths.append(len(convs[i]))
    length_counts = Counter(lengths)
    most_common_length_info = length_counts.most_common(1)
    length = 0
    if most_common_length_info:
        length = most_common_length_info[0][0]
    #filter for common length match
    include = []
    for i in range(len(convs)):
        if len(convs[i]) == length and length != 0:
            include.append(i)
    if len(include)<2:
        continue
    #above is messy, optimize later
    for i in range(0, len(convs[include[0]]), 2):
        system = "You are a helpful assistant."
        question_until_now = "User: " + convs[include[0]][0]["content"]
        for j in range(1, i+1):
            if j % 2 == 0:
                question_until_now += "User: "
            else:
                question_until_now += "Assistant: "
            question_until_now += convs[include[0]][j]["content"]
        #question = question_until_now.replace("<|begin_of_text|>", "").replace("<|END_OF_CONVERSATION|>", "").replace("<|eot_id|>", "")
        #for format expected by open instruct
        prompt = convs[include[0]][0]
        score_dicts = []
        if 0 in include:
            #resp1 = conv1[i+1]["content"].replace("<|begin_of_text|>", "").replace("<|END_OF_CONVERSATION|>", "").replace("<|eot_id|>", "")
            #for open instruct
            resp1 = conv1[i+1]
            #Peak Separation Index, Entropy-weighted kurtosis
            score1, var1, ewk1 = process_judgment_scores(conv1[i+1][f"judgment_meta-llama_Llama-3.1-8B-Instruct_conversation_{jdgfct}_logprob"])
            score_dict1={
                "mean" : score1,
                "variance" : var1,
                "entropy-weighted kurtosis" : ewk1,
                "response" : resp1
            }
            score_dicts.append(score_dict1)
        if 1 in include:
            #resp2 = conv2[i+1]["content"].replace("<|begin_of_text|>", "").replace("<|END_OF_CONVERSATION|>", "").replace("<|eot_id|>", "")
            #for open instruct
            resp2 = conv2[i+1]
            score2, var2, ewk2 = process_judgment_scores(conv2[i+1][f"judgment_meta-llama_Llama-3.1-8B-Instruct_conversation_{jdgfct}_logprob"])
            score_dict2={
                "mean" : score2,
                "variance" : var2,
                "entropy-weighted kurtosis" : ewk2,
                "response" : resp2
            }
            score_dicts.append(score_dict2)
        if 2 in include:
            #resp3 = conv3[i+1]["content"].replace("<|begin_of_text|>", "").replace("<|END_OF_CONVERSATION|>", "").replace("<|eot_id|>", "")
            #for open instruct
            resp3 = conv3[i+1]
            score3, var3, ewk3 = process_judgment_scores(conv3[i+1][f"judgment_meta-llama_Llama-3.1-8B-Instruct_conversation_{jdgfct}_logprob"])
            score_dict3={
                "mean" : score3,
                "variance" : var3,
                "entropy-weighted kurtosis" : ewk3,
                "response" : resp3
            }
            score_dicts.append(score_dict3)
        if 3 in include:
            #resp4 = conv4[i+1]["content"].replace("<|begin_of_text|>", "").replace("<|END_OF_CONVERSATION|>", "").replace("<|eot_id|>", "")
            #for open instruct
            resp4 = conv4[i+1]
            score4, var4, ewk4 = process_judgment_scores(conv4[i+1][f"judgment_meta-llama_Llama-3.1-8B-Instruct_conversation_{jdgfct}_logprob"])
            score_dict4={
                "mean" : score4,
                "variance" : var4,
                "entropy-weighted kurtosis" : ewk4,
                "response" : resp4
            }
            score_dicts.append(score_dict4)
        if 4 in include:
            #resp5 = conv5[i+1]["content"].replace("<|begin_of_text|>", "").replace("<|END_OF_CONVERSATION|>", "").replace("<|eot_id|>", "")
            #for open instruct
            resp5 = conv5[i+1]
            score5, var5, ewk5 = process_judgment_scores(conv5[i+1][f"judgment_meta-llama_Llama-3.1-8B-Instruct_conversation_{jdgfct}_logprob"])
            score_dict5={
                "mean" : score5,
                "variance" : var5,
                "entropy-weighted kurtosis" : ewk5,
                "response" : resp5
            }
            score_dicts.append(score_dict5)
        #Take the response pair with the highest delta
        score_dicts = [score_dict for score_dict in score_dicts if score_dict["mean"] > 0]
        if len(score_dicts) < 2:
            continue
        else:
          min, max = None, None
          for score_dict in score_dicts:
            if min is None or score_dict["mean"] < min["mean"]:
              min = score_dict
            if max is None or score_dict["mean"] > max["mean"]:
              max = score_dict
          if abs(min["mean"] - max["mean"]) < 0.5:
            continue
          else:
            chosen = max["response"]
            chosen_score = max["mean"]
            rejected = min["response"]
            rejected_score = min["mean"]

        dpo_dataset['system'].append(system)
        dpo_dataset['prompt'].append(prompt)
        dpo_dataset['chosen'].append(chosen)
        dpo_dataset['chosen_score'].append(chosen_score)
        dpo_dataset['rejected'].append(rejected)
        dpo_dataset['rejected_score'].append(rejected_score)

0it [00:00, ?it/s]

  entropy = -np.sum(np.where(scores > 0, scores * np.log2(scores), 0))
  entropy = -np.sum(np.where(scores > 0, scores * np.log2(scores), 0))


In [9]:
len(dpo_dataset['chosen'])

601324

In [None]:
#Create the dataset
dpo_dataset_ds = Dataset.from_dict(dpo_dataset)

dpo_dataset_ds.push_to_hub(f"chardizard/dpo-mix5-OpenInstruct-Llama3-{jdgfct}")

#system (str), question (str), chosen (str), rejected (str)

print("Done.")