In [4]:
import utils

### Fix format for M1 preference data

In [5]:
# Load the preference data
# (process_data in the next cell reads "question_complete" and "preference" from each item)
path = 'datasets/M1_preference_data_15052024.json'
data = utils.read_json(path)

In [6]:
# Process the data into the correct format for DPO training
def process_data(data):
    """Flatten preference annotations into DPO training triples.

    Args:
        data: iterable of items; each item provides a "question_complete"
            value and a "preference" list whose entries carry the answers
            under "A" and "B" plus an "overall" verdict.

    Returns:
        A list of {"prompt", "chosen", "rejected"} dicts, one per preference
        entry, in input order.
    """
    triples = []
    for record in data:
        prompt_text = record["question_complete"]
        for judgement in record["preference"]:
            # Any "overall" value other than 'A' is treated as a vote for B
            if judgement["overall"] == 'A':
                winner, loser = judgement['A'], judgement['B']
            else:
                winner, loser = judgement['B'], judgement['A']
            triples.append({"prompt": prompt_text, "chosen": winner, "rejected": loser})
    return triples

# Rebinds `data` from the raw records to the flattened DPO triples.
# NOTE(review): not idempotent — re-running this cell without reloading feeds the
# triples back into process_data, which then fails on the missing "question_complete" key.
data = process_data(data)

In [7]:
# Write the DPO triples ("prompt" / "chosen" / "rejected") to a JSONL file via the project helper
utils.write_jsonl(data, 'datasets/M1.jsonl')

### Fix format for SFT data

In [None]:
# Load the intents dataset; process_intents (next cell) reads its top-level "intents" list
path = 'datasets/intents.json'
data = utils.read_json(path)

In [None]:
def process_intents(data):
    """Convert an intents file into SFT records.

    For every intent, the first pattern becomes the prompt and the first
    response becomes the gold output. Intents that have no pattern or no
    response are skipped instead of raising.

    Args:
        data: dict with an "intents" list; each intent may provide
            "patterns" and "responses" lists.

    Returns:
        A list of {"prompt", "gold_output"} dicts. The lowercase "prompt" key
        matches the other SFT records built in this notebook and the key that
        calculate_sft_lengths reads.
    """
    transformed_data = []
    for intent in data["intents"]:
        patterns = intent.get("patterns") or []
        responses = intent.get("responses") or []
        if not patterns or not responses:
            # Nothing to pair up for this intent
            continue
        transformed_data.append({"prompt": patterns[0], "gold_output": responses[0]})
    return transformed_data

# Transform the original JSON data into a list of prompt / gold-output records
intents_data = process_intents(data)


In [None]:
# Write the intents-derived SFT records to their own JSONL file
utils.write_jsonl(intents_data, 'datasets/cs_sft.jsonl')

In [None]:
# Load the math Stack Exchange data; process_math (next cell) reads "question" and "chosen" from each item
path = 'datasets/math_stack_exchange.json'
data = utils.read_json(path)

In [None]:
def process_math(data):
    """Convert math Stack Exchange items into SFT records.

    Args:
        data: iterable of items, each providing "question" and "chosen".

    Returns:
        A list of {"prompt", "gold_output"} dicts in input order. The lowercase
        "prompt" key matches the other SFT records built in this notebook and
        the key that calculate_sft_lengths reads.
    """
    return [
        {"prompt": item["question"], "gold_output": item["chosen"]}
        for item in data
    ]

# Build the math SFT records from the data loaded in the previous cell
math_data = process_math(data)

In [None]:
# Combine all SFT data: intents records first, then math records
sft_data = intents_data + math_data

In [None]:
# Write the combined SFT records to a JSONL file
# NOTE(review): a later cell in this notebook also writes to 'datasets/sft.jsonl' — confirm which version is intended to survive
utils.write_jsonl(sft_data, 'datasets/sft.jsonl')

### Stack exchange script to get data

In [1]:
import pandas as pd
import json
from bs4 import BeautifulSoup

def convert_parquet_to_json(parquet_file_path, json_file_path):
    """Re-serialise a Parquet table as a JSON array of row objects.

    Args:
        parquet_file_path: path of the Parquet file to read.
        json_file_path: path of the JSON file to create or overwrite.
    """
    # orient='records' produces one JSON object per DataFrame row
    records_json = pd.read_parquet(parquet_file_path).to_json(orient='records')

    with open(json_file_path, 'w') as out:
        out.write(records_json)

def clean_html(text):
    """Strip HTML markup from `text`, keeping text content and marking <code> spans.

    The text of every <code> element is emitted once, wrapped in dollar signs
    (the original intent: preserve LaTeX equations). All other text nodes are
    emitted unchanged, in document order.

    Args:
        text: HTML fragment as a string.

    Returns:
        The plain-text rendering as a single string.
    """
    soup = BeautifulSoup(text, 'html.parser')
    pieces = []
    for elem in soup.descendants:
        # Anything nested inside a <code> element is already covered by that
        # element's get_text(); emitting it again would duplicate the content.
        if elem.find_parent('code') is not None:
            continue
        if getattr(elem, 'name', None) == 'code':  # Preserve LaTeX equations
            pieces.append(f"${elem.get_text()}$")
        elif isinstance(elem, str):
            pieces.append(elem)
    return ''.join(pieces)

def process_file(parquet_file, json_output_file, cleaned_output_file, min_score=4):
    """Build cleaned SFT records from one Stack Exchange Parquet dump.

    The Parquet file is first converted to JSON (written to `json_output_file`).
    For each item the answer with the highest "pm_score" is selected; the item
    is kept only if that score is at least `min_score`. Question and answer
    HTML are cleaned with clean_html.

    Args:
        parquet_file: path of the input Parquet file.
        json_output_file: path where the intermediate JSON is written.
        cleaned_output_file: path where the cleaned records are written as an
            indented JSON array.
        min_score: minimum "pm_score" the best answer must reach (default 4).
    """
    # Convert Parquet to JSON
    convert_parquet_to_json(parquet_file, json_output_file)

    # Load the JSON data
    with open(json_output_file, 'r') as f:
        data = json.load(f)

    # Prepare the new JSON structure
    new_data = []

    for item in data:
        question = item['question']
        answers = item['answers']

        # max() raises ValueError on an empty sequence; skip unanswered questions
        if not answers:
            continue

        # Find the answer with the highest pm_score
        best_answer = max(answers, key=lambda x: x['pm_score'])

        # Only keep the answer if its pm_score reaches the threshold
        if best_answer['pm_score'] >= min_score:
            new_data.append({
                "prompt": clean_html(question),
                "gold_output": clean_html(best_answer['text'])
            })

    # Save the new JSON data to a file
    with open(cleaned_output_file, 'w') as f:
        json.dump(new_data, f, indent=4)

    print(f"Data successfully processed and saved to {cleaned_output_file}")

# List of files to process
# Each tuple is (input Parquet file, intermediate JSON file, cleaned output JSON file),
# matching the positional parameters of process_file.
files_to_process = [
    ("datasets/DS_SE.parquet", "datasets/DS_SE.json", "datasets/processed_DS_SE_cleaned.json"),
    ("datasets/MATH_SE.parquet", "datasets/MATH_SE.json", "datasets/processed_MATH_SE_cleaned.json"),
    ("datasets/CS_SE.parquet", "datasets/CS_SE.json", "datasets/processed_CS_SE_cleaned.json"),
    ("datasets/PHYSICS_SE.parquet", "datasets/PHYSICS_SE.json", "datasets/processed_PHYSICS_SE_cleaned.json")
]

# Process each file
for parquet_file, json_output_file, cleaned_output_file in files_to_process:
    process_file(parquet_file, json_output_file, cleaned_output_file)


Data successfully processed and saved to datasets/processed_DS_SE_cleaned.json
Data successfully processed and saved to datasets/processed_MATH_SE_cleaned.json
Data successfully processed and saved to datasets/processed_CS_SE_cleaned.json
Data successfully processed and saved to datasets/processed_PHYSICS_SE_cleaned.json


In [3]:
import utils
# Concatenate the four cleaned Stack Exchange SFT files into one list, in the order listed
data = []
for path in ["datasets/processed_DS_SE_cleaned.json", "datasets/processed_MATH_SE_cleaned.json",
             "datasets/processed_CS_SE_cleaned.json", "datasets/processed_PHYSICS_SE_cleaned.json"] :

    data = data + utils.read_json(path)

# NOTE(review): same output path as the earlier intents + math cell ('datasets/sft.jsonl');
# presumably this overwrites that file — confirm that is intended
utils.write_jsonl(data, 'datasets/sft.jsonl')
    

### CHECKING LENGTHS AND CLEANING


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
import utils
from unsloth import FastLanguageModel

# Only the tokenizer is used below; the model returned alongside it is discarded.
# NOTE(review): `model_name_or_path` and `max_seq_length` are not defined in any
# visible cell of this notebook — confirm they are set before running this cell.
_, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name_or_path,
    max_seq_length = max_seq_length,
    dtype = None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = False, # Use 4bit quantization to reduce memory usage. Can be False.
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

def calculate_dpo_lengths(data, tokenizer):
    """Measure the worst-case token length of each DPO example.

    Prompt, chosen and rejected texts are tokenized separately (no truncation);
    the reported length is the prompt length plus the longer of the two answers.

    Args:
        data: iterable of dicts with "prompt", "chosen" and "rejected" texts.
        tokenizer: callable returning a mapping with an 'input_ids' sequence.

    Returns:
        A list of (length, item) tuples in input order.
    """
    def count_tokens(text):
        return len(tokenizer(text, truncation=False)['input_ids'])

    measured = []
    for example in data:
        n_prompt = count_tokens(example["prompt"])
        n_chosen = count_tokens(example["chosen"])
        n_rejected = count_tokens(example["rejected"])
        measured.append((n_prompt + max(n_chosen, n_rejected), example))
    return measured

# Token-length measurement for SFT data
def calculate_sft_lengths(data, tokenizer):
    """Measure the token length of each SFT example.

    Prompt and gold output are tokenized separately (no truncation) and their
    token counts are added.

    Args:
        data: iterable of dicts with "prompt" and "gold_output" texts.
        tokenizer: callable returning a mapping with an 'input_ids' sequence.

    Returns:
        A list of (length, item) tuples in input order.
    """
    def count_tokens(text):
        return len(tokenizer(text, truncation=False)['input_ids'])

    return [
        (count_tokens(example["prompt"]) + count_tokens(example["gold_output"]), example)
        for example in data
    ]

# Read the DPO and SFT data using utils
dpo_data = utils.read_jsonl("datasets/M1.jsonl")
sft_data = utils.read_jsonl("datasets/sft.jsonl")

# Process DPO data and calculate lengths
# (each entry is a (length, item) tuple)
dpo_lengths = calculate_dpo_lengths(dpo_data, tokenizer)

# Calculate lengths for SFT data
sft_lengths = calculate_sft_lengths(sft_data, tokenizer)

# Filter data points not exceeding 2048 tokens
# (items with exactly 2048 tokens are kept)
filtered_dpo_data = [item for length, item in dpo_lengths if length <= 2048]
filtered_sft_data = [item for length, item in sft_lengths if length <= 2048]

# Write filtered data to new JSONL files using utils
utils.write_jsonl(filtered_dpo_data, "datasets/M1_2048.jsonl")
utils.write_jsonl(filtered_sft_data, "datasets/sft_2048.jsonl")

# Generate histograms
# (computed over the unfiltered lengths)
dpo_lengths_values = [length for length, _ in dpo_lengths]
sft_lengths_values = [length for length, _ in sft_lengths]

# Both histograms share the same binning range; lengths outside (-1000, 5000) are not drawn
plt.hist(dpo_lengths_values, bins=100, alpha=0.5, label='DPO lengths', range=(-1000, 5000))
plt.hist(sft_lengths_values, bins=100, alpha=0.5, label='SFT lengths', range=(-1000, 5000))
plt.xlabel('Length in tokens')
plt.ylabel('Number of data points')
plt.legend(loc='upper right')
plt.title('Token Length Distribution')
plt.xlim(-1000, 5000)
plt.show()

# Calculate the percentage of data points exceeding 2048 tokens
# (raises ZeroDivisionError if either dataset is empty)
dpo_exceeding_2048 = sum(1 for length in dpo_lengths_values if length > 2048) / len(dpo_lengths_values) * 100
sft_exceeding_2048 = sum(1 for length in sft_lengths_values if length > 2048) / len(sft_lengths_values) * 100

print(f"Percentage of DPO data points exceeding 2048 tokens: {dpo_exceeding_2048:.2f}%")
print(f"Percentage of SFT data points exceeding 2048 tokens: {sft_exceeding_2048:.2f}%")

In [2]:
import json
import re


# Input and output file paths
input_file = 'datasets/M1_2000_no_mcqa_train.jsonl'
output_file = 'datasets/M1_2000_no_mcqa_train_correct.jsonl'

# The strings to be removed
# start_string: system message plus user header that precede the question
# end_string: end-of-turn marker plus assistant header that follow it
start_string = "<|system|>\nYou are an experienced teacher who answers the STEM-related question asked by a student below.</s>\n<|user|>\n"
end_string = "</s>\n<|assistant|>\n"

# Matches a non-empty string made up solely of non-word characters and/or underscores
special_chars_pattern = re.compile(r'^[\W_]+$')

# Strip the chat-template wrapper from a single prompt
def process_prompt(prompt):
    """Return `prompt` without the leading `start_string` and trailing `end_string`.

    Each marker is removed only when it is actually present, so prompts that
    are already bare pass through unchanged.
    """
    # Drop the system/user header if the prompt begins with it
    offset = len(start_string) if prompt.startswith(start_string) else 0
    stripped = prompt[offset:]

    # Drop the assistant header if the remainder ends with it
    return stripped[:-len(end_string)] if stripped.endswith(end_string) else stripped

# Read the input file and process each line
# Streams the JSONL input one record at a time: records whose "chosen" text is
# only special characters are dropped, the rest get their prompt unwrapped and
# are written to the output file, one JSON object per line.
with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    for line in infile:
        data = json.loads(line)
        
        # Skip entries where "chosen" field contains only special characters
        # (an empty or whitespace-only "chosen" does not match the pattern and is kept)
        if special_chars_pattern.match(data['chosen'].strip()):
            print(f"Skipping entry with special characters: {data['chosen']}")
            continue
        
        # Process the "prompt" field
        data['prompt'] = process_prompt(data['prompt'])
        
        # Write the modified data to the output file
        json.dump(data, outfile)
        outfile.write('\n')

print(f"Processed JSONL file saved as {output_file}")



Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special characters: ...
Skipping entry with special char

### SE_DPO

In [None]:
import pandas as pd
import json
from bs4 import BeautifulSoup

def convert_parquet_to_json(parquet_file_path, json_file_path):
    """Re-serialise a Parquet table as a JSON array of row objects.

    Args:
        parquet_file_path: path of the Parquet file to read.
        json_file_path: path of the JSON file to create or overwrite.
    """
    # orient='records' produces one JSON object per DataFrame row
    records_json = pd.read_parquet(parquet_file_path).to_json(orient='records')

    with open(json_file_path, 'w') as out:
        out.write(records_json)

def clean_html(text):
    """Strip HTML markup from `text`, keeping text content and marking <code> spans.

    The text of every <code> element is emitted once, wrapped in dollar signs
    (the original intent: preserve LaTeX equations). All other text nodes are
    emitted unchanged, in document order.

    Args:
        text: HTML fragment as a string.

    Returns:
        The plain-text rendering as a single string.
    """
    soup = BeautifulSoup(text, 'html.parser')
    pieces = []
    for elem in soup.descendants:
        # Anything nested inside a <code> element is already covered by that
        # element's get_text(); emitting it again would duplicate the content.
        if elem.find_parent('code') is not None:
            continue
        if getattr(elem, 'name', None) == 'code':
            # Preserve LaTeX equations
            pieces.append(f"${elem.get_text()}$")
        elif isinstance(elem, str):
            pieces.append(elem)
    return ''.join(pieces)

def process_file(parquet_file, json_output_file, cleaned_output_file, min_best_score=2):
    """Build cleaned DPO preference pairs from one Stack Exchange Parquet dump.

    The Parquet file is first converted to JSON (written to `json_output_file`).
    For each question with at least two answers, the highest-scored answer
    becomes "chosen" and the lowest-scored one "rejected". A pair is kept only
    if the two scores differ and the best score is strictly greater than
    `min_best_score`.

    Args:
        parquet_file: path of the input Parquet file.
        json_output_file: path where the intermediate JSON is written.
        cleaned_output_file: path where the pairs are written as JSONL.
        min_best_score: the best answer's "pm_score" must exceed this value
            (default 2).
    """
    # Convert Parquet to JSON
    convert_parquet_to_json(parquet_file, json_output_file)

    # Load the JSON data
    with open(json_output_file, 'r') as f:
        data = json.load(f)

    # Prepare the new JSONL data
    new_data = []
    for item in data:
        question = item['question']
        answers = item['answers']

        # A preference pair needs at least two answers
        if not answers or len(answers) <= 1:
            continue

        # Find the answers with the highest and the lowest pm_score
        best_answer = max(answers, key=lambda x: x['pm_score'])
        worst_answer = min(answers, key=lambda x: x['pm_score'])

        # When every score is tied, max() and min() both return the first
        # answer, so "chosen" and "rejected" would be the same text.
        if best_answer['pm_score'] == worst_answer['pm_score']:
            continue

        # Only keep the data if the best answer's pm_score exceeds the threshold
        if best_answer['pm_score'] > min_best_score:
            new_item = {
                "prompt": clean_html(question),
                "chosen": clean_html(best_answer['text']),
                "rejected": clean_html(worst_answer['text'])
            }
            new_data.append(json.dumps(new_item))

    # Save the new JSONL data to a file
    with open(cleaned_output_file, 'w') as f:
        f.write('\n'.join(new_data))

    print(f"Data successfully processed and saved to {cleaned_output_file}")

# List of files to process
# Each tuple is (input Parquet file, intermediate JSON file, cleaned output JSONL file),
# matching the positional parameters of process_file.
files_to_process = [
    ("datasets/DS_SE.parquet", "datasets/DS_SE_DPO.json", "datasets/processed_DS_SE_cleaned_DPO.jsonl"),
    ("datasets/MATH_SE.parquet", "datasets/MATH_SE_DPO.json", "datasets/processed_MATH_SE_cleaned_DPO.jsonl"),
    ("datasets/CS_SE.parquet", "datasets/CS_SE_DPO.json", "datasets/processed_CS_SE_cleaned_DPO.jsonl"),
    ("datasets/PHYSICS_SE.parquet", "datasets/PHYSICS_SE_DPO.json", "datasets/processed_PHYSICS_SE_cleaned_DPO.jsonl")
]

# Process each file
for parquet_file, json_output_file, cleaned_output_file in files_to_process:
    process_file(parquet_file, json_output_file, cleaned_output_file)

In [None]:
import json
import random

# Set the input and output file paths
# NOTE(review): input_file is produced by the cell that concatenates the
# processed_*_DPO.jsonl files, which appears after this cell in the notebook.
input_file = "datasets/SE_DPO_.jsonl"
train_file = "datasets/SE_DPO_train.jsonl"
test_file = "datasets/SE_DPO_test.jsonl"

# Set the split ratio (95% for training, 5% for testing)
train_ratio = 0.95

# Fixed seed so that re-running the cell reproduces the same split
split_seed = 42

# Read the data from the input JSONL file (blank lines are ignored)
with open(input_file, "r") as file:
    data = [json.loads(line) for line in file if line.strip()]

# Shuffle with a dedicated seeded generator; the global random state is left untouched
random.Random(split_seed).shuffle(data)

# Calculate the split index
split_index = int(len(data) * train_ratio)

# Split the data into training and testing sets
train_data = data[:split_index]
test_data = data[split_index:]

# Write the training data to the train JSONL file
with open(train_file, "w") as file:
    for item in train_data:
        file.write(json.dumps(item) + "\n")

# Write the testing data to the test JSONL file
with open(test_file, "w") as file:
    for item in test_data:
        file.write(json.dumps(item) + "\n")

print(f"Split complete. Training data: {len(train_data)} items, Testing data: {len(test_data)} items.")

In [None]:
import utils
# Concatenate the four per-site DPO files into one list, in the order listed
data = []
for path in ["datasets/processed_DS_SE_cleaned_DPO.jsonl", "datasets/processed_MATH_SE_cleaned_DPO.jsonl",
             "datasets/processed_CS_SE_cleaned_DPO.jsonl", "datasets/processed_PHYSICS_SE_cleaned_DPO.jsonl"] :

    data = data + utils.read_jsonl(path)

# This is the file the train/test split cell reads as its input
utils.write_jsonl(data, 'datasets/SE_DPO_.jsonl')
    

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
import utils
from unsloth import FastLanguageModel

# Only the tokenizer is used below; the model returned alongside it is discarded.
# NOTE(review): `model_name_or_path` and `max_seq_length` are not defined in any
# visible cell of this notebook — confirm they are set before running this cell.
_, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name_or_path,
    max_seq_length = max_seq_length,
    dtype = None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = False, # Use 4bit quantization to reduce memory usage. Can be False.
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

def calculate_dpo_lengths(data, tokenizer):
    """Measure the worst-case token length of each DPO example.

    Prompt, chosen and rejected texts are tokenized separately (no truncation);
    the reported length is the prompt length plus the longer of the two answers.

    Args:
        data: iterable of dicts with "prompt", "chosen" and "rejected" texts.
        tokenizer: callable returning a mapping with an 'input_ids' sequence.

    Returns:
        A list of (length, item) tuples in input order.
    """
    def count_tokens(text):
        return len(tokenizer(text, truncation=False)['input_ids'])

    measured = []
    for example in data:
        n_prompt = count_tokens(example["prompt"])
        n_chosen = count_tokens(example["chosen"])
        n_rejected = count_tokens(example["rejected"])
        measured.append((n_prompt + max(n_chosen, n_rejected), example))
    return measured

# Token-length measurement for SFT data
def calculate_sft_lengths(data, tokenizer):
    """Measure the token length of each SFT example.

    Prompt and gold output are tokenized separately (no truncation) and their
    token counts are added.

    Args:
        data: iterable of dicts with "prompt" and "gold_output" texts.
        tokenizer: callable returning a mapping with an 'input_ids' sequence.

    Returns:
        A list of (length, item) tuples in input order.
    """
    def count_tokens(text):
        return len(tokenizer(text, truncation=False)['input_ids'])

    return [
        (count_tokens(example["prompt"]) + count_tokens(example["gold_output"]), example)
        for example in data
    ]

# Read the DPO and SFT data using utils
# NOTE(review): this cell sits under the SE_DPO section but reads the same
# M1.jsonl / sft.jsonl files as the earlier length-check cell — confirm the intended inputs
dpo_data = utils.read_jsonl("datasets/M1.jsonl")
sft_data = utils.read_jsonl("datasets/sft.jsonl")

# Process DPO data and calculate lengths
# (each entry is a (length, item) tuple)
dpo_lengths = calculate_dpo_lengths(dpo_data, tokenizer)

# Calculate lengths for SFT data
sft_lengths = calculate_sft_lengths(sft_data, tokenizer)

# Filter data points not exceeding 2048 tokens
# (items with exactly 2048 tokens are kept)
filtered_dpo_data = [item for length, item in dpo_lengths if length <= 2048]
filtered_sft_data = [item for length, item in sft_lengths if length <= 2048]

# Write filtered data to new JSONL files using utils
utils.write_jsonl(filtered_dpo_data, "datasets/M1_2048.jsonl")
utils.write_jsonl(filtered_sft_data, "datasets/sft_2048.jsonl")

# Generate histograms
# (computed over the unfiltered lengths)
dpo_lengths_values = [length for length, _ in dpo_lengths]
sft_lengths_values = [length for length, _ in sft_lengths]

# Both histograms share the same binning range; lengths outside (-1000, 5000) are not drawn
plt.hist(dpo_lengths_values, bins=100, alpha=0.5, label='DPO lengths', range=(-1000, 5000))
plt.hist(sft_lengths_values, bins=100, alpha=0.5, label='SFT lengths', range=(-1000, 5000))
plt.xlabel('Length in tokens')
plt.ylabel('Number of data points')
plt.legend(loc='upper right')
plt.title('Token Length Distribution')
plt.xlim(-1000, 5000)
plt.show()

# Calculate the percentage of data points exceeding 2048 tokens
# (raises ZeroDivisionError if either dataset is empty)
dpo_exceeding_2048 = sum(1 for length in dpo_lengths_values if length > 2048) / len(dpo_lengths_values) * 100
sft_exceeding_2048 = sum(1 for length in sft_lengths_values if length > 2048) / len(sft_lengths_values) * 100

print(f"Percentage of DPO data points exceeding 2048 tokens: {dpo_exceeding_2048:.2f}%")
print(f"Percentage of SFT data points exceeding 2048 tokens: {sft_exceeding_2048:.2f}%")

In [None]:
import json
import random

# Set the input and output file paths
# NOTE(review): input_file is written by the concatenation code that follows
# this split in the same cell; on a fresh run it must already exist.
input_file = "datasets/SE_DPO_.jsonl"
train_file = "datasets/SE_DPO_train.jsonl"
test_file = "datasets/SE_DPO_test.jsonl"

# Set the split ratio (95% for training, 5% for testing)
train_ratio = 0.95

# Fixed seed so that re-running the cell reproduces the same split
split_seed = 42

# Read the data from the input JSONL file (blank lines are ignored)
with open(input_file, "r") as file:
    data = [json.loads(line) for line in file if line.strip()]

# Shuffle with a dedicated seeded generator; the global random state is left untouched
random.Random(split_seed).shuffle(data)

# Calculate the split index
split_index = int(len(data) * train_ratio)

# Split the data into training and testing sets
train_data = data[:split_index]
test_data = data[split_index:]

# Write the training data to the train JSONL file
with open(train_file, "w") as file:
    for item in train_data:
        file.write(json.dumps(item) + "\n")

# Write the testing data to the test JSONL file
with open(test_file, "w") as file:
    for item in test_data:
        file.write(json.dumps(item) + "\n")

print(f"Split complete. Training data: {len(train_data)} items, Testing data: {len(test_data)} items.")



import utils
# Concatenate the four per-site DPO files into one list, in the order listed
# NOTE(review): this rebuilds the file that the split code earlier in this cell has already read
data = []
for path in ["datasets/processed_DS_SE_cleaned_DPO.jsonl", "datasets/processed_MATH_SE_cleaned_DPO.jsonl",
             "datasets/processed_CS_SE_cleaned_DPO.jsonl", "datasets/processed_PHYSICS_SE_cleaned_DPO.jsonl"] :

    data = data + utils.read_jsonl(path)

utils.write_jsonl(data, 'datasets/SE_DPO_.jsonl')

In [4]:
import utils
import random as random
def merge_jsonl(file1, file2, output_file, num_samples=10000, seed=None):
    """Write a shuffled, balanced mix of records sampled from two JSONL files.

    The same number of records is drawn without replacement from each file.
    If either file holds fewer than `num_samples` records, the draw is reduced
    to the size of the smaller file so both sources stay equally represented
    (previously this raised ValueError from random.sample).

    Args:
        file1: path of the first JSONL file.
        file2: path of the second JSONL file.
        output_file: path of the merged JSONL file to write.
        num_samples: number of records to take from each file (default 10000).
        seed: optional seed for reproducible sampling and shuffling; None
            keeps the draw non-deterministic.
    """
    data1 = utils.read_jsonl(file1)
    data2 = utils.read_jsonl(file2)

    sample_size = min(num_samples, len(data1), len(data2))
    if sample_size < num_samples:
        print(f"Requested {num_samples} samples per file but only {sample_size} are available; using {sample_size}.")

    # Dedicated generator so seeding does not disturb the global random state
    rng = random.Random(seed)
    sample1 = rng.sample(data1, sample_size)
    sample2 = rng.sample(data2, sample_size)

    merged_data = sample1 + sample2
    rng.shuffle(merged_data)  # Shuffle to ensure no order preference

    utils.write_jsonl(merged_data, output_file)

# Usage
# Uses the default num_samples (10000 records from each file)
# NOTE(review): 'datasets/SE_DPO_train_2000.jsonl' is not written by any visible cell — confirm where it comes from
file1 = 'datasets/M1_2000_no_mcqa_train_correct.jsonl'
file2 = 'datasets/SE_DPO_train_2000.jsonl'
output_file = 'datasets/merged_DPO.jsonl'

merge_jsonl(file1, file2, output_file)

In [6]:
# Training mix: 9000 records sampled from each of the two train files
file1 = 'datasets/M1_2000_no_mcqa_train_correct.jsonl'
file2 = 'datasets/SE_DPO_train_2000.jsonl'
output_file = 'datasets/merged_DPO_train.jsonl'

merge_jsonl(file1, file2, output_file, num_samples=9000)

In [8]:
# Test mix: 1000 records sampled from each of the two test files
# NOTE(review): neither input file is written by a visible cell of this notebook — confirm where they come from
file1 = 'datasets/M1_2000_no_mcqa_test_correct.jsonl'
file2 = 'datasets/SE_DPO_test_2000.jsonl'
output_file = 'datasets/merged_DPO_test.jsonl'

merge_jsonl(file1, file2, output_file, num_samples=1000)

In [10]:
# Scratch check: enumerate's second argument sets the starting index (here 3), so this prints 3..9.
# NOTE(review): looks like leftover experimentation unrelated to the data pipeline.
for i,x in enumerate([1,2,3,4,5,6,7], 3):
    print(i)

3
4
5
6
7
8
9


In [None]:
# Tally the "chosen" letters over runs of consecutive records sharing the same prompt.
# NOTE(review): these paths end in '.jsonl.jsonl', whereas later cells read
# 'datasets/MCQA_annotated.jsonl' and 'datasets/MCQA_annotated_2.jsonl' — confirm which is correct.
# NOTE(review): `first`, `last` and the tallies in `counter_dict` are never read
# in this cell; it looks like an unfinished draft of the cells below.
MCQA_merged = utils.read_jsonl('datasets/MCQA_annotated.jsonl.jsonl') + utils.read_jsonl('datasets/MCQA_annotated_2.jsonl.jsonl')
first = 0
last = 0
counter_dict = {"A": 0, "B": 0, "C": 0, "D": 0}
first_prompt = MCQA_merged[0]['prompt']

for datapoint in MCQA_merged:
    if first_prompt == datapoint['prompt']:
        # Same question as the current run: count its answer
        counter_dict[datapoint["chosen"]]  += 1
        last = last + 1
    
    
    else:
        # New question: restart the tally (this first datapoint of the new run is not counted)
        first_prompt = datapoint['prompt']
        counter_dict = {"A": 0, "B": 0, "C": 0, "D": 0}
    


In [25]:
from collections import Counter
from itertools import groupby

mcqa = utils.read_jsonl('datasets/MCQA_annotated.jsonl') + utils.read_jsonl('datasets/MCQA_annotated_2.jsonl')

# Collapse every run of consecutive records that share a prompt into one
# {"question", "answer"} record holding the majority-voted answer letter.
# Assumes all annotations of a question are stored next to each other — TODO confirm.
new_mcqa = []
for prompt, annotations in groupby(mcqa, key=lambda row: row["prompt"]):
    # Count every annotation of the run, including the first one
    votes = Counter(row["chosen"].upper() for row in annotations)
    # sorted() makes ties resolve to the alphabetically first letter
    max_alt = max(sorted(votes), key=votes.get)
    new_mcqa.append({"question": prompt, "answer": max_alt})

In [26]:
# Write one record per question ("question" / "answer") to a JSONL file
utils.write_jsonl(new_mcqa, 'datasets/MCQA_unique_data.jsonl')

In [32]:
from collections import Counter
from itertools import groupby

mcqa = utils.read_jsonl('datasets/MCQA_annotated.jsonl') + utils.read_jsonl('datasets/MCQA_annotated_2.jsonl')

# Replace the "chosen" letter of every record with the majority vote of its
# run of consecutive records sharing the same prompt. Records are updated in place.
# Assumes all annotations of a question are stored next to each other — TODO confirm.
for _, annotations in groupby(mcqa, key=lambda row: row["prompt"]):
    annotations = list(annotations)  # groupby yields a one-shot iterator; it is needed twice
    # Count every annotation of the run, including the first one
    votes = Counter(row["chosen"].upper() for row in annotations)
    # sorted() makes ties resolve to the alphabetically first letter
    max_alt = max(sorted(votes), key=votes.get)
    for row in annotations:
        row["chosen"] = max_alt

In [33]:
# Write the records, with "chosen" replaced by the per-question majority answer, to a JSONL file
utils.write_jsonl(mcqa, 'datasets/MCQA_correct_answer.jsonl')