In [5]:
import json
import pandas as pd
import glob as glob

methods = {
    "Baseline": "qwen-2.5-x.5B-instruct-countdown-baseline",
    "OP": "qwen-2.5-x.5B-instruct-sft-lora-countdown-optimal-seq8k-5k",
    "SoS": "qwen-2.5-x.5B-instruct-sft-lora-countdown-search-seq8k-5k", 
    "RSoS": "qwen-2.5-x.5B-instruct-sft-lora-countdown-search-react-correct-seq10k-5k", 
    "Distill": "qwen-2.5-x.5B-instruct-sft-lora-countdown-deepseek-correct-seq8k-5k"
}

tasks = {
    "Countdown:": "test_128", 
    "Countdown-3": "countdown_3num_128", 
    "Countdown-5": "countdown_5num_128",
    "KnK": "knk"
}

sizes = ["0.5B", "1.5B"]

def parse_results_from_json(file):
    try:
        with open(file, 'r') as f:
            data = json.load(f)
        # print("File:", file)
        # Extract the relevant information from the JSON data   
        if "knk" in file:
            return data["scores"]["2ppl"]    
        if "countdown" in file or "test" in file:
            # hyperparams = data[0]['hyperparams']
            return data[1]['mean']*100
    except Exception as e:
        print("Error reading file:", file)
        print("Error message:", e)
        return None

# initialize results to store for sizes, method_key, task_key
results = {size: {method_key: {task_key: None for task_key in tasks.keys()} for method_key in methods.keys()} for size in sizes}

for size_val in sizes:
    for method_key, method_val in methods.items():
        for task_key, task_val in tasks.items():
            folder_to_look_for = f"./{method_val}/{task_val}*.json".replace("x.5B", size_val) 
            specific_file = glob.glob(folder_to_look_for)
            if specific_file:
                results[size_val][method_key][task_key] = parse_results_from_json(specific_file[0])
            
# above works, put their filepath in a pandas df
                 
# put results in a pandas df
df = pd.DataFrame.from_dict({(i, j): results[i][j] 
                           for i in results.keys() 
                           for j in results[i].keys()},
                           orient='index')
# Tranpose
df = df.transpose()
# make the df float up to 2 decimals
df = df.round(2)
df.to_latex("results.tex", index=True, float_format="%.2f")
df

Unnamed: 0_level_0,0.5B,0.5B,0.5B,0.5B,0.5B,1.5B,1.5B,1.5B,1.5B,1.5B
Unnamed: 0_level_1,Baseline,OP,SoS,RSoS,Distill,Baseline,OP,SoS,RSoS,Distill
Countdown:,0.0,2.34,37.5,0.78,25.78,0.78,5.47,49.22,52.34,44.53
Countdown-3,0.0,1.56,33.59,0.0,42.97,3.12,4.69,57.81,62.5,71.09
Countdown-5,0.0,0.78,0.0,0.0,0.78,0.0,0.78,0.0,0.0,1.56
KnK,1.0,1.5,0.0,0.0,1.5,7.0,2.5,0.0,0.0,12.0


In [4]:
# plot the results of the Countdown column
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set the style for the plots
plt.style.use('ggplot')
sns.set_palette("colorblind")
plt.rcParams.update({'font.size': 14})

# Reshape the dataframe for easier plotting
df_melted = pd.melt(df.reset_index(), 
                    id_vars='index', 
                    value_vars=['Countdown:', 'Countdown-3', 'Countdown-5'],
                    var_name='Task', 
                    value_name='Accuracy')

# Add columns for Model Size and Method
df_melted[['Method', 'Size']] = pd.DataFrame(df_melted['index'].tolist(), index=df_melted.index)

# Create a figure for the bar plots
plt.figure(figsize=(15, 10))

# Plot bar chart for each model size
for i, size in enumerate(sizes):
    plt.subplot(1, 2, i+1)
    
    # Filter data for this size
    size_df = df_melted[df_melted['Size'] == size]
    
    # Create the bar plot
    sns.barplot(x='Task', y='Accuracy', hue='Method', data=size_df)
    
    plt.title(f'Countdown Task Performance - {size} Model')
    plt.xlabel('Task')
    plt.ylabel('Accuracy (%)')
    plt.ylim(0, 100)  # Set y-axis from 0 to 100 for percentage
    plt.xticks(rotation=45)
    plt.legend(title='Method')

plt.tight_layout()
plt.savefig('countdown_results_by_size.png', dpi=300, bbox_inches='tight')
plt.show()

# Create a comparison across all methods and tasks as a heatmap
plt.figure(figsize=(14, 10))

# Pivot the data for the heatmap
heatmap_data = df_melted.pivot_table(
    index=['Method', 'Size'], 
    columns='Task', 
    values='Accuracy'
)

# Plot the heatmap
sns.heatmap(heatmap_data, annot=True, cmap="YlGnBu", fmt='.1f', linewidths=.5, vmin=0, vmax=100)
plt.title('Performance Comparison Across Countdown Tasks')
plt.tight_layout()
plt.savefig('countdown_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# Create a line plot to compare methods across tasks for each size
plt.figure(figsize=(15, 6))

for i, size in enumerate(sizes):
    plt.subplot(1, 2, i+1)
    
    # Filter data for this size
    size_df = df_melted[df_melted['Size'] == size]
    
    # Pivot data for line plot
    pivot_df = size_df.pivot(index='Method', columns='Task', values='Accuracy')
    
    # Plot
    sns.lineplot(data=pivot_df, markers=True, dashes=False, markersize=10)
    
    plt.title(f'Method Comparison - {size} Model')
    plt.ylabel('Accuracy (%)')
    plt.ylim(0, 100)
    plt.grid(True)
    plt.legend(title='Task')

plt.tight_layout()
plt.savefig('countdown_method_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

KeyError: "The following id_vars or value_vars are not present in the DataFrame: ['index', 'Countdown:', 'Countdown-3', 'Countdown-5']"

In [None]:
sizes = ["1.5B"]

methods = {
    "No rejection \nsampling": "qwen-2.5-x.5B-instruct-sft-lora-countdown-search-react-seq10k-5k",
    "With rejection \nsampling": "qwen-2.5-x.5B-instruct-sft-lora-countdown-search-react-correct-seq10k-5k"
}

tasks = {
    "Countdown:": "test_128", 
}

results = {size: {method_key: {task_key: None for task_key in tasks.keys()} for method_key in methods.keys()} for size in sizes}
for size_val in sizes:
    for method_key, method_val in methods.items():
        for task_key, task_val in tasks.items():
            folder_to_look_for = f"./{method_val}/{task_val}*.json".replace("x.5B", size_val) 
            specific_file = glob.glob(folder_to_look_for)
            if specific_file:
                results[size_val][method_key][task_key] = parse_results_from_json(specific_file[0])
            
# above works, put their filepath in a pandas df
                
# put results in a pandas df
df = pd.DataFrame.from_dict({(i, j): results[i][j] 
                        for i in results.keys() 
                        for j in results[i].keys()},
                        orient='index')
# Tranpose
df = df.transpose()
# make the df float up to 2 decimals
df = df.round(2)
df.to_latex("results.tex", index=True, float_format="%.2f")
df

Unnamed: 0_level_0,1.5B,1.5B
Unnamed: 0_level_1,No rejection \nsampling,With rejection \nsampling
Countdown:,,0.52


In [None]:
training_sizes = ["1k", "5k"]

methods = {
    "SoS": "qwen-2.5-1.5B-instruct-sft-lora-countdown-search",
    "Distill": "qwen-2.5-1.5B-instruct-sft-lora-countdown-deepseek"
}

tasks = {
    "Countdown:": "test_128", 
}

results = {size: {method_key: {task_key: None for task_key in tasks.keys()} for method_key in methods.keys()} for size in training_sizes}
for size_val in training_sizes:
    for method_key, method_val in methods.items():
        for task_key, task_val in tasks.items():
            folder_to_look_for = f"./{method_val}*{size_val}/{task_val}*.json"
            specific_file = glob.glob(folder_to_look_for)
            if specific_file:
                results[size_val][method_key][task_key] = parse_results_from_json(specific_file[0])
            
# above works, put their filepath in a pandas df
                
# put results in a pandas df
df = pd.DataFrame.from_dict({(i, j): results[i][j] 
                        for i in results.keys() 
                        for j in results[i].keys()},
                        orient='index')
# Tranpose
df = df.transpose()
# make the df float up to 2 decimals
df = df.round(2)
df.to_latex("results.tex", index=True, float_format="%.2f")
df

Unnamed: 0_level_0,1k,1k,5k,5k
Unnamed: 0_level_1,SoS,Distill,SoS,Distill
Countdown:,0.33,0.31,0.52,0.21
