In [None]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.16.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.39.1-py2.py3-none-any.whl (254 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.1/254.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wan

In [None]:
from google.colab import drive

# Make Google Drive available under /content/drive for this runtime.
drive.mount('/content/drive')

# Drive folder that the CSV files written by this notebook are placed under.
GDRIVE_BASE_PATH = "/content/drive/MyDrive/Can Language Models Follow Discussions?/probing_results/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import wandb
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from tqdm.auto import tqdm

# Authenticate with W&B

In [None]:
# NOTE(review): WANDB_API_KEY is left empty and is not referenced by any other
# visible cell; the CLI login below asks for the key interactively instead.
WANDB_API_KEY = ""
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Account name used as the "<user>/<project>" prefix when listing W&B runs.
WANDB_USER_NAME = "digwit"
TIMEOUT = 60  # Increase the API timeout to 60 seconds
#wandb.login()
# W&B public-API client that is handed to the fetch_* helpers below.
api = wandb.Api(timeout=TIMEOUT)

# CONSTANTS

In [None]:
# W&B projects whose runs are fetched and checked for completeness.
PROJECT_NAMES = [
    "1_4_light_probes",
    "4_different_depths",
    "3_reactiveness",
    "5_claim_path_density",
    "1_opposing_affirming_claims",
    "2_Sequential-Claims-Hard",
]

# Column names looked up in each run's logged history.
METRICS = ["full test f1", "full test acc"]

# Values of the `control_task_type` run-config entry that are expected to exist.
CONTROL_TASKS = ["RANDOMIZATION", "NONE", "PERMUTATION"]

# Values of the `model_name` run-config entry that are expected to exist.
MODELS = [
    "microsoft/deberta-v3-base",
    "facebook/bart-base",
    "gpt2",
    "albert-base-v2",
    "bert-base-uncased",
]

SEEDS = list(range(5))  # 0..4
FOLDS = list(range(4))  # 0..3

# Fetch Wandb Data

## Discussion Topic Included

In [None]:
# Location (joined onto GDRIVE_BASE_PATH) of the CSV with the fetched run metrics.
SAVING_PATH = 'probing_results/wandb_data_2.csv'

In [None]:
def fetch_initial_data(api, user_name, project_names, metrics, control_tasks):
    """
    Collects one record per (finished run, metric) from the given W&B projects.

    :param api: W&B public API client; only ``api.runs(path)`` is called on it.
    :param user_name: String, owner of the projects (runs are listed under
        ``"<user_name>/<project_name>"``).
    :param project_names: Iterable of project names to scan.
    :param metrics: Iterable of history column names to aggregate.
    :param control_tasks: Not used by this function; kept so existing call
        sites keep working.
    :return: List of dicts with the keys ``project``, ``model``, ``seed``,
        ``fold``, ``control_task``, ``metric`` and ``value``. ``value`` is the
        mean of the metric column over the run's history. Config entries that
        are missing fall back to ``"Unknown"`` (strings) or ``-1`` (seed, fold).
    """
    data = []
    for project_name in project_names:
        for run in api.runs(f"{user_name}/{project_name}"):
            if run.state != "finished":
                continue

            run_config = run.config
            model_name = run_config.get("model_name", "Unknown")
            seed = run_config.get("seed", -1)
            fold = run_config.get("fold", -1)
            control_task_type = run_config.get("control_task_type", "Unknown")

            # Fetch the history once per run instead of twice per metric
            # (once for the column check and once for the mean).
            history = run.history()

            for metric in metrics:
                if metric not in history.columns:
                    continue
                data.append({
                    "project": project_name,
                    "model": model_name,
                    "seed": seed,
                    "fold": fold,
                    "control_task": control_task_type,
                    "metric": metric,
                    "value": history[metric].mean(),  # Assuming mean for aggregation
                })
    return data


# Pull the metrics of every finished run in the configured projects.
initial_data = fetch_initial_data(
    api=api,
    user_name=WANDB_USER_NAME,
    project_names=PROJECT_NAMES,
    metrics=METRICS,
    control_tasks=CONTROL_TASKS,
)

# One row per (run, metric); persisted to Drive without the index column.
initial_df = pd.DataFrame(initial_data)
initial_df.to_csv(
    f'{GDRIVE_BASE_PATH}/{SAVING_PATH}',
    index=False,
)

print("Initial data saved.")

In [None]:
# Preview the first rows of the run-level metrics table.
initial_df.head()

Unnamed: 0,project,model,seed,fold,control_task,metric,value
0,1_4_light_probes,microsoft/deberta-v3-base,2,2,NONE,full test f1,0.793477
1,1_4_light_probes,microsoft/deberta-v3-base,2,2,NONE,full test acc,0.797944
2,1_4_light_probes,microsoft/deberta-v3-base,4,3,NONE,full test f1,0.790188
3,1_4_light_probes,microsoft/deberta-v3-base,4,3,NONE,full test acc,0.79455
4,1_4_light_probes,microsoft/deberta-v3-base,2,3,NONE,full test f1,0.790557


## Without Discussion Topic

## v1

In [None]:
def fetch_and_append_data(api, user_name, probe_numbers, metrics, control_tasks, hours_back):
    """
    Fetches the metrics of finished "no context" runs and merges them into the
    CSV at the module-level ``NO_CONTEXT_FILE_PATH``.

    For every probe number the project ``probe_<n>_no_context`` is scanned. The
    CSV is rewritten after each project, so progress is kept if a later project
    fails. Exact duplicate rows are dropped.

    :param api: W&B public API client; only ``api.runs(path)`` is called on it.
    :param user_name: String, owner of the projects.
    :param probe_numbers: Iterable of probe identifiers used to build the
        project names.
    :param metrics: Iterable of history column names to aggregate (mean).
    :param control_tasks: Not used by this function; kept for call-site
        compatibility.
    :param hours_back: Not used at the moment (no time filter is applied);
        kept for call-site compatibility.
    :return: None. The result is written to ``NO_CONTEXT_FILE_PATH``.
    """
    existing_df = pd.DataFrame()
    if os.path.exists(NO_CONTEXT_FILE_PATH):
        try:
            existing_df = pd.read_csv(NO_CONTEXT_FILE_PATH)
        except pd.errors.EmptyDataError:
            # The file exists but holds no columns, e.g. because an earlier
            # call saved an empty frame. Start over instead of crashing.
            print("Creating new dataframe")
    else:
        print("Creating new dataframe")

    for probe_number in probe_numbers:
        new_data = []
        project_name = f"probe_{probe_number}_no_context"
        runs = api.runs(f"{user_name}/{project_name}")

        for run in tqdm(runs, desc=f"Processing {project_name}"):
            if run.state != "finished":
                continue

            run_config = run.config
            model_name = run_config.get("model_name", "Unknown")
            seed = run_config.get("seed", -1)
            fold = run_config.get("fold", -1)
            control_task_type = run_config.get("control_task_type", "Unknown")

            # Fetch the history once per run instead of twice per metric.
            history = run.history()

            for metric in metrics:
                if metric not in history.columns:
                    continue
                new_data.append({
                    "project": project_name,
                    "model": model_name,
                    "seed": seed,
                    "fold": fold,
                    "control_task": control_task_type,
                    "metric": metric,
                    "value": history[metric].mean(),
                    "no_context": True
                })

        new_df = pd.DataFrame(new_data)

        # Carry the merged frame over to the next project. Merging every
        # project into the frame loaded at the start would make each save
        # overwrite the rows of the projects processed before it.
        existing_df = pd.concat([existing_df, new_df]).drop_duplicates().reset_index(drop=True)
        existing_df.to_csv(NO_CONTEXT_FILE_PATH, index=False)

        print(f"Data updated and saved for {project_name}.")

In [None]:
# 480 hours = 20 days.
# NOTE(review): fetch_and_append_data does not apply a time filter at the
# moment, so this value currently has no effect.
hours_back = 480
PROBE_NUMBERS = ["1", "2", "3", "4", "5"]
# Separate name on purpose: rebinding the notebook-wide CONTROL_TASKS constant
# here would silently narrow the missing-run check further down to "NONE" only.
NO_CONTEXT_CONTROL_TASKS = ["NONE"]
NO_CONTEXT_FILE_PATH = f'{GDRIVE_BASE_PATH}/wandb_no_context_runs.csv'
fetch_and_append_data(api, WANDB_USER_NAME, PROBE_NUMBERS, METRICS, NO_CONTEXT_CONTROL_TASKS, hours_back)

Processing probe_3_no_context:   0%|          | 0/502 [00:00<?, ?it/s]

Data updated and saved for probe_3_no_context.


## v0

In [None]:
# Location (joined onto GDRIVE_BASE_PATH) of the CSV written by the date-filtered fetch.
NO_CONTEXT_FILE_PATH_0 = 'probing_results/wandb_data_no_context.csv'

In [None]:
def fetch_filtered_data(api, user_name, project_names, metrics, control_tasks, start_date):
    """
    Collects one record per (finished run, metric) for runs created on or after
    ``start_date``.

    :param api: W&B public API client; only ``api.runs(path)`` is called on it.
    :param user_name: String, owner of the projects.
    :param project_names: Iterable of project names to scan.
    :param metrics: Iterable of history column names to aggregate (mean).
    :param control_tasks: Not used by this function; kept for call-site
        compatibility.
    :param start_date: String in ``YYYY-MM-DD`` format; runs created before
        midnight of that day are skipped.
    :return: List of dicts with the keys ``project``, ``model``, ``seed``,
        ``fold``, ``control_task``, ``metric`` and ``value``.
    :raises ValueError: If ``start_date`` does not match ``%Y-%m-%d``.
    """
    start_timestamp = datetime.strptime(start_date, '%Y-%m-%d').timestamp()

    data = []
    for project_name in project_names:
        for run in api.runs(f"{user_name}/{project_name}"):
            if run.state != "finished":
                continue

            # The public API may hand back `created_at` as an ISO-8601 string
            # rather than a datetime; accept both so `.timestamp()` is valid.
            created_at = run.created_at
            if isinstance(created_at, str):
                # A trailing "Z" is not understood by fromisoformat on older Pythons.
                created_at = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
            # NOTE(review): values without timezone info are interpreted in the
            # local timezone, exactly like start_date. Presumably W&B reports
            # UTC - confirm if this runs on a machine that is not set to UTC.
            if created_at.timestamp() < start_timestamp:
                continue

            run_config = run.config
            model_name = run_config.get("model_name", "Unknown")
            seed = run_config.get("seed", -1)
            fold = run_config.get("fold", -1)
            control_task_type = run_config.get("control_task_type", "Unknown")

            # Fetch the history once per run instead of twice per metric.
            history = run.history()

            for metric in metrics:
                if metric not in history.columns:
                    continue
                data.append({
                    "project": project_name,
                    "model": model_name,
                    "seed": seed,
                    "fold": fold,
                    "control_task": control_task_type,
                    "metric": metric,
                    "value": history[metric].mean(),  # Assuming mean for aggregation
                })
    return data


In [None]:
# Only runs created on or after this day (YYYY-MM-DD) are kept.
start_date = "2023-12-14"
filtered_data = fetch_filtered_data(
    api=api,
    user_name=WANDB_USER_NAME,
    project_names=PROJECT_NAMES,
    metrics=METRICS,
    control_tasks=CONTROL_TASKS,
    start_date=start_date,
)

# One row per (run, metric); persisted to Drive without the index column.
filtered_df = pd.DataFrame(filtered_data)
filtered_df.to_csv(
    f'{GDRIVE_BASE_PATH}/{NO_CONTEXT_FILE_PATH_0}',
    index=False,
)

print("Filtered data saved.")

 # Check for missing runs / parameter configurations

In [None]:
#todo refactor
def check_missing_data(data):
    """
    Lists every configuration that does not have exactly one row per metric.

    A configuration is a (project, model, seed, fold, control_task) tuple built
    from PROJECT_NAMES, the models present in ``data``, seeds 0-4, folds 0-3 and
    CONTROL_TASKS.

    :param data: DataFrame with the columns ``project``, ``model``, ``seed``,
        ``fold`` and ``control_task``.
    :return: List of (project, model, seed, fold, control_task) tuples whose
        row count in ``data`` differs from ``len(METRICS)``.
    """
    return [
        (project, model, seed, fold, control_task)
        for project in PROJECT_NAMES
        for model in data['model'].unique()
        for seed in range(5)  # Seeds 0 to 4
        for fold in range(4)  # Folds 0 to 3
        for control_task in CONTROL_TASKS
        if data[
            (data['project'] == project)
            & (data['model'] == model)
            & (data['seed'] == seed)
            & (data['fold'] == fold)
            & (data['control_task'] == control_task)
        ].shape[0] != len(METRICS)
    ]


# For every project, record each expected (seed, model, fold, control task)
# combination that has no row at all in initial_df.
missing_combinations = {}
total_missing_count = 0

for project in PROJECT_NAMES:
    missing = [
        (seed, model, fold, task.lower())
        for model in MODELS
        for seed in SEEDS
        for fold in FOLDS
        for task in CONTROL_TASKS
        if not (
            (initial_df['project'] == project)
            & (initial_df['model'] == model)
            & (initial_df['seed'] == seed)
            & (initial_df['fold'] == fold)
            & (initial_df['control_task'] == task)
        ).any()
    ]
    missing_combinations[project] = missing

    project_missing_count = len(missing)
    total_missing_count += project_missing_count
    print(f"Project '{project}' is missing {project_missing_count} combinations.")



In [None]:
# Missing (seed, model, fold, control task) tuples of project '4_different_depths'.
missing_combinations['4_different_depths']

In [None]:
# Missing (seed, model, fold, control task) tuples of project '5_claim_path_density'.
missing_combinations['5_claim_path_density']

In [None]:
# Missing (seed, model, fold, control task) tuples of project '3_reactiveness'.
missing_combinations['3_reactiveness']

## Generate a single chained command to rerun missing runs in probing framework


In [None]:
# Build the rerun commands for all missing combinations.
# The command line carries no fold argument, so combinations that differ only
# in their fold yield the very same command. Dict keys drop those repeats
# while keeping the first-seen order.
unique_commands = {}
for project, missing in missing_combinations.items():
    probe_id = project.split('_')[0]  # leading number of the project name
    for seed, model, fold, task in missing:
        cmd = f"python3 run_tasks.py --seeds {seed} --models {model} --control_tasks {task} --include_probes {probe_id} --attach_topic True"
        unique_commands[cmd] = None

all_commands = list(unique_commands)
chained_command = " && ".join(all_commands)
print(f"All projects combined are missing a total of {total_missing_count} combinations.")
print(f"Chained command to rerun all missing combinations:\n{chained_command}")


All projects combined are missing a total of 167 combinations.
Chained command to rerun all missing combinations:
python3 run_tasks.py --seeds 1 --models facebook/bart-base --control_tasks RANDOMIZATION --include_probes 4 --attach_topic True && python3 run_tasks.py --seeds 3 --models facebook/bart-base --control_tasks RANDOMIZATION --include_probes 4 --attach_topic True && python3 run_tasks.py --seeds 4 --models facebook/bart-base --control_tasks RANDOMIZATION --include_probes 4 --attach_topic True && python3 run_tasks.py --seeds 4 --models facebook/bart-base --control_tasks RANDOMIZATION --include_probes 4 --attach_topic True && python3 run_tasks.py --seeds 3 --models bert-base-uncased --control_tasks RANDOMIZATION --include_probes 4 --attach_topic True && python3 run_tasks.py --seeds 0 --models microsoft/deberta-v3-base --control_tasks RANDOMIZATION --include_probes 3 --attach_topic True && python3 run_tasks.py --seeds 0 --models microsoft/deberta-v3-base --control_tasks NONE --inclu

# TODO: TEST/REMOVE Append Data to Analysis Dataframe

In [None]:
# Already renamed for the test case (presumably the "_2" suffix - confirm).
# Both locations are joined onto GDRIVE_BASE_PATH before use.
saving_path_aggregated = "probing_results/updated_aggregated_mann_whitney_u_test_results_2.csv"
saving_path_runs = "probing_results/all_run_level_mann_whitney_u_test_results_2.csv"

In [None]:
def append_to_csv(file_path, new_data):
    """
    Appends new data to an existing CSV file.

    The existing file is read completely, the new rows are concatenated below
    the existing ones and the result is written back without an index column.
    A missing or empty file is treated as "no existing rows".

    :param file_path: String, the path to the CSV file.
    :param new_data: DataFrame, the new data to append.
    """
    # Load existing data from CSV
    try:
        existing_data = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"No existing file found at {file_path}. A new file will be created.")
        existing_data = pd.DataFrame()
    except pd.errors.EmptyDataError:
        # The file exists but has no columns to parse (e.g. it is zero bytes
        # long); without this branch the append would crash.
        print(f"Existing file at {file_path} is empty. It will be overwritten.")
        existing_data = pd.DataFrame()

    # Append new data
    updated_data = pd.concat([existing_data, new_data], ignore_index=True)

    # Save updated data to CSV
    updated_data.to_csv(file_path, index=False)
    print(f"Data successfully appended to {file_path}.")


# File paths for existing CSV files


aggregated_csv_path = f'{GDRIVE_BASE_PATH}/{saving_path_aggregated}'
run_level_csv_path = f'{GDRIVE_BASE_PATH}/{saving_path_runs}'

# Example usage:
# Assuming 'new_aggregated_data' and 'new_run_level_data' are the new DataFrames to append
# NOTE(review): neither name is assigned in any visible cell of this notebook,
# so these calls raise NameError on a fresh kernel unless both are defined first.
append_to_csv(aggregated_csv_path, new_aggregated_data)
append_to_csv(run_level_csv_path, new_run_level_data)


# Save Dataframe

In [None]:
#initial_df = pd.read_csv(f'{GDRIVE_BASE_PATH}/{SAVING_PATH}')

In [None]:
# Preview the first rows of the run-level metrics table.
initial_df.head()

Unnamed: 0,project,model,seed,fold,control_task,metric,value
0,1_4_light_probes,microsoft/deberta-v3-base,2,2,NONE,full test f1,0.793477
1,1_4_light_probes,microsoft/deberta-v3-base,2,2,NONE,full test acc,0.797944
2,1_4_light_probes,microsoft/deberta-v3-base,4,3,NONE,full test f1,0.790188
3,1_4_light_probes,microsoft/deberta-v3-base,4,3,NONE,full test acc,0.79455
4,1_4_light_probes,microsoft/deberta-v3-base,2,3,NONE,full test f1,0.790557


In [None]:
# NOTE(review): same value as SAVING_PATH and not referenced by any visible cell.
SAVING_PATH_2 = 'probing_results/wandb_data_2.csv'