-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
66 changed files
with
15,211 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| .DS_Store | ||
| *.pyc | ||
|
|
||
| # Jupyter Notebook | ||
| .ipynb_checkpoints | ||
| */.ipynb_checkpoints/* | ||
|
|
||
| # satisfy github size req | ||
| # find . -size +100M | cat >> .gitignore  (shell command to run, not an ignore pattern) | ||
|
|
||
| # ignore all __pycache__ folders | ||
| __pycache__/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| ## | ||
|
|
||
| A Domain-Agnostic Method for Procedurally Generating LLM Evaluations | ||
|
|
||
|  | ||
|
|
||
|
|
||
| ### 🧐 What is this? | ||
| We have developed a method that uses large language models (LLMs) to procedurally generate evaluations for other LLMs. We initially applied this method to assess the performance of LLMs in a subdomain of social reasoning (Theory-of-Mind). Please check out our [paper](https://sites.google.com/view/social-reasoning-lms) for further details. | ||
|
|
||
|
|
||
| ### 📂 Repo structure | ||
| ``` | ||
| ├── code | ||
| │ └── analysis | ||
| │ └── prolific-exp-1 | ||
| │ └── prolific-exp-2 | ||
| │ └── prompt_instructions | ||
| │ └── scripts | ||
| │ └── src | ||
| ├── data | ||
| │ ├── bigtom | ||
| │ └── expert_data | ||
| │ └── social_iqa | ||
| │ └── prolific | ||
| ├── .gitignore | ||
| ├── LICENSE | ||
| └── requirements.txt | ||
| ``` | ||
|
|
||
| ### 🚀 Getting started | ||
| #### Using miniconda | ||
| 1. `curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh` | ||
| 2. `bash Miniconda3-latest-MacOSX-x86_64.sh` | ||
| 3. close and reopen terminal | ||
| 4. `source ~/.bashrc` | ||
| 5. `conda create --name name-of-my-env python==3.10` | ||
| 6. `conda activate name-of-my-env` | ||
| 7. `pip install -r requirements.txt` |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
```{r}
# Modeling / analysis packages.
library("emmeans")     # linear contrasts
library("tidybayes")   # tidying up results from Bayesian models
library("brms")        # Bayesian regression models with Stan
library("tidyverse")   # wrangling, plotting, etc.
```

```{r}
# Load the long-format experiment-1 ratings and fix the factor levels so the
# survey types appear in the same order as fig_2 in the paper.
df.exp_1 = read_csv("../../data/prolific/exp_1/main_01_long.csv")
df.exp_1$survey_type <- factor(df.exp_1$survey_type, levels = c("ours", "social_iqa", "expert"))
head(df.exp_1)
```

```{r}
# Bayesian mixed-effects regression: average rating as a function of survey
# type, with random intercepts per participant and per item.
fit.brm_exp_1 = brm(
  formula = average_rating ~ 1 + survey_type + (1 | worker_id) + (1 | item_id),
  data = df.exp_1,
  seed = 1
)
```
```{r}
summary(fit.brm_exp_1)
```
```{r}
# Pairwise contrasts between survey types on the response scale.
emmeans(fit.brm_exp_1, specs = pairwise ~ survey_type, type = "response")
```
|
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,71 @@ | ||
import ast

import pandas as pd

# Location of the experiment-1 Prolific exports and trials per participant.
DATA_PATH = "../../data/prolific/exp_1/"
N_TRIALS = 30

# Group raw data-source labels into the three survey types compared in the
# paper (hoisted out of the trial loop: it is loop-invariant).
SURVEY_TYPE_BY_SOURCE = {
    "dodell": "expert",
    "ullman": "expert",
    "kosinski": "expert",
    "false_belief": "ours",
    "true_belief": "ours",
    "social_iqa": "social_iqa",
}


def build_long_rows(df_trials, df_ids, df_exit, n_trials=N_TRIALS):
    """Flatten wide per-participant trial exports into one dict per rated item.

    Args:
        df_trials: one row per participant, with a serialized trial dict in
            each of the columns ``trial1`` .. ``trial{n_trials}``, plus
            ``proliferate.condition`` and ``workerid``.
        df_ids: one row per participant with a ``prolificPid`` column,
            positionally aligned with ``df_trials`` (parallel CSV exports).
        df_exit: exit-survey demographics (``age``, ``ethnicity``, ``gender``,
            ``race``), positionally aligned with ``df_trials``.
        n_trials: number of trial columns to read per participant.

    Returns:
        List of flat dicts, one per (participant, trial), including the three
        per-question Likert ratings and their mean.
    """
    rows = []
    for i, row in df_trials.iterrows():
        # The three frames come from parallel exports, so positional indexing
        # lines up across them.
        prolific_id = df_ids.iloc[i]["prolificPid"]
        exit_survey = df_exit.iloc[i]

        for trial in range(1, n_trials + 1):
            # Each trial cell is a Python-literal dict serialized to a string.
            item = ast.literal_eval(row[f"trial{trial}"])
            item_ratings = item["likertResponses"]
            # Sort by question key for a stable rating order:
            # [understandability, coherent_q_a, unambiguous].
            ratings = [int(item_ratings[key]) for key in sorted(item_ratings.keys())]
            average_rating = sum(ratings) / len(ratings)

            rows.append({
                "data_source": item["data_source"],
                "split": row["proliferate.condition"],
                "survey_type": SURVEY_TYPE_BY_SOURCE.get(item["data_source"], "unknown"),
                "item_id": item["id"],
                "worker_id": row["workerid"],
                "prolific_id": prolific_id,
                "age": exit_survey["age"],
                "ethnicity": exit_survey["ethnicity"],
                "gender": exit_survey["gender"],
                "race": exit_survey["race"],
                "item_story": item["story"],
                "item_question": item["question"],
                "item_answers": item["answers"],
                "understandability": ratings[0],
                "coherent_q_a": ratings[1],
                "unambiguous": ratings[2],
                "average_rating": average_rating,
            })
    return rows


def main():
    """Convert the wide experiment-1 exports into a sorted long-format CSV."""
    df_trials = pd.read_csv(DATA_PATH + "main_01_trials_complete.csv")
    df_ids = pd.read_csv(DATA_PATH + "main_01_ids_complete.csv")
    df_exit = pd.read_csv(DATA_PATH + "main_01_exit_complete.csv")

    df_long = pd.DataFrame(build_long_rows(df_trials, df_ids, df_exit))
    df_long.sort_values(by=["survey_type", "item_id"], inplace=True)
    df_long.to_csv(DATA_PATH + "main_01_long.csv", index=False)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,134 @@ | ||
import ast

import pandas as pd

# Location of the experiment-2 Prolific exports.
DATA_PATH = "../../data/prolific/exp_2/"
# Trials per participant in each of the two data-collection batches.
N_TRIALS_1 = 42
N_TRIALS_2 = 12

# Maps the true/false suffix of a data-source label to a boolean condition.
_TRUE_FALSE = {"true": True, "false": False}


def collect_rows(df_trials, df_ids, df_exit, n_trials, skip_backward):
    """Flatten one batch of wide trial exports into a list of per-item dicts.

    Args:
        df_trials: one row per participant with serialized trial dicts in
            columns ``trial1`` .. ``trial{n_trials}``, plus
            ``proliferate.condition`` and ``workerid``.
        df_ids: one row per participant with a ``prolificPid`` column,
            positionally aligned with ``df_trials``.
        df_exit: exit-survey demographics (``age``, ``ethnicity``, ``gender``,
            ``race``), positionally aligned with ``df_trials``.
        n_trials: number of trial columns to read per participant.
        skip_backward: if True, drop items whose id contains "backward"
            (batch 1 excludes backward_desire / backward_belief items).

    Returns:
        List of flat dicts, one per kept item; items whose data-source label
        has no true/false suffix (e.g. attention checks) are dropped.
    """
    rows = []
    for i, row in df_trials.iterrows():
        # Parallel CSV exports: positional indexing lines up across frames.
        prolific_id = df_ids.iloc[i]["prolificPid"]
        exit_survey = df_exit.iloc[i]

        for trial in range(1, n_trials + 1):
            # Each trial cell is a Python-literal dict serialized to a string.
            item = ast.literal_eval(row[f"trial{trial}"])
            item_id = item["id"]
            response = item["selected_answer_idx"]
            true_answers = item["true_labels"]

            # Attention checks have fixed expected answers; real items are
            # scored against their true-label vector. (Checking the id first
            # also avoids indexing true_labels on attention-check items.)
            if item_id == "attention_check_1":
                correct = int(response) == 0
            elif item_id == "attention_check_2":
                correct = int(response) == 1
            else:
                correct = int(true_answers[int(response)]) == 1

            survey_type = item["data_source"]
            # The condition (true/false belief) is encoded as the label
            # suffix; items without a recognizable suffix drop out.
            tf = _TRUE_FALSE.get(survey_type.split("_")[-1])
            if tf is None:
                continue
            if skip_backward and "backward" in item_id:
                continue

            rows.append({
                "data_source": item["data_source"],
                "split": row["proliferate.condition"],
                "survey_type": survey_type,
                "item_id": item_id,
                "worker_id": row["workerid"],
                "prolific_id": prolific_id,
                "age": exit_survey["age"],
                "ethnicity": exit_survey["ethnicity"],
                "gender": exit_survey["gender"],
                "race": exit_survey["race"],
                "item_story": item["story"],
                "item_question": item["question"],
                "item_answers": item["answers"],
                "item_true_answers": true_answers,
                "response": response,
                "correct": int(correct),
                "true_false": tf,
            })
    return rows


def main():
    """Convert both experiment-2 batches into one sorted long-format CSV."""
    data = []
    # Batch 1 (42 trials) excludes backward items; batch 2 (12 trials) keeps them.
    for suffix, n_trials, skip_backward in (
        ("1", N_TRIALS_1, True),
        ("2", N_TRIALS_2, False),
    ):
        df_trials = pd.read_csv(f"{DATA_PATH}main_02_trials_{suffix}.csv")
        df_ids = pd.read_csv(f"{DATA_PATH}main_02_ids_{suffix}.csv")
        df_exit = pd.read_csv(f"{DATA_PATH}main_02_exit_{suffix}.csv")
        data.extend(collect_rows(df_trials, df_ids, df_exit, n_trials, skip_backward))

    df_long = pd.DataFrame(data)
    df_long.sort_values(by=["survey_type", "item_id"], inplace=True)
    df_long.to_csv(DATA_PATH + "main_02_long.csv", index=False)


if __name__ == "__main__":
    main()
|
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
import pandas as pd
import krippendorff

# Per-expert rating exports (semicolon-separated, one header row to skip).
DATA_PATH_EXPERT_1 = "../../data/ratings/kanishk.csv"
DATA_PATH_EXPERT_2 = "../../data/ratings/philipp.csv"

COLUMN_NAMES = ["desired_format", "rating"]
# Encode the yes/no desired-format judgment as 1/0 for the agreement stats.
DESIRED_FORMAT_MAPPING = {"yes": 1, "no": 0}


def load_expert(path, label):
    """Read one expert's ratings and normalize them for combination.

    Args:
        path: semicolon-separated CSV with a header row and two columns
            (desired_format yes/no, integer rating).
        label: identifier stored in the ``expert`` column (e.g. "expert_1").

    Returns:
        DataFrame with integer ``rating``, 0/1 ``desired_format``, a
        positional ``item`` id, and the ``expert`` label.
    """
    df = pd.read_csv(path, sep=";", skiprows=1, names=COLUMN_NAMES)
    df["rating"] = df["rating"].astype(int)
    df["desired_format"] = df["desired_format"].map(DESIRED_FORMAT_MAPPING)
    df["item"] = df.index  # row order identifies the item across experts
    df["expert"] = label
    return df


def main():
    """Combine both experts' ratings and report agreement statistics."""
    expert_1 = load_expert(DATA_PATH_EXPERT_1, "expert_1")
    expert_2 = load_expert(DATA_PATH_EXPERT_2, "expert_2")

    # Long-format combination of both raters, saved for downstream analysis.
    df_combined = pd.concat([expert_1, expert_2], ignore_index=True)
    df_combined.to_csv("../../data/ratings/expert_combined.csv", index=False)

    # Wide item-by-rater table used for the agreement statistics
    # (distinct name from df_combined: previously both were called `df`).
    df_raters = pd.DataFrame({
        "Rater 1": expert_1["desired_format"].to_list(),
        "Rater 2": expert_2["desired_format"].to_list(),
    })

    percentage_agreement = sum(df_raters["Rater 1"] == df_raters["Rater 2"]) / len(df_raters) * 100
    print(f"Percentage Agreement: {percentage_agreement}")

    # krippendorff.alpha expects one row per rater, one column per item.
    alpha = krippendorff.alpha(df_raters.values.transpose())
    print(f"Krippendorff's alpha: {alpha}")


if __name__ == "__main__":
    main()
|
|
Oops, something went wrong.