Skip to content

Commit

Permalink
add release
Browse files Browse the repository at this point in the history
  • Loading branch information
kanishkg committed Jun 20, 2023
1 parent 60be172 commit 513c8bd
Show file tree
Hide file tree
Showing 66 changed files with 15,211 additions and 1 deletion.
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
.DS_Store
*.pyc

# Jupyter Notebook
.ipynb_checkpoints
*/.ipynb_checkpoints/*

# satisfy github size req
# find . -size +100M | cat >> .gitignore

# ignore all __pycache__ folders
__pycache__/
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2023 Causality in Cognition Lab
Copyright (c) 2023 Kanishk Gandhi and Jan-Philipp Fränken

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
39 changes: 39 additions & 0 deletions README copy.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
## A Domain-Agnostic Method for Procedurally Generating LLM Evaluations

![Causal Template -> Prompt Template -> Test Items](./assets/generation.jpg)


### 🧐 What is this?
We have developed a method that uses large language models (LLMs) to procedurally generate evaluations for other LLMs. We initially applied this method to assess the performance of LLMs in a subdomain of social reasoning (Theory-of-Mind). Please check out our [paper](https://sites.google.com/view/social-reasoning-lms) for further details.


### 📂 Repo structure
```
├── code
│ └── analysis
│ └── prolific-exp-1
│ └── prolific-exp-2
│ └── prompt_instructions
│ └── scripts
│ └── src
├── data
│ ├── bigtom
│ └── expert_data
│ └── social_iqa
│ └── prolific
├── .gitignore
├── LICENSE
└── requirements.txt
```

### 🚀 Getting started
#### Using miniconda
1. `curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh`
2. `bash Miniconda3-latest-MacOSX-x86_64.sh`
3. close and reopen terminal
4. `source ~/.bashrc`
5. `conda create --name name-of-my-env python==3.10`
6. `conda activate name-of-my-env`
7. `pip install -r requirements.txt`
Binary file added assets/generation.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
29 changes: 29 additions & 0 deletions code/analysis/exp_1_regression.rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
```{r}
library("emmeans") # for linear contrasts
library("tidybayes") # tidying up results from Bayesian models
library("brms") # Bayesian regression models with Stan
library("tidyverse") # for wrangling, plotting, etc.
```

```{r}
df.exp_1 = read_csv("../../data/prolific/exp_1/main_01_long.csv")
df.exp_1$survey_type <- factor(df.exp_1$survey_type, levels = c("ours", "social_iqa", "expert")) # same order as things appear in fig_2 in paper
head(df.exp_1)
```

```{r}
fit.brm_exp_1= brm(formula = average_rating ~ 1 + survey_type + (1 | worker_id) + (1 | item_id),
data = df.exp_1,
seed = 1)
```
```{r}
fit.brm_exp_1 %>%
summary()
```
```{r}
fit.brm_exp_1 %>%
emmeans(specs = pairwise ~ survey_type,
type = "response")
```

71 changes: 71 additions & 0 deletions code/analysis/format_exp_1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import ast
import pandas as pd

DATA_PATH = "../../data/prolific/exp_1/"
N_TRIALS = 30

df_trials = pd.read_csv(DATA_PATH + "main_01_trials_complete.csv")
df_ids = pd.read_csv(DATA_PATH + "main_01_ids_complete.csv")
df_exit = pd.read_csv(DATA_PATH + "main_01_exit_complete.csv")

# Initialize an empty list to hold the transformed data
data = []
a = True
# Loop over each row in the DataFrame
for i, row in df_trials.iterrows():

prolific_id = df_ids.iloc[i]["prolificPid"]
exit_survey = df_exit.iloc[i]
age = exit_survey["age"]
ethnicity = exit_survey["ethnicity"]
gender = exit_survey["gender"]
race = exit_survey["race"]

# Loop over trials
for trial in range(1, N_TRIALS + 1):
# Get item ratings
item = ast.literal_eval(row[f"trial{trial}"])
item_ratings = item["likertResponses"]
ratings = [int(item_ratings[key]) for key in sorted(item_ratings.keys())]
average_rating = sum(ratings) / len(ratings)

# Group survey types
survey_type_dict = {
"dodell": "expert",
"ullman": "expert",
"kosinski": "expert",
"false_belief": "ours",
"true_belief": "ours",
"social_iqa": "social_iqa",
}
survey_type = survey_type_dict.get(item["data_source"], "unknown")

# Append transformed data to the list
data.append({
"data_source": item["data_source"],
"split": row["proliferate.condition"],
"survey_type": survey_type,
"item_id": item["id"],
"worker_id": row["workerid"],
"prolific_id": prolific_id,
"age": age,
"ethnicity": ethnicity,
"gender": gender,
"race": race,
"item_story": item["story"],
"item_question": item["question"],
"item_answers": item["answers"],
"understandability": ratings[0],
"coherent_q_a": ratings[1],
"unambiguous": ratings[2],
"average_rating": average_rating,
})

# Convert the list of dictionaries into a DataFrame
df_long = pd.DataFrame(data)

# Sort DataFrame
df_long.sort_values(by=['survey_type', 'item_id'], inplace=True)

# Save the DataFrame as a CSV file
df_long.to_csv(DATA_PATH + "main_01_long.csv", index=False)
134 changes: 134 additions & 0 deletions code/analysis/format_exp_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import ast
import pandas as pd

DATA_PATH = "../../data/prolific/exp_2/"
N_TRIALS_1= 42
N_TRIALS_2 = 12

df_trials = pd.read_csv(DATA_PATH + "main_02_trials_1.csv")
df_ids = pd.read_csv(DATA_PATH + "main_02_ids_1.csv")
df_exit = pd.read_csv(DATA_PATH + "main_02_exit_1.csv")

df_trials_2 = pd.read_csv(DATA_PATH + "main_02_trials_2.csv")
df_ids_2 = pd.read_csv(DATA_PATH + "main_02_ids_2.csv")
df_exit_2 = pd.read_csv(DATA_PATH + "main_02_exit_2.csv")

# Initialize an empty list to hold the transformed data
data = []


# Loop over each row in the DataFrame
for i, row in df_trials.iterrows():
prolific_id = df_ids.iloc[i]["prolificPid"]
exit_survey = df_exit.iloc[i]
age = exit_survey["age"]
ethnicity = exit_survey["ethnicity"]
gender = exit_survey["gender"]
race = exit_survey["race"]
# Loop over trials
for trial in range(1, N_TRIALS_1 + 1):
item = ast.literal_eval(row[f"trial{trial}"])
item_id = item["id"]
response = item["selected_answer_idx"]
true_answers = item["true_labels"]
correct = int(true_answers[int(response)]) == 1
if item_id == "attention_check_1":
correct = int(response) == 0
elif item_id == "attention_check_2":
correct = int(response) == 1
survey_type = item["data_source"]
# split survey
true_false = survey_type.split("_")[-1]
tf = None
if true_false == "true":
tf = True
elif true_false == "false":
tf = False
if "backward" not in item_id and tf is not None: # ignore backward_desire and backward_belief
# Append transformed data to the list
data.append({
"data_source": item["data_source"],
"split": row["proliferate.condition"],
"survey_type": survey_type,
"item_id": item["id"],
"worker_id": row["workerid"],
"prolific_id": prolific_id,
"age": age,
"ethnicity": ethnicity,
"gender": gender,
"race": race,
"item_story": item["story"],
"item_question": item["question"],
"item_answers": item["answers"],
"item_true_answers": true_answers,
"response": response,
"correct": int(correct),
"true_false": tf,
})
else:
continue


for i, row in df_trials_2.iterrows():
prolific_id = df_ids_2.iloc[i]["prolificPid"]
exit_survey = df_exit_2.iloc[i]
age = exit_survey["age"]
ethnicity = exit_survey["ethnicity"]
gender = exit_survey["gender"]
race = exit_survey["race"]
# Loop over trials
for trial in range(1, N_TRIALS_2 + 1):
item = ast.literal_eval(row[f"trial{trial}"])
item_id = item["id"]
response = item["selected_answer_idx"]
true_answers = item["true_labels"]
correct = int(true_answers[int(response)]) == 1
if item_id == "attention_check_1":
correct = int(response) == 0
elif item_id == "attention_check_2":
correct = int(response) == 1
survey_type = item["data_source"]

# split survey
tf = None
true_false = survey_type.split("_")[-1]
if true_false == "true":
tf = True
elif true_false == "false":
tf = False


# Append transformed data to the list
if tf is not None:
data.append({
"data_source": item["data_source"],
"split": row["proliferate.condition"],
"survey_type": survey_type,
"item_id": item["id"],
"worker_id": row["workerid"],
"prolific_id": prolific_id,
"age": age,
"ethnicity": ethnicity,
"gender": gender,
"race": race,
"item_story": item["story"],
"item_question": item["question"],
"item_answers": item["answers"],
"item_true_answers": true_answers,
"response": response,
"correct": int(correct),
"true_false": tf,
})




# Convert the list of dictionaries into a DataFrame
df_long = pd.DataFrame(data)

# Sort DataFrame
df_long.sort_values(by=['survey_type', 'item_id'], inplace=True)

# Save the DataFrame as a CSV file
df_long.to_csv(DATA_PATH + "main_02_long.csv", index=False)

43 changes: 43 additions & 0 deletions code/analysis/format_expert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pandas as pd
import krippendorff


DATA_PATH_EXPERT_1 = "../../data/ratings/kanishk.csv"
DATA_PATH_EXPERT_2 = "../../data/ratings/philipp.csv"

column_names = ["desired_format", "rating"]
desired_format_mapping = {'yes': 1, 'no': 0}

expert_1 = pd.read_csv(DATA_PATH_EXPERT_1, sep=";", skiprows=1, names=column_names)
expert_2 = pd.read_csv(DATA_PATH_EXPERT_2, sep=";", skiprows=1, names=column_names)

expert_1['rating'] = expert_1['rating'].astype(int)
expert_2['rating'] = expert_2['rating'].astype(int)

expert_1['desired_format'] = expert_1['desired_format'].map(desired_format_mapping)
expert_2['desired_format'] = expert_2['desired_format'].map(desired_format_mapping)

expert_1["item"] = expert_1.index
expert_2["item"] = expert_2.index

expert_1["expert"] = "expert_1"
expert_2["expert"] = "expert_2"

df = pd.concat([expert_1, expert_2], ignore_index=True)

df.to_csv("../../data/ratings/expert_combined.csv", index=False)

a = expert_1['desired_format'].to_list()
b = expert_2['desired_format'].to_list()

df = pd.DataFrame({"Rater 1": a, "Rater 2": b})

percentage_agreement = sum(df["Rater 1"] == df["Rater 2"]) / len(df) * 100

print(f"Percentage Agreement: {percentage_agreement}")


alpha = krippendorff.alpha(df.values.transpose())

print(f"Krippendorff's alpha: {alpha}")

Loading

0 comments on commit 513c8bd

Please sign in to comment.