In [11]:
from string import Template
import pandas as pd 
from openai import OpenAI
import re
import os
from pandas import Series

In [24]:


# Read the API key from the environment instead of embedding it in the notebook.
# An explicit empty string is sent as-is and cannot authenticate; passing None
# (variable unset) lets the client report the missing key itself.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def extract_score_and_explanation(response_text):
    """Pull the numeric score and the explanation out of a model reply.

    The reply is expected to contain a ``score: <number>`` field followed, on a
    later line, by an ``explanation: <text>`` field (case-insensitive).

    Args:
        response_text: Raw reply text. May be ``None`` or a non-string value
            when no text content was returned.

    Returns:
        A ``(score, explanation)`` tuple of strings, where ``score`` is the
        matched digits (e.g. ``"6"`` or ``"4.5"``), or ``(None, None)`` when the
        reply is missing or does not contain both fields.
    """
    # A missing reply is simply unparseable; without this guard re.search
    # raises TypeError and aborts the whole collection loop.
    if not isinstance(response_text, str):
        return None, None

    # DOTALL lets the explanation span several lines; because the middle ".*"
    # is greedy, the last "\nexplanation:" in the text is the one that anchors.
    pattern = re.compile(r"score:\s*(\d+(?:\.\d+)?)\s*.*\nexplanation:\s*(.*)", re.DOTALL | re.IGNORECASE)
    matches = pattern.search(response_text)
    if matches is None:
        return None, None
    return matches.group(1), matches.group(2)

def query_openai(prompt: str) -> str:
    """Send ``prompt`` as a single user message to gpt-4o-mini and return the reply text.

    Args:
        prompt: Full prompt text to submit.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }
    request_options = dict(
        model="gpt-4o-mini",
        messages=[user_message],
        temperature=1,
        max_tokens=150,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        response_format={"type": "text"},
    )
    completion = client.chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content

# Prompt text sent for every questionnaire statement. It lists the 7-point
# agreement scale, shows one worked example in the "Score: ... / Explanation: ..."
# layout, and ends with a `$statement` placeholder that string.Template fills in.
prompt = """
Please answer each statement below by providing a score of 1 to 7 that best reflects your degree of agreement or disagreement with the provided statement. 
First provide the score and then a brief explanation of why you selected that score. Do not think too long about the exact meaning of the statements. 
Work quickly and try to answer as accurately as possible. There are no right or wrong answers. The seven possible scores are:
1. Completely Disagree
2. Strongly Disagree 
3. Weakly Disagree 
4. Neither Agree nor Disagree 
5. Weakly Agree 
6. Strongly Agree 
7. Completely Agree

Example:
Statement
I find answering questions easy. 

Score: 6. Strongly Agree 
Explanation: I find answering questions relatively straightforward because I have access to a vast amount of information and a well-structured 
way of processing inquiries. While some questions may require deeper thought or analysis, the majority can typically be addressed efficiently and confidently. 

Statement
$statement

## Response
"""
# Template wrapper so each statement can be substituted for `$statement`.
prompt_template = Template(prompt)

# Questionnaire items; the collection loop reads the "Question" and "TQN" columns of each row.
questions = pd.read_csv('../data/teique-final.csv')

In [39]:
import tqdm.notebook as tqdm

# Ask the model about every questionnaire statement and keep one record per row:
# the statement, the raw reply, the parsed score/explanation, and the item's TQN.
responses = []
progress = tqdm.tqdm(questions.iterrows(), total=len(questions))
for index, row in progress:
    statement = row["Question"]
    prompt = prompt_template.substitute(statement=statement)
    response = query_openai(prompt)
    score, explanation = extract_score_and_explanation(response)
    record = dict(
        statement=statement,
        response=response,
        score=score,
        explanation=explanation,
        tqn=row["TQN"],
    )
    responses.append(record)

# Build the frame once from the collected records.
results = pd.DataFrame(responses)

  0%|          | 0/30 [00:00<?, ?it/s]

In [55]:
# Parsed scores are strings, or None when a reply could not be parsed.
# float(None) raises TypeError, so coerce unparseable entries to NaN instead of
# aborting; astype(float) keeps the column float even when every score is integral.
results["score"] = pd.to_numeric(results["score"], errors="coerce").astype(float)

In [56]:
# Saved as "teique-results.csv": the previous "tieque" spelling did not match the
# file name that the aggregation and analysis cells read back from this directory.
results.to_csv("../results/gpt4o-mini/teique-results.csv", index=False)

TEIQUE Scoring

In [2]:
# TQN item numbers grouped by the factor whose mean score they contribute to.
wellbeing_idxs = [
    5, 20,
    9, 24,
    12, 27,
]
self_control_idxs = [
    4, 19,
    7, 22,
    15, 30,
]
emotionality_idxs = [
    1, 16,
    2, 17,
    8, 23,
    13, 28,
]
sociability_idxs = [
    6, 21,
    10, 25,
    11, 26,
]

In [57]:


def _facet_mean(item_numbers):
    """Mean of ``results['score']`` over the rows whose ``tqn`` is in ``item_numbers``."""
    in_facet = results["tqn"].isin(item_numbers)
    return results.loc[in_facet, "score"].mean()


# NOTE(review): scores are averaged exactly as the model gave them; no item is
# reverse-keyed here. Confirm that matches the questionnaire's scoring key.
wellbeing = _facet_mean(wellbeing_idxs)
print(f"Wellbeing: {wellbeing}")

self_contol = _facet_mean(self_control_idxs)
print(f"Self Control: {self_contol}")

emotionality = _facet_mean(emotionality_idxs)
print(f"Emotionality: {emotionality}")

sociability = _facet_mean(sociability_idxs)
print(f"Sociability: {sociability}")

Wellbeing: 4.333333333333333
Self Control: 4.333333333333333
Emotionality: 4.0
Sociability: 4.833333333333333


## Extending TEIQUE facet calculation for all predictions



In [7]:
FILE_NAME = 'teique-results.csv'
RESULTS_MODS = '../results/'


def _mean_sdev(scores):
    """Format a score Series as ``mean: <m> sdev: <s>``, both rounded to 2 decimals."""
    return f"mean: {round(scores.mean(), 2)} sdev: {round(scores.std(), 2)}"


# One row per model directory: [model name, wellbeing, self control, emotionality, sociability].
result_arx = []
# os.listdir returns entries in arbitrary order; sort so the table is reproducible.
for model_dir in sorted(os.listdir(RESULTS_MODS)):
    results_path = os.path.join(RESULTS_MODS, model_dir, FILE_NAME)
    # Skip anything under ../results/ that is not a model directory holding a
    # results file (e.g. the analysis/ output folder or stray files); otherwise
    # read_csv raises and the whole cell fails on a re-run.
    if not os.path.isfile(results_path):
        continue
    df = pd.read_csv(results_path)
    wellbeing = df.query("tqn in @wellbeing_idxs")["score"]
    self_contol = df.query("tqn in @self_control_idxs")["score"]
    emotionality = df.query("tqn in @emotionality_idxs")["score"]
    sociability = df.query("tqn in @sociability_idxs")["score"]
    result_arx.append([model_dir,
                       _mean_sdev(wellbeing),
                       _mean_sdev(self_contol),
                       _mean_sdev(emotionality),
                       _mean_sdev(sociability)])


In [9]:
# Show the per-model facet summaries as a table, one row per model.
facet_columns = ['Model Name', 'wellbeing', 'self control', 'emotionality', 'sociability']
pd.DataFrame(result_arx, columns=facet_columns)

Unnamed: 0,Model Name,wellbeing,self control,emotionality,sociability
0,gemma-2-27b-it,mean: 4.67 sdev: 2.16,mean: 4.5 sdev: 1.38,mean: 4.62 sdev: 1.6,mean: 4.67 sdev: 1.03
1,Mixtral-8x7B-Instruct-v0.1,mean: 4.83 sdev: 2.99,mean: 3.83 sdev: 2.04,mean: 3.5 sdev: 1.69,mean: 3.67 sdev: 1.86
2,Meta-Llama-3.1-70B-Instruct,mean: 4.83 sdev: 2.32,mean: 3.33 sdev: 1.51,mean: 3.0 sdev: 1.77,mean: 3.67 sdev: 1.86
3,gpt4o-mini,mean: 4.33 sdev: 1.86,mean: 4.33 sdev: 0.82,mean: 4.0 sdev: 1.2,mean: 4.83 sdev: 0.98


## TEIQUE ANALYSIS

1. Combine all the LLM results for TEIQUE.
2. Calculate STDEV across TEIQUE scores.
3. Separate results with [high variance](../results/analysis/teique_high_variance.csv) i.e. where models disagree on the self-appraisal of their abilities.
4. Separate results with [high similarity](../results/analysis/teique_high_similarity.csv) i.e. where models agree on the self-appraisal of their abilities.

In [23]:
## Joining the TEIQUE results of the four models on the item number (tqn)
df_gptmini = pd.read_csv('../results/gpt4o-mini/teique-results.csv')
df_meta_lama = pd.read_csv('../results/Meta-Llama-3.1-70B-Instruct/teique-results.csv')
df_mixtral = pd.read_csv('../results/Mixtral-8x7B-Instruct-v0.1/teique-results.csv')
df_gemma = pd.read_csv('../results/gemma-2-27b-it/teique-results.csv')

# Keep only the columns needed for the comparison; the statement text is taken
# from the gpt4o-mini file alone.
df_gptmini = df_gptmini[['tqn', 'statement', 'score', 'explanation']]
df_meta_lama = df_meta_lama[['tqn', 'score', 'explanation']]
df_mixtral = df_mixtral[['tqn', 'score', 'explanation']]
df_gemma = df_gemma[['tqn', 'score', 'explanation']]


def _tag_model_columns(frame, model_tag):
    """Return a copy of ``frame`` with score/explanation renamed to ``<column>_<model_tag>``."""
    return frame.rename(columns={
        "score": f"score_{model_tag}",
        "explanation": f"explanation_{model_tag}",
    })


# Label each model's columns before merging. Relying on merge suffixes plus a
# later rename only tags columns that happen to collide, which left Llama's
# explanation in a column named plain "explanation".
df_m1 = _tag_model_columns(df_gptmini, "gpt_mini").merge(
    _tag_model_columns(df_gemma, "gemma"), on='tqn')
df_m2 = df_m1.merge(_tag_model_columns(df_meta_lama, "llama"), on="tqn", how="inner")
df_m3 = df_m2.merge(_tag_model_columns(df_mixtral, "mixtral"), on="tqn", how="inner")

In [24]:
# Not the last expression of the cell, so this lookup displays nothing.
df_m3.columns

# Score columns of the merged frame, one per model.
score_cols = [
    'score_gpt_mini',
    'score_gemma',
    'score_llama',
    'score_mixtral',
]

In [25]:
# Per-statement sample standard deviation of the model scores, rounded to 2 decimals.
# Computed directly on the score columns rather than with DataFrame.apply over
# whole rows, which builds a mixed-type Series (text and numbers) for every row.
df_m3['score_sdev'] = df_m3[score_cols].std(axis=1).round(2)

In [33]:
# Statements whose cross-model score spread is at most 0.5 are exported as "high similarity".
is_high_similarity = df_m3['score_sdev'] <= .5
high_similarity = df_m3[is_high_similarity]
high_similarity.to_csv('../results/analysis/teique_high_similarity.csv', index=False)

In [1]:
# Scratch loop: print the integers 0 through 9, one per line.
for i in range(10):
    print(i)

0
1
2
3
4
5
6
7
8
9


In [19]:

# NOTE(review): `print_anon` is not defined in any visible cell, so this loop
# depends on leftover kernel state. The recorded traceback shows the callable it
# returns requires a positional argument 'x', which the trailing `()` omits;
# presumably a scratch experiment, so confirm whether this cell can be removed.
for i in range(0,10):
    print_anon(i)()

TypeError: <lambda>() missing 1 required positional argument: 'x'