# Combined Graph Dataframe Generator

This notebook contains the code that generates the combined data set showing results for all 32 graphs.

### Imports

In [None]:
import ast
import pandas as pd
import sys
import typing_extensions as typing
import numpy as np
from tqdm.auto import tqdm
from google.colab import drive
from scipy.stats import f, t
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import statsmodels.api as sm
import statsmodels.formula.api as smf

### Define Data Analysis Functions

In [None]:
def get_df(model:str,temperature:int,maxout:int) -> pd.DataFrame:
  """
  Loads the path-length / reaction-time results table for one configuration.

  The three arguments are interpolated into the name of a CSV file under
  /data/path_length_rt_data/, which is then read with pandas.

  Args:

    model: model name, as it appears in the results file name
    temperature: temperature parameter, as it appears in the results file name
    maxout: maximum number of output tokens, as it appears in the results
    file name

  Returns:

    The contents of the matching results CSV as a dataframe.
  """
  results_dir = '/data/path_length_rt_data/'
  csv_name = f'results_model_{model}_temp{temperature}_maxout{maxout}.csv'
  return pd.read_csv(results_dir + csv_name)



def get_hallucination_rate(model:str,temperature:int,maxout:int) -> int:
  """
  Obtains the hallucination rate for a given model, temperature, and maximum
  number of output tokens.

  Args:

    model: model name
    temperature: temperature parameter
    maxout: maximum number of output tokens

  Returns:

    rate: the hallucination rate
  """
  filename = (f'/data/semantic_graphs'
            f'graph_model_{model}_'
            f'temp{temperature}_maxoutput{maxout}.txt')

  f = open(filename, "r")
  formatted_graphs_string = f.read()
  f.close()
  formatted_graphs = ast.literal_eval(formatted_graphs_string)

  !gdown 1qP8e6g78x9UwV1BJZIr2Jr_FwkoPbRjB --quiet
  df = pd.read_csv('priming_data.csv', engine='python')
  words_df = df[df['type'] != 'nw']
  words_df = words_df[words_df['RT'] != '#NULL!']
  words_df['RT'] = [int(i) for i in words_df["RT"]]
  words_df['RT'] = zscore(words_df['RT'])
  words_df = words_df.reset_index(drop=True)
  unique_words = np.concatenate([np.unique(words_df['prime']),
                                np.unique(words_df['target'])])
  unique_words = [w.lower() for w in unique_words]

  # Convert unique_words to a set for O(1) lookups
  unique_words_set = set(unique_words)

  # Pre-calculate total number of triples
  total_triples = sum(len(l) for l in formatted_graphs)

  # Use a single loop for improved efficiency
  hallucinations = [
      triple[key]
      for l in formatted_graphs
      for triple in l
      for key in ('subject', 'target')
      if triple[key] not in unique_words_set
  ]

  # Calculate hallucination percentage
  rate = len(hallucinations) / total_triples
  return rate

def connected_graph(df:pd.DataFrame) -> bool:
  """
  Reports whether a graph is connected, judged by its 'distance' column.

  The graph is treated as disconnected exactly when the largest non-NaN
  distance is positive infinity.

  Args:

    df: dataframe containing the summary of graph data for a given semantic
    graph.

  Returns:

    False if the largest non-NaN distance is infinite, True otherwise
  """
  longest_distance = np.nanmax(df['distance'])
  # bool() keeps the result a plain Python bool rather than a NumPy bool
  return bool(longest_distance != float('inf'))

def max_path(df:pd.DataFrame) -> int:
  """
  Finds the longest finite path length recorded for a graph.

  Infinite and NaN entries of the 'distance' column are ignored.

  Args:

    df: dataframe containing the summary of graph data for a given semantic
    graph.

  Returns:

    The largest finite value in the 'distance' column, as the numeric scalar
    NumPy produces for that column.
  """
  distances = df['distance']
  finite_distances = distances[np.isfinite(distances)]
  return np.nanmax(finite_distances)


def get_log_likelihood(df:pd.DataFrame) -> float:
  """
  Learns a mixed-effects model predicting the relationship between minimum path
  length between prime and target words and reaction time, and then outputs
  the log-likelihood of the Hutchinson et al. data according to that model.

  The model is 'RT ~ z_distance + Trial + Session', grouped by 'Subject', with
  a random intercept and random slopes for z_distance, Trial and Session.
  Infinite distances are replaced by the largest finite distance plus one
  before the distances are z-scored; 'Trial' is z-scored as well.

  The fit is performed on a copy, so the dataframe passed in is not modified.

  Args:

    df: dataframe containing the summary of graph data for a given semantic
    graph. Must provide the 'distance', 'Trial', 'Session', 'RT' and
    'Subject' columns.

  Returns:

    The log-likelihood of the Hutchinson et al. data according to the model.
  """
  # Work on a copy so the caller's dataframe keeps its original columns
  data = df.copy()

  finite_distances = data['distance'][np.isfinite(data['distance'])]
  max_distance = np.nanmax(finite_distances)

  # Pairs with infinite distance are placed one step beyond the longest
  # finite path so that they can take part in the z-scoring
  data['distance'] = data['distance'].replace(np.inf, max_distance + 1)
  data['z_distance'] = zscore(data['distance'])
  data['Trial'] = zscore(data['Trial'])

  mod = smf.mixedlm('RT ~ z_distance + Trial + Session',
            data=data,
            groups=data["Subject"],
            re_formula="1 + z_distance + Trial + Session").fit()
  return mod.llf


def get_results(model:str,temperature:int,maxout:int) -> dict:
  """
  Computes all key data points for a given model, temperature, and maximum
  number of output tokens.

  Args:

    model: model name
    temperature: temperature parameter
    maxout: maximum number of output tokens

  Returns:
    A dictionary with the keys "model", "temperature", "maxout",
    "hallucination_rate", "connected_graph", "max_path" and "log_likelihood"
    for the given configuration.
  """
  graph_df = get_df(model,temperature,maxout)

  summary = {
          "model":model,
          "temperature":temperature,
          "maxout":maxout,
  }
  summary["hallucination_rate"] = get_hallucination_rate(model,temperature,
                                                         maxout)
  # The path statistics are taken before the model fit so that they see the
  # distances exactly as loaded.
  summary["connected_graph"] = connected_graph(graph_df)
  summary["max_path"] = max_path(graph_df)
  summary["log_likelihood"] = get_log_likelihood(graph_df)
  return summary

### Generate and Save Dataframe

In [None]:
# Collect one summary row per (temperature, maximum output tokens) setting
# for the gemini-1.5-pro-001 graphs.
results = []
for temp in tqdm([0,.3,.7,1]):
  for max_tokens in tqdm(np.arange(512,2049,512)):
    row = get_results("gemini-1.5-pro-001",temp,max_tokens)
    results.append(row)

# Write the combined table to disk (to_csv also writes the index by default).
filename = '/data/combined_graph_data.csv'
results_df = pd.DataFrame(results)
results_df.to_csv(filename)