#Notebook for Generating Minimum Path Length Data for Semantic Graphs

This notebook contains the code needed to take an LLM-generated semantic graph as input and return a dataframe that adds, to Hutchinson et al.'s data set, a column with the minimum path length between the prime and target words.

###Imports

In [None]:
import ast
import pandas as pd
import sys
import numpy as np
from tqdm.auto import tqdm
import networkx as nx
from numba import jit
from google.colab import drive
from scipy.stats import zscore

####Load and Clean the List of Words Used in Semantic Priming

In [None]:
# Load the priming data set.
df = pd.read_csv('data/priming_data.csv', engine='python')

# Drop rows whose type is 'nw' and rows whose RT holds the '#NULL!'
# placeholder. The explicit .copy() makes words_df an independent frame, so
# the column assignments below never write into a slice of df.
keep_type = df['type'] != 'nw'
keep_rt = df['RT'] != '#NULL!'
words_df = df[keep_type & keep_rt].copy()

# RT is converted to integers (it may have been read as text because of the
# '#NULL!' placeholder), then standardised over the remaining rows.
words_df['RT'] = words_df['RT'].astype(int)
words_df['RT'] = zscore(words_df['RT'])
words_df = words_df.reset_index(drop=True)

# Lower-cased vocabulary of all primes and targets. De-duplicating after
# combining both columns (and after lower-casing) guarantees each word is
# listed once, even when it occurs both as a prime and as a target.
all_words = pd.concat([words_df['prime'], words_df['target']])
unique_words = sorted({w.lower() for w in all_words})

###Define a Function to Obtain Minimum Path Length Based on the Precomputed Shortest Path Lengths

In [None]:
def get_distance(prime: str, target: str, node_distances: dict) -> float:
  """
  Obtains the minimum path length between a prime word and a target word.

  Args:
    prime: the prime word

    target: the target word

    node_distances: the precomputed shortest path lengths, as a nested
      mapping of the form {source word: {reachable word: path length}}

  Returns:
    The minimum path length between the prime and target words, or
    float('inf') when the prime has no entry in node_distances or the
    target is not among the words reachable from the prime
  """
  # A prime missing from the mapping is treated like an isolated node.
  prime_dists = node_distances.get(prime, {})
  return prime_dists.get(target, float('inf')) #Return inf if no path exists

###Set the Model, Temperature, and Maximum Output Tokens Used to Generate the Graph You Wish to Obtain

In [None]:
# Settings identifying the semantic graph to analyse. The three values are
# interpolated into the graph file name that is read and into the results
# file name that is written further down in this notebook.
model = "gemini-1.5-pro-001"  # model identifier used in the file names
temperature = 0               # temperature used in the file names
max_output_tokens = 512       # max-output-token setting used in the file names

###Generate and Save the Dataframe Appending a Column with the Minimal Path Length Between Each Target-Prime Pair

In [None]:
# Path of the graph file for the model / temperature / token settings above.
# NOTE(review): 'semantic_gaphs' looks like a typo for 'semantic_graphs', and
# this path is absolute while the priming data is read from the relative
# 'data/' directory -- confirm both against the actual directory layout.
filename = (f'/data/semantic_gaphs/graph_model_{model}_'
            f'temp{temperature}_maxoutput{max_output_tokens}.txt')

# The context manager closes the file even if reading fails.
with open(filename, "r") as f:
    formatted_graphs_string = f.read()

# The file content is parsed as a Python literal: an iterable whose elements
# are themselves iterables of dicts with 'subject' and 'target' keys.
formatted_graphs = ast.literal_eval(formatted_graphs_string)

# Build the graph: one undirected edge per (subject, target) pair
G = nx.Graph()
for triples in formatted_graphs:
    G.add_edges_from((triple['subject'], triple['target'])
                     for triple in triples)

# Precompute shortest path lengths from every priming word present in the
# graph; words absent from the graph get no entry and are reported as inf by
# get_distance.
node_distances = {node: nx.single_source_shortest_path_length(G, node)
                  for node in tqdm(unique_words) if node in G}

# Calculate all distances, walking the prime and target columns in lockstep.
# The words are lower-cased to match the keys of node_distances; total= keeps
# the progress bar length, which zip() alone cannot provide.
all_distances = [
    get_distance(prime.lower(), target.lower(), node_distances)
    for prime, target in tqdm(zip(words_df["prime"], words_df["target"]),
                              total=len(words_df))
]

# Add distances to the DataFrame
words_df['distance'] = all_distances

# Save dataframe with results to .csv
# NOTE(review): the two path fragments below produce a doubled '/' after
# 'path_length_rt_data'; left unchanged here -- confirm whether it is intended.
filename = (f'/data/path_length_rt_data/'
            f'/results_model_{model}_temp{temperature}_maxout'
            f'{max_output_tokens}.csv')

words_df.to_csv(filename)