# LionLinker Testing Notebook
This notebook is used to test each component of the LionLinker pipeline individually.

In [1]:

import pandas as pd
from lion_linker.core import APIClient, PromptGenerator, LLMInteraction
from lion_linker.utils import clean_data, process_in_batches, parse_response
from lion_linker.lion_linker import LionLinker

# Define the path to the files
input_csv = 'movies.csv'
prompt_file = 'prompt_template.txt'
model_name = 'gemma2'  # Replace with the actual model you are using
api_url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'  # Replace with the actual API URL
api_token = 'lamapi_demo_2023'  # Replace with your API token if applicable
output_csv = 'output_test.csv'
batch_size = 2  # Small batch size for testing

# Initialize the LionLinker instance
lion_linker = LionLinker(input_csv, prompt_file, model_name, api_url, api_token, output_csv, batch_size)


2024-08-20 00:27:42,998 - INFO - Initializing components...
2024-08-20 00:27:43,002 - INFO - Setup completed.


In [3]:
table_summary = lion_linker.generate_table_summary()

2024-08-20 00:09:01,422 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


In [None]:
api_client = lion_linker.api_client
candidates = api_client.fetch_entities("matrix")
candidates

In [16]:
prompt_file

'prompt_template.txt'

In [4]:
LLMInteraction("gemma2").chat("just a test, reply yes")

2024-08-20 00:02:49,074 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


'yes'

In [2]:
table_summary = lion_linker.generate_table_summary()
row = pd.read_csv(input_csv, chunksize=1).get_chunk(1)
column_name = row.columns[0]
entity_mention = row[column_name].values[0]
row = row.to_string()
api_client = lion_linker.api_client
candidates = api_client.fetch_entities(entity_mention)
PromptGenerator(prompt_file).generate_prompt(table_summary, row, column_name, entity_mention, candidates)

2024-08-20 00:27:49,463 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


'Here is the summary of the table:\nThe table lists popular movies, providing their titles, release years, and directors. \n\n\nLet me know if you\'d like a summary focusing on a specific aspect of the data (like genres or box office performance) - I can tailor it further!\n\nHere is the data for the current row:\n        title  year        director\n0  The Matrix  1999  The Wachowskis\n\nThe column name in question is:\ntitle\n\nThe entity mention is:\nThe Matrix\n\nPossible candidates for the entity are:\n[\n  {\n    "id": "Q83495",\n    "name": "The Matrix",\n    "description": "1999 American science fiction action thriller film",\n    "types": [\n      {\n        "id": "Q11424",\n        "name": "film"\n      }\n    ]\n  },\n  {\n    "id": "Q1402369",\n    "name": "The Matrix",\n    "description": "Ukrainian professional boxer",\n    "types": [\n      {\n        "id": "Q5",\n        "name": "human"\n      },\n      {\n        "id": "Q11338576",\n        "name": "boxer"\n      },\n 

'        title  year        director\n0  The Matrix  1999  The Wachowskis'

In [4]:
 # Replace placeholders in the template with actual values
template = open(prompt_file, 'r').read()
# Replace placeholders in the template with actual values
 # Replace placeholders in the template with actual values
# Define a dictionary with placeholders as keys and corresponding values
replacements = {
    '[SUMMARY]': table_summary,
    '[ROW]': row,
    '[COLUMN NAME]': column_name,
    '[ENTITY MENTION]': entity_mention
}

# Replace each placeholder using the dictionary
for placeholder, value in replacements.items():
    template = template.replace(placeholder, value)
template

'Here is the summary of the table:\ntable_summary\n\nHere is the data for the current row:\nrow\n\nThe column name in question is:\ncolumn_name\n\nThe entity mention is:\nentity_mention\n\nPossible candidates for the entity are:\n[CANDIDATES]\n\nPlease provide only the QID. If no correct candidate is present, provide NIL.'

## Test Data Cleaning
Test the `clean_data` function to ensure it works correctly.

In [2]:

# Test the clean_data function
df = pd.read_csv(input_csv)
cleaned_df = clean_data(df)
cleaned_df.head()


Unnamed: 0,title,year,director
0,The Matrix,1999,The Wachowskis
1,Inception,2010,Christopher Nolan
2,The Shawshank Redemption,1994,Frank Darabont
3,The Godfather,1972,Francis Ford Coppola
4,Pulp Fiction,1994,Quentin Tarantino


## Test API Interaction
Test the `APIClient` class to ensure it can fetch data correctly.

In [4]:
# Step 1: Generate High-Level Table Summary with Sampling and Parameterized Prompt
llm_interaction = lion_linker.llm_interaction

# Specify a custom prompt for the summary (optional)
custom_summary_prompt = "Provide a high-level overview of the key trends and patterns in this dataset."

# Generate the table summary using the provided prompt
table_summary = lion_linker.generate_table_summary(cleaned_df, prompt=custom_summary_prompt)
table_summary

'Summary for 5 rows: Provide a high-level overview of the key trends and patterns in this dataset.'

In [5]:

# Step 2: Select Correct Candidate
# Example: Fetch candidates for the first entity mention in the DataFrame
api_client = lion_linker.api_client
sample_entity = cleaned_df.iloc[0]['title']
candidates = api_client.fetch_entities(sample_entity)
candidates  # This will output the full JSON response


[{'id': 'Q83495',
  'name': 'The Matrix',
  'description': '1999 American science fiction action thriller film',
  'types': [{'id': 'Q11424', 'name': 'film'}]},
 {'id': 'Q1402369',
  'name': 'The Matrix',
  'description': 'Ukrainian professional boxer',
  'types': [{'id': 'Q5', 'name': 'human'},
   {'id': 'Q11338576', 'name': 'boxer'},
   {'id': 'Q4991371', 'name': 'soldier'}]},
 {'id': 'Q13014087',
  'name': 'The Matrix',
  'description': 'media franchise',
  'types': [{'id': 'Q196600', 'name': 'media franchise'}]},
 {'id': 'Q1210827',
  'name': 'The Matrix',
  'description': '1999–present films directed by The Wachowskis',
  'types': [{'id': 'Q13593818', 'name': 'film trilogy'}]},
 {'id': 'Q2715974',
  'name': 'The Matrix',
  'description': 'album by The Matrix',
  'types': [{'id': 'Q482994', 'name': 'album'}]},
 {'id': 'Q1649612',
  'name': 'The Matrix',
  'description': 'Pop music writing and production team',
  'types': [{'id': 'Q84310035', 'name': 'record production team'},
   {'

In [6]:

# Test a Single Prompt to Verify Correctness
sample_row = cleaned_df.iloc[0].to_dict()
single_prompt = lion_linker.prompt_generator.generate_prompt(table_summary, sample_row, 'title', sample_entity, candidates)
single_prompt


'Here is the summary of the table:\n[SUMMARY]\n\nHere is the data for the current row:\n[ROW]\n\nThe column name in question is:\n[COLUMN NAME]\n\nThe entity mention is:\n[ENTITY MENTION]\n\nPossible candidates for the entity are:\n[CANDIDATES]\n\nPlease provide only the QID. If no correct candidate is present, provide NIL.'

In [7]:

# Test API interaction with a single query
api_client = lion_linker.api_client
sample_entity = cleaned_df.iloc[0]['title']
api_response = api_client.fetch_entities(sample_entity)
api_response


[{'id': 'Q83495',
  'name': 'The Matrix',
  'description': '1999 American science fiction action thriller film',
  'types': [{'id': 'Q11424', 'name': 'film'}]},
 {'id': 'Q1402369',
  'name': 'The Matrix',
  'description': 'Ukrainian professional boxer',
  'types': [{'id': 'Q5', 'name': 'human'},
   {'id': 'Q11338576', 'name': 'boxer'},
   {'id': 'Q4991371', 'name': 'soldier'}]},
 {'id': 'Q13014087',
  'name': 'The Matrix',
  'description': 'media franchise',
  'types': [{'id': 'Q196600', 'name': 'media franchise'}]},
 {'id': 'Q1210827',
  'name': 'The Matrix',
  'description': '1999–present films directed by The Wachowskis',
  'types': [{'id': 'Q13593818', 'name': 'film trilogy'}]},
 {'id': 'Q2715974',
  'name': 'The Matrix',
  'description': 'album by The Matrix',
  'types': [{'id': 'Q482994', 'name': 'album'}]},
 {'id': 'Q1649612',
  'name': 'The Matrix',
  'description': 'Pop music writing and production team',
  'types': [{'id': 'Q84310035', 'name': 'record production team'},
   {'

## Test Prompt Generation
Test the `PromptGenerator` class to ensure it generates prompts correctly.

In [8]:

# Step 1: Generate High-Level Table Summary with Sampling and Parameterized Prompt
llm_interaction = lion_linker.llm_interaction

# Specify a custom prompt for the summary (optional)
custom_summary_prompt = "Provide a high-level overview of the key trends and patterns in this dataset."

# Generate the table summary using a sample of the data
table_summary = lion_linker.generate_high_level_summary(cleaned_df, prompt=custom_summary_prompt)
table_summary


AttributeError: 'LionLinker' object has no attribute 'generate_high_level_summary'

## Run Full Pipeline in Batches
Test the entire pipeline using the LionLinker class.

In [None]:

# Run the full pipeline
lion_linker.run()

# Display the output CSV
output_df = pd.read_csv(output_csv)
output_df.head()
