# LionLinker Testing Notebook
This notebook is used to test each component of the LionLinker pipeline individually.

In [1]:

import pandas as pd
from lion_linker.core import APIClient, PromptGenerator, LLMInteraction
from lion_linker.utils import clean_data, process_in_batches, parse_response
from lion_linker.lion_linker import LionLinker

# Define the path to the files
input_csv = 'movies.csv'
prompt_file = 'prompt_template.txt'
model_name = 'gemma2'  # Replace with the actual model you are using
api_url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'  # Replace with the actual API URL
api_token = 'lamapi_demo_2023'  # Replace with your API token if applicable
output_csv = 'output_test.csv'
batch_size = 2  # Small batch size for testing

# Initialize the LionLinker instance
lion_linker = LionLinker(input_csv, prompt_file, model_name, api_url, api_token, output_csv, batch_size)


2024-08-21 00:31:57,673 - INFO - Initializing components...
2024-08-21 00:31:57,675 - INFO - Setup completed.


In [3]:
table_summary = lion_linker.generate_table_summary()

2024-08-20 21:38:00,820 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


In [4]:
api_client = lion_linker.api_client
candidates = await api_client.fetch_multiple_entities(["Matrix", "Inception"])
candidates

{'Matrix': [{'id': 'Q44337',
   'name': 'matrix',
   'description': 'rectangular array of numbers, symbols, or expressions, arranged in rows and columns',
   'types': []},
  {'id': 'Q83495',
   'name': 'Matrix',
   'description': '1999 American science fiction action thriller film',
   'types': [{'id': 'Q11424', 'name': 'film'}]},
  {'id': 'Q193825',
   'name': 'Matrix',
   'description': 'structure external to cells, which provides structural support for cells or tissues',
   'types': [{'id': 'Q5058355', 'name': 'cellular component'}]},
  {'id': 'Q190069',
   'name': 'Matrix',
   'description': 'Wikimedia disambiguation page',
   'types': [{'id': 'Q4167410', 'name': 'Wikimedia disambiguation page'}]},
  {'id': 'Q1463013',
   'name': 'matrix',
   'description': 'geological term for the mass of material in which larger grains, crystals or clasts are embedded',
   'types': [{'id': 'Q35758', 'name': 'matter'}]},
  {'id': 'Q489649',
   'name': 'Matrix',
   'description': 'car model',
   't

In [16]:
prompt_file

'prompt_template.txt'

In [4]:
LLMInteraction("gemma2").chat("just a test, reply yes")

2024-08-20 00:02:49,074 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


'yes'

In [38]:
import pandas as pd

# Read the first row from the CSV
row = pd.read_csv(input_csv, chunksize=1).get_chunk(1)

# Generate the string in the format "h1:cell1, h2:cell2, ..."
row_str = ', '.join([f'{col}:{row[col].iloc[0]}' for col in row.columns])

row_str

'title:The Matrix, year:1999, director:The Wachowskis'

In [5]:
table_summary = lion_linker.generate_table_summary()
row = pd.read_csv(input_csv, chunksize=1).get_chunk(1)
column_name = row.columns[0]
entity_mention = row[column_name].values[0]
# Generate the string in the format "h1:cell1, h2:cell2, ..."
row = ', '.join([f'{col}:{row[col].iloc[0]}' for col in row.columns])
api_client = lion_linker.api_client
candidates = api_client.fetch_entities(entity_mention)
prompt = PromptGenerator(prompt_file).generate_prompt(table_summary, row, column_name, entity_mention, candidates)
LLMInteraction("gemma2").chat(prompt)

2024-08-21 00:01:22,190 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2024-08-21 00:01:25,724 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


'Q83495 \n'

In [16]:
prompts = [prompt for i in range(5)]
prompts

['Here is the summary of the table:\nThe table lists movie titles, their release years, and the directors.\n\nHere is the data for the current row:\ntitle:The Matrix, year:1999, director:The Wachowskis\n\nThe column name in question is:\ntitle\n\nThe entity mention is:\nThe Matrix\n\nPossible candidates for the entity are:\n[{"id":"Q83495","name":"The Matrix","description":"1999 American science fiction action thriller film","types":[{"id":"Q11424","name":"film"}]},{"id":"Q1402369","name":"The Matrix","description":"Ukrainian professional boxer","types":[{"id":"Q5","name":"human"},{"id":"Q11338576","name":"boxer"},{"id":"Q4991371","name":"soldier"}]},{"id":"Q13014087","name":"The Matrix","description":"media franchise","types":[{"id":"Q196600","name":"media franchise"}]},{"id":"Q1210827","name":"The Matrix","description":"1999\\u2013present films directed by The Wachowskis","types":[{"id":"Q13593818","name":"film trilogy"}]},{"id":"Q2715974","name":"The Matrix","description":"album by 

In [None]:
# Concatenate all the prompts with a separator
combined_prompts = "\n".join(prompts)
print(combined_prompts)

In [10]:
len(prompts)

3

In [17]:
# Define the final instruction
final_instruction = """
Solve each task separately and provide the QID for each entity mention from the candidates listed. Provide only the QID or NIL if none of the candidates is correct for each task.
"""

# Add task labels to the prompts
labeled_prompts = [f"Task {i + 1}:\n{prompt}" for i, prompt in enumerate(prompts)]

# Combine all prompts with separators
combined_prompts = "\n\n---\n\n".join(labeled_prompts)

# Final combined prompt
final_prompt = f"{combined_prompts}\n\n{final_instruction}"

print(final_prompt)

Task 1:
Here is the summary of the table:
The table lists movie titles, their release years, and the directors.

Here is the data for the current row:
title:The Matrix, year:1999, director:The Wachowskis

The column name in question is:
title

The entity mention is:
The Matrix

Possible candidates for the entity are:
[{"id":"Q83495","name":"The Matrix","description":"1999 American science fiction action thriller film","types":[{"id":"Q11424","name":"film"}]},{"id":"Q1402369","name":"The Matrix","description":"Ukrainian professional boxer","types":[{"id":"Q5","name":"human"},{"id":"Q11338576","name":"boxer"},{"id":"Q4991371","name":"soldier"}]},{"id":"Q13014087","name":"The Matrix","description":"media franchise","types":[{"id":"Q196600","name":"media franchise"}]},{"id":"Q1210827","name":"The Matrix","description":"1999\u2013present films directed by The Wachowskis","types":[{"id":"Q13593818","name":"film trilogy"}]},{"id":"Q2715974","name":"The Matrix","description":"album by The Matr

In [27]:
# Concatenate all the prompts with a separator
combined_prompts = "\n\n".join(prompts)

# Define the specific separator to be used in the response
separator = "---"

# Add the final instruction to solve all the tasks
final_instruction = (
    f"Solve all the tasks provided above. "
    f"For each task, identify the correct QID for the entity mention from the candidates listed. "
    f"Provide only the QID or NIL if none of the candidates is correct. "
    f"Keep the same order as the tasks, and separate each answer with the following separator: {separator}"
)

# Final combined prompt
final_prompt = f"{combined_prompts}\n\n{final_instruction}"

# Send the combined prompt to the LLM
response = LLMInteraction("gemma2").chat(final_prompt)

# Print the response
print(response)

# Split the response by the separator to get individual answers
answers = response.split(separator.strip())
answers = [answer.strip() for answer in answers if answer.strip()]  # Clean up any leading/trailing spaces

# Now `answers` contains the individual QIDs or NILs in the same order as the tasks

2024-08-20 22:45:54,343 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Q83495 --- Q83495 --- Q83495 



In [29]:
answers

['Q83495', 'Q83495', 'Q83495']

In [4]:
import json

def reduce_prompt_size(title, year, director, candidates):
    # Create a compact summary of the main data
    data_summary = f"title: {title}, year: {year}, director: {director}"
    
    # Compact the candidates list by removing redundant information
    compact_candidates = []
    for candidate in candidates:
        compact_candidate = {
            "id": candidate["id"],
            "name": candidate["name"],
            "description": candidate["description"].split('.')[0],  # Keep only the first sentence
            "types": [{"id": t["id"], "name": t["name"]} for t in candidate["types"]]
        }
        compact_candidates.append(compact_candidate)
    
    # Convert the compact candidates list to JSON format to keep it short
    compact_candidates_str = json.dumps(compact_candidates, separators=(',', ':'))
    
    # Combine the summary and candidates into the final prompt
    reduced_prompt = f"{data_summary}\n\nCandidates:\n{compact_candidates_str}\n\nReturn QID or NIL."
    
    return reduced_prompt

# Example usage
title = "The Matrix"
year = 1999
director = "The Wachowskis"
candidates
reduced_prompt = reduce_prompt_size(title, year, director, candidates)
print(reduced_prompt)

title: The Matrix, year: 1999, director: The Wachowskis

Candidates:
[{"id":"Q83495","name":"The Matrix","description":"1999 American science fiction action thriller film","types":[{"id":"Q11424","name":"film"}]},{"id":"Q1402369","name":"The Matrix","description":"Ukrainian professional boxer","types":[{"id":"Q5","name":"human"},{"id":"Q11338576","name":"boxer"},{"id":"Q4991371","name":"soldier"}]},{"id":"Q13014087","name":"The Matrix","description":"media franchise","types":[{"id":"Q196600","name":"media franchise"}]},{"id":"Q1210827","name":"The Matrix","description":"1999\u2013present films directed by The Wachowskis","types":[{"id":"Q13593818","name":"film trilogy"}]},{"id":"Q2715974","name":"The Matrix","description":"album by The Matrix","types":[{"id":"Q482994","name":"album"}]},{"id":"Q1649612","name":"The Matrix","description":"Pop music writing and production team","types":[{"id":"Q84310035","name":"record production team"},{"id":"Q54982412","name":"songwriting team"}]},{"id"

In [15]:
import tiktoken

def count_tokens(prompt: str, model: str = "gpt-3.5-turbo"):
    """
    Counts the number of tokens in a given prompt using the specified model's tokenizer.
    
    Args:
    prompt (str): The text prompt to be tokenized.
    model (str): The model's name for which to tokenize the prompt (default is "gpt-3.5-turbo").
    
    Returns:
    int: The number of tokens in the prompt.
    """
    # Load the appropriate tokenizer for the specified model
    encoding = tiktoken.encoding_for_model(model)
    
    # Encode the prompt into tokens
    tokens = encoding.encode(prompt)
    
    # Return the number of tokens
    return len(tokens)

# Example usage
print(f"Number of tokens: {count_tokens(prompt)}")
print(f"Number of tokens: {count_tokens(final_prompt)}")

Number of tokens: 526
Number of tokens: 5346


In [36]:
from transformers import GPT2Tokenizer

def compute_context_window(prompt, model_name="gpt2"):
    # Initialize the tokenizer for the specified model
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    
    # Tokenize the prompt
    tokens = tokenizer.encode(prompt)
    
    # Count the number of tokens
    num_tokens = len(tokens)
    
    # Print the result
    print(f"Number of tokens used: {num_tokens}")
    
    return num_tokens


num_tokens = compute_context_window(prompt)

# Assuming gemma2B has a context window of 2048 tokens (adjust this based on actual model specs)
max_context_window = 2048

if num_tokens > max_context_window:
    print("Warning: The prompt exceeds the context window of the model!")
else:
    print("The prompt is within the model's context window.")

Number of tokens used: 552
The prompt is within the model's context window.


In [1]:
 # Replace placeholders in the template with actual values
template = open(prompt_file, 'r').read()
# Replace placeholders in the template with actual values
 # Replace placeholders in the template with actual values
# Define a dictionary with placeholders as keys and corresponding values
replacements = {
    '[SUMMARY]': table_summary,
    '[ROW]': row,
    '[COLUMN NAME]': column_name,
    '[ENTITY MENTION]': entity_mention
}

# Replace each placeholder using the dictionary
for placeholder, value in replacements.items():
    template = template.replace(placeholder, value)
template

NameError: name 'prompt_file' is not defined

## Test Data Cleaning
Test the `clean_data` function to ensure it works correctly.

In [2]:

# Test the clean_data function
df = pd.read_csv(input_csv)
cleaned_df = clean_data(df)
cleaned_df.head()


Unnamed: 0,title,year,director
0,The Matrix,1999,The Wachowskis
1,Inception,2010,Christopher Nolan
2,The Shawshank Redemption,1994,Frank Darabont
3,The Godfather,1972,Francis Ford Coppola
4,Pulp Fiction,1994,Quentin Tarantino


## Test API Interaction
Test the `APIClient` class to ensure it can fetch data correctly.

In [4]:
# Step 1: Generate High-Level Table Summary with Sampling and Parameterized Prompt
llm_interaction = lion_linker.llm_interaction

# Specify a custom prompt for the summary (optional)
custom_summary_prompt = "Provide a high-level overview of the key trends and patterns in this dataset."

# Generate the table summary using the provided prompt
table_summary = lion_linker.generate_table_summary(cleaned_df, prompt=custom_summary_prompt)
table_summary

'Summary for 5 rows: Provide a high-level overview of the key trends and patterns in this dataset.'

In [5]:

# Step 2: Select Correct Candidate
# Example: Fetch candidates for the first entity mention in the DataFrame
api_client = lion_linker.api_client
sample_entity = cleaned_df.iloc[0]['title']
candidates = api_client.fetch_entities(sample_entity)
candidates  # This will output the full JSON response


[{'id': 'Q83495',
  'name': 'The Matrix',
  'description': '1999 American science fiction action thriller film',
  'types': [{'id': 'Q11424', 'name': 'film'}]},
 {'id': 'Q1402369',
  'name': 'The Matrix',
  'description': 'Ukrainian professional boxer',
  'types': [{'id': 'Q5', 'name': 'human'},
   {'id': 'Q11338576', 'name': 'boxer'},
   {'id': 'Q4991371', 'name': 'soldier'}]},
 {'id': 'Q13014087',
  'name': 'The Matrix',
  'description': 'media franchise',
  'types': [{'id': 'Q196600', 'name': 'media franchise'}]},
 {'id': 'Q1210827',
  'name': 'The Matrix',
  'description': '1999–present films directed by The Wachowskis',
  'types': [{'id': 'Q13593818', 'name': 'film trilogy'}]},
 {'id': 'Q2715974',
  'name': 'The Matrix',
  'description': 'album by The Matrix',
  'types': [{'id': 'Q482994', 'name': 'album'}]},
 {'id': 'Q1649612',
  'name': 'The Matrix',
  'description': 'Pop music writing and production team',
  'types': [{'id': 'Q84310035', 'name': 'record production team'},
   {'

In [6]:

# Test a Single Prompt to Verify Correctness
sample_row = cleaned_df.iloc[0].to_dict()
single_prompt = lion_linker.prompt_generator.generate_prompt(table_summary, sample_row, 'title', sample_entity, candidates)
single_prompt


'Here is the summary of the table:\n[SUMMARY]\n\nHere is the data for the current row:\n[ROW]\n\nThe column name in question is:\n[COLUMN NAME]\n\nThe entity mention is:\n[ENTITY MENTION]\n\nPossible candidates for the entity are:\n[CANDIDATES]\n\nPlease provide only the QID. If no correct candidate is present, provide NIL.'

In [7]:

# Test API interaction with a single query
api_client = lion_linker.api_client
sample_entity = cleaned_df.iloc[0]['title']
api_response = api_client.fetch_entities(sample_entity)
api_response


[{'id': 'Q83495',
  'name': 'The Matrix',
  'description': '1999 American science fiction action thriller film',
  'types': [{'id': 'Q11424', 'name': 'film'}]},
 {'id': 'Q1402369',
  'name': 'The Matrix',
  'description': 'Ukrainian professional boxer',
  'types': [{'id': 'Q5', 'name': 'human'},
   {'id': 'Q11338576', 'name': 'boxer'},
   {'id': 'Q4991371', 'name': 'soldier'}]},
 {'id': 'Q13014087',
  'name': 'The Matrix',
  'description': 'media franchise',
  'types': [{'id': 'Q196600', 'name': 'media franchise'}]},
 {'id': 'Q1210827',
  'name': 'The Matrix',
  'description': '1999–present films directed by The Wachowskis',
  'types': [{'id': 'Q13593818', 'name': 'film trilogy'}]},
 {'id': 'Q2715974',
  'name': 'The Matrix',
  'description': 'album by The Matrix',
  'types': [{'id': 'Q482994', 'name': 'album'}]},
 {'id': 'Q1649612',
  'name': 'The Matrix',
  'description': 'Pop music writing and production team',
  'types': [{'id': 'Q84310035', 'name': 'record production team'},
   {'

## Test Prompt Generation
Test the `PromptGenerator` class to ensure it generates prompts correctly.

In [8]:

# Step 1: Generate High-Level Table Summary with Sampling and Parameterized Prompt
llm_interaction = lion_linker.llm_interaction

# Specify a custom prompt for the summary (optional)
custom_summary_prompt = "Provide a high-level overview of the key trends and patterns in this dataset."

# Generate the table summary using a sample of the data
table_summary = lion_linker.generate_high_level_summary(cleaned_df, prompt=custom_summary_prompt)
table_summary


AttributeError: 'LionLinker' object has no attribute 'generate_high_level_summary'

## Run Full Pipeline in Batches
Test the entire pipeline using the LionLinker class.

In [None]:

# Run the full pipeline
lion_linker.run()

# Display the output CSV
output_df = pd.read_csv(output_csv)
output_df.head()
