In [1]:
import pandas as pd
from llm_pipeline.llm_methods import DataFrameProcessor 
import csv


In [5]:
# This section generates a description for the record, based on the name and other values of the record.

from llm_pipeline.llm_methods import DataFrameProcessor  # Import the new chainable processor

# Load test data from CSV file into a DataFrame
try:
    df = pd.read_csv('./data/superData.csv')
except pd.errors.ParserError as e:
    print(f"Error reading CSV file: {e}")
    raise

# Select and sample the relevant columns.
reduced_df = df[['name', 'modified_size', 'can_fly', 'good', 'danger']]
reduced_df = reduced_df.sample(n=1000)

# Generate a short description of the creature.
processed_df = (
    DataFrameProcessor(reduced_df)
    .llm_call(
        prompt_template="""The following record is a creature type for a fantasy game. Please provide a description for the creature.
The size of the creature is on a scale from very small (0) to extremely large (10). A size of 10 is unbelievably huge—world-threatening in scale.
There are ranges for each number: a 1 is very small (from a bug to a fish), a 2 is a small animal (cat, dog, fox, etc.), and 3 is roughly a normal person.
A 4 represents a large animal (horse, bear, tiger, lion), while a 5 is a giant animal such as a dragon. Generally, an animal with a larger value would defeat one with a smaller value in combat.
Animals may be modified from their nominal state; this modification, reflected in the name, can affect their size or behavior.
The can_fly field indicates flight ability (1 for can fly, -1 for cannot, 0 for unspecified). Similarly, the good field indicates moral alignment (1 for good, -1 for evil, 0 for unspecified).
The danger field indicates how dangerous the creature is (1 for dangerous, -1 for not dangerous, 0 for unspecified).

Please respond with a short, creative description of this creature. Do not include a literal size number (e.g., 'size 2')—instead, be descriptive and creative in your comparisons. Limit your response to 60 words.
{record_details}""",
        fields=['name', 'modified_size', 'can_fly', 'good', 'danger'],
        output_key="description"
    )
    .get_df()
)

print("Added Descriptions")

# Generate a short description of the dangerousness of the creature.
processed_df = (
    DataFrameProcessor(processed_df)
    .llm_call(
        prompt_template="""The following record is a creature type for a fantasy game.
The size of the creature is on a scale from very small (0) to extremely large (10). A size of 10 is unbelievably huge—world-threatening in scale.
There are ranges for each number: a 1 is very small (from a bug to a fish), a 2 is a small animal (cat, dog, fox, etc.), and 3 is roughly a normal person.
A 4 represents a large animal (horse, bear, tiger, lion), while a 5 is a giant animal such as a dragon. Generally, an animal with a larger value would defeat one with a smaller value in combat.
Animals may be modified from their nominal state; this modification, reflected in the name, can affect their size or behavior.
The can_fly field indicates flight ability (1 for can fly, -1 for cannot, 0 for unspecified). Similarly, the good field indicates moral alignment (1 for good, -1 for evil, 0 for unspecified).
The danger field indicates how dangerous the creature is (1 for dangerous, -1 for not dangerous, 0 for unspecified).

A description of the animal is also provided. Your task is to provide a creative description of how dangerous the creature is.
This should be primarily based on the danger field, but you may also consider the size, flight ability, and moral alignment of the creature.
For example, a small, evil creature that can fly might be described as 'small but vicious, and agressive nausence that should be destroyed' while
 a large, good creature that cannot fly might be described as 'a gentle giant that inokes fear and awe.  It poses no threat, but must be respected'.

Please respond with a short, creative description of how dangerous the creature is. Do not include a literal size number, nor the creature's name. Limit your response to 10 words.
{record_details}""",
        fields=['name', 'modified_size', 'can_fly', 'good', 'danger', 'description'],
        output_key="danger_description"
    )
    .get_df()
)
print("Added Danger Descriptions")

# Generate a short description of the Environment of the creature.
processed_df = (
    DataFrameProcessor(processed_df)
    .llm_call(
        prompt_template="""The following record is a creature type for a fantasy game.
The size of the creature is on a scale from very small (0) to extremely large (10). A size of 10 is unbelievably huge—world-threatening in scale.
There are ranges for each number: a 1 is very small (from a bug to a fish), a 2 is a small animal (cat, dog, fox, etc.), and 3 is roughly a normal person.
A 4 represents a large animal (horse, bear, tiger, lion), while a 5 is a giant animal such as a dragon. Generally, an animal with a larger value would defeat one with a smaller value in combat.
Animals may be modified from their nominal state; this modification, reflected in the name, can affect their size or behavior.
The can_fly field indicates flight ability (1 for can fly, -1 for cannot, 0 for unspecified). Similarly, the good field indicates moral alignment (1 for good, -1 for evil, 0 for unspecified).
The danger field indicates how dangerous the creature is (1 for dangerous, -1 for not dangerous, 0 for unspecified).

A description of the animal is also provided. Your task is to provide a creative the environment in which the creature is most likely to be found.
This should be primarily based on the description and size of the creature, but you may also consider the flight ability, moral alignment, and danger of the creature.
For example, a large flying creature should be in an outdoors environment. while an evil creature might be found in a dark, forboding place.

Please respond with a short, creative description of the environment in which the creature is found.  Do not include the creature's name or description, focus on the environment.
Limit your response to 10 words.
{record_details}""",
        fields=['name', 'modified_size', 'can_fly', 'good', 'danger', 'description'],
        output_key="environment"
    )
    .get_df()
)

#persist the processed data to a CSV file with the generated descriptions
# This will allow us save the generated descriptions for later usecd 
processed_df.to_csv('./data/processed_monsters2.csv', index=False)

print(processed_df.head(10))

KeyboardInterrupt: 

In [45]:
# This section adds an Embedding fields each of the generated descriptions to support future kNN searches.
# Process the DataFrame to generate embeddings from "name" and "description"
processed_df = (
    DataFrameProcessor(processed_df)
    .generate_embeddings(
        fields=["name", "description"],
        output_key="description_embedding"  # This is where the embedding will be stored
    )
    .get_df()
)

# Add an Embedding field to the DataFrame based on the "danger_description" field.
processed_df = (   
    DataFrameProcessor(processed_df)
    .generate_embeddings(
        fields=["danger_description"],
        output_key="danger_embedding"  # This is where the embedding will be stored
    )
    .get_df()
)

# Add an Embedding field to the DataFrame based on the "environment" field.
processed_df = (
    DataFrameProcessor(processed_df)
    .generate_embeddings(
        fields=["environment"],
        output_key="environment_embedding"  # This is where the embedding will be stored
    )
    .get_df()
)



# Count the processed records and save the DataFrame.
count = len(processed_df)
print(f"Processed record count: {count}")
processed_df.to_pickle('./data/monsters_with_embeddings2.pkl')

Processed record count: 1000


In [None]:
# This section demonstrates finding records related to a string query. 
# This returns the top matches where the description is similar to the query string.
# This is not garuneed to return all matches, but is very fast since it uses the embeddings to find
# the top K matches are are most similar to the query string, and then only uses the LLM to rate the top K matches.
# This is cheaper and faster than using the LLM to rate all records, and is good for finding SOME matches.

# The user provides query string; the processor will prefrom a kNN search based on the query string
# Then we perform a double-check by passing the k records to the LLM and asking it to rate the match of the description to the query string
# The we apply a filter to the k records to only return good matches
# so this is going to return a MAX of K records, but only good matches


reloaded_df = pd.read_pickle('./data/monsters_with_embeddings2.pkl')

query_str = "a huge lumbering beast with a thick hide and sharp claws"

# Define a filter function to keep rows with a rating greater than 7.
def filter_function(df: pd.DataFrame) -> pd.Series:
    return df['knn_rating'].astype(float) > 7

# Use the chainable interface:
result_df = (
    DataFrameProcessor(reloaded_df)
    .knn_filter(query=query_str, k=20, embedding_column="description_embedding")
    .llm_call(
        prompt_template=f"""The following is a description of a creature type for a fantasy game.
Please rate the match of the description to the following statement: {query_str}
Rate the match on a scale from 1 to 10, where 1 is a very poor match and 10 is an excellent match.
{{record_details}}""",
        fields=["description"],
        output_key="knn_rating"
    )
    .filter(filter_function)  # Chainable filtering step.
    .get_df()
)

# Display the top 20 records.
print(result_df[['name', 'description']].head(20))


                          name  \
9020       big ferocious sloth   
4497                 big troll   
3716             murderous pig   
22508  ice-covered giant sloth   

                                             description  
9020   In the heart of ancient forests dwells the big...  
4497   This towering big troll lumbers through the sh...  
3716   This hulking beast resembles a monstrous boar,...  
22508  The ice-covered giant sloth lumbers through fr...  


In [79]:
# This section introduces Clusters.clustered_df
# First, we generate Cluster IDs using the DBSCAN method, and assign Cluster IDs to each row in the data frame
# Note: the clustering paramters eps and min_samples are important to tweak to get good clusters
# Also Note: After processing, rows with a cluster id of -1 were not actually clustered
# 
# Clustering gives Groups of related rows based on the proximity of their embeddings in latent space.  
# These should have similar meanings. Clusters can be then processed independently.


reloaded_df = pd.read_pickle('./data/monsters_with_embeddings2.pkl')

clustered_df = (
    DataFrameProcessor(reloaded_df)
    .cluster_dbscan(embedding_column="description_embedding", output_key="description_cluster_id", eps=0.7, min_samples=3)
    .get_df()
)

clustered_df = (
    DataFrameProcessor(clustered_df)
    .cluster_dbscan(embedding_column="danger_embedding", output_key="danger_cluster_id", eps=0.6, min_samples=3)
    .get_df()
)

clustered_df = (
    DataFrameProcessor(clustered_df)
    .cluster_dbscan(embedding_column="environment_embedding", output_key="environment_cluster_id", eps=0.58, min_samples=3)
    .get_df()
)

# Count the rows in each cluster.
cluster_counts = clustered_df['environment_cluster_id'].value_counts().sort_index()

# Print the total number of clusters.
print(f"Number of clusters: {len(cluster_counts)}")

# Print the count of rows in each cluster.
print("Rows per cluster:")
print(cluster_counts)

clustered_df.to_pickle('./data/monsters_with_embeddings3.pkl')

Number of clusters: 15
Rows per cluster:
environment_cluster_id
-1     411
 0     233
 1     257
 2      16
 3      32
 4       3
 5       4
 6       5
 7       4
 8      17
 9       3
 10      3
 11      3
 12      3
 13      6
Name: count, dtype: int64


In [80]:
# This section demonstrates generating a summary phrase for each cluster.
# It will get the cluster IDs, and loop over each cluster, sampling 20 records, and generating a summary phrase for each cluster.
# We exclude cluster -1, which is the unclustered records.

clustered_df = pd.read_pickle('./data/monsters_with_embeddings3.pkl')

# Get all unique clusters, excluding cluster 0.
clusters = sorted(clustered_df['environment_cluster_id'].unique())
clusters = [c for c in clusters if c != -1]

# Loop over each cluster, sample 20 records, and generate a summary phrase.
for cluster in clusters:
    # Select the records that belong to the current cluster.
    cluster_df = clustered_df[clustered_df['environment_cluster_id'] == cluster]
    
    # Sample up to 20 records (if there are fewer than 20, use them all).
    sample_df = cluster_df if len(cluster_df) < 20 else cluster_df.sample(n=20, random_state=42)
    
    # Define the prompt template for the LLM. The '{record_details}' placeholder
    # will be replaced with the combined details of each record (using the selected fields).
    prompt_template = (
        f"Below are sample records from cluster {cluster}:\n"
        "{record_details}\n\n"
        "Please provide a short summary phrase for this cluster.  Try to limit the response to 10 words or less"
    )
    
    # Call the LLM with the sample DataFrame.
    # In this example, we assume that the 'name' and 'description' fields are desired.
    summary = (
        DataFrameProcessor(sample_df)
        .call_llm_with_dataframe(prompt_template, fields=["environment"])
    )
    
    # Print the cluster number and the generated summary.
    print(f"Cluster {cluster} environment summary: {summary}\n{'-' * 60}\n")

Cluster 0 environment summary: Sunlit glades and meadows filled with vibrant flora.
------------------------------------------------------------

Cluster 1 environment summary: Shadowy forests filled with whispers and hidden dangers.
------------------------------------------------------------

Cluster 2 environment summary: Shadowy, damp alleyways filled with darkness and mystery.
------------------------------------------------------------

Cluster 3 environment summary: Desolate landscapes under dark, stormy skies.
------------------------------------------------------------

Cluster 4 environment summary: Enchanted meadows under picturesque twilight and night skies.
------------------------------------------------------------

Cluster 5 environment summary: Stormy landscapes with cliffs and dark valleys.
------------------------------------------------------------

Cluster 6 environment summary: Tranquil water bodies in lush green settings.
-----------------------------------------

In [3]:
# This section demonstrates a large-scale search for records related to a string query, using the batch processing method.
# This method passes multiple records to the LLM is a batch, and the consolidates the batches
# Its going to be faster and cheaper then processing each record individually,
# Additionally, this approach my be good for finding groups of related records if they are in a batch
# But, there's still risk that groups may not be detected if they are split across batches

# For more exact results, use the exhaustive search method, which processes each record individually.
# For faster searches, use the kNN search method or other groupings

# This is a nice balance between the two, especially if you can't cluster or group records together.
# This will still inlcude all results; it just may not find relationships between records that are split across batches.
# And individual records may get supressed during the colidation step.


df = pd.read_pickle('./data/monsters_with_embeddings3.pkl')

batch_prompt = (
    "Below are a series of records for a fantasy game:\n"
    "{record_details}\n\n"
    "Please identify Very Large creatures currently in an underwater enviroment.\n"
    "Respond only with the list of creature names."

)

# Use the chainable processor to process in batches and consolidate the results.
final_summary = DataFrameProcessor(df).call_llm_in_batches(
    prompt_template=batch_prompt,
    fields=["name", "description", "danger_description", "environment"],  # adjust fields as needed
    batch_size=100,
    consolidation_prefix=(
        "The following responses were generated for each batch. "
        "Please consolidate the list of creature names"
    )
)

print("Final Consolidated Summary:")
print(final_summary)

Final Consolidated Summary:
- giant murderous minnow
- giant invisible dog
- huge microscopy bass
- petrified large kraken
- lightning dead kraken
- enraged weak shark
- crazy trout
- huge kind leviathan
- thunderous leviathan
- enormous shadowy leviathan
- giant ancient trout
- tired ancient leviathan
- ancient shadowy shark
- aquatic knoble unicorn
- aquatic person
- Regular Ancient Leviathan
- Ice-covered tired shark
- Aquatic enraged minotaur
- enormous armored minnow
- ancient poisonous orc
- armored dead mermaid
- lightning satanic shark
- petrified satanic shark
- giant raging fish
- earth-bound enormous fish
- big ice-covered trout
- unholy huge shark
- humongous angry leviathan


In [13]:

df = pd.read_pickle('./data/monsters_with_embeddings3.pkl')

batch_prompt = (
    "Below are a series of records for a fantasy game:\n"
    "{record_details}\n\n"
    "I'm looking for flying creatures from the above list that have artificial weapons.\n"
    "Meaning, not natural weapons like claws, teeth, horns, etc.  I'm looking for creatures with weapons like swords, guns, etc.\n"
    "Please carefully review the above list; and identify flying beings such weapons."
    "Think about these creatures, and finally respond with a list in the form of 'Creature Name: Weapon'"
)

# Use the chainable processor to process in batches and consolidate the results.
final_summary = DataFrameProcessor(df).call_llm_in_batches(
    prompt_template=batch_prompt,
    fields=["name", "description", "danger_description", "environment"],  # adjust fields as needed
    batch_size=500,
    consolidation_prefix=(
        "The following responses were generated for each batch. "
        "Please consolidated the list of creature names and weapons. Produce your response in the form of 'Creature Name: Weapon'"
    )
)

print("Final Consolidated Summary:")
print(final_summary)

Final Consolidated Summary:
flying fire-breathing priest: flaming breath  
evil magical sith: lightsaber  
tiny mini jedi: weapon of energy  
winged fire-breathing person: blazing breath  
angry person: weapon of chaos (emotion)  
cursed lightning giant: electric strike  
fire-breathing flying priest: flame breath  
angry lightning lion: excessive force
