In [20]:
import csv

import pandas as pd

from llm_pipeline.pipeline import GenerateEmbeddingsStep, DataPipeline, kNNFilterStep, LLMCallWithDataFrame, LLMCallStep, FilterStep


In [21]:

# Load the raw creature records from the CSV file into a DataFrame.
try:
    df = pd.read_csv('./data/superData.csv')
except pd.errors.ParserError as e:
    # Show the parser message in the cell output, then re-raise so a
    # malformed file still stops the notebook here.
    print(f"Error reading CSV file: {e}")
    raise

# Keep only the five columns selected below, then draw the working sample.
# random_state pins the draw so re-running this cell selects the same row(s)
# instead of a different random record on every execution.
# NOTE(review): n=1 looks like a leftover debugging value -- the saved output of
# the embedding cell reports 500 processed records; confirm the intended size.
reduced_df = (
    df[['name', 'modified_size', 'can_fly', 'good', 'danger']]
    .sample(n=1, random_state=42)
)

# LLM step that writes a short description for every record; the model's
# reply is stored under the "description" output key.
# The prompt text is sent verbatim, so it is kept exactly as authored.
gen_description = LLMCallStep(
    output_key="description",
    prompt_template="""The following record is a creature type for a fantasy game. Please provide a description for the creature.\
    The size of the creature is a scale from very small (0), to extremely large (10). A size of 10 is unbelievealy huge; as world-threatening size.\
    There are ranges in each number.  A 1 is small, from a bug to a fish.  A 2 is a small animal, cat, dog, fox, etc. 3 is a normal person.\
    A size of 4 is a large animal, a horse, bear, tiger, lion.  A 5 is a giant animal such a dragon.  Generally, an animal on value larger than another would defeat it in combat.\
    Animals may be modified from their nominal state.  This modification is reflected in the name, and may result in it having a diffent size, or behavior.\
    The can_fly field indicates whether the creature can fly. A value of 1 indicates that the creature can fly, -1 means it cannot, and 0 is not specified.  Different instances of this type may or many not be able to fly.\
    Similarly, the good field indicates whether the creature is good or evil. A value of 1 indicates that the creature is good, -1 means it is evil, and 0 is not specified.\
    The danger field indicates how dangerous the creature is. A value of 1 indicates that the creature is dangerous, -1 means it is not dangerous, and 0 is not specified.\
    
    Please respond with a short creative description of this creature.  Do not use a size number in your description (such as 'size 2') 
    instead be descriptive and creative in your comparisions.  Limit your response to 60 words.
    {record_details}""",
)

# Single-step pipeline: only the description generator runs here.
pipeline = DataPipeline(
    steps=[gen_description],
)

# Feed the sampled records through the pipeline.
processed_df = pipeline.run(reduced_df)

# Preview up to the first 20 processed rows as the cell output.
processed_df.head(n=20)

KeyboardInterrupt: 

In [15]:
# Persist the processed data to a CSV file.
# This lets us save the generated descriptions for later use.
processed_df.to_csv(path_or_buf='./data/processed_monsters2.csv', index=False)

In [16]:

# Embedding step: one embedding per record, built from the "name" and
# "description" fields and stored under the "embedding" output key.
gen_embed = GenerateEmbeddingsStep(
    fields=["name", "description"],
    output_key="embedding",
)

# Single-step pipeline containing only the embedding generator.
pipeline = DataPipeline(
    steps=[gen_embed],
)

# Run it over the records that already carry descriptions.
next_df = pipeline.run(processed_df)

# Report how many rows came out of the pipeline.
count = len(next_df)
print(f"Processed record count: {count}")

# Pickle the result so later cells can reload the embeddings from disk.
next_df.to_pickle('./data/monsters_with_embeddings.pkl')


Processed record count: 500


In [23]:
# Minimal kNN example: filter the embedded records against a text query.

# Load the pickled DataFrame written by the embedding cell.
reloaded_df = pd.read_pickle('./data/monsters_with_embeddings.pkl')

# kNN step over the "embedding" column, configured with k=20.
knn_filter = kNNFilterStep(
    embedding_column="embedding",
    k=20,
    query="quiet, silly little fantasy creature",
)

# Single-step pipeline containing only the kNN filter.
pipeline = DataPipeline(
    steps=[knn_filter],
)

# Apply the pipeline to the reloaded records.
out_df = pipeline.run(reloaded_df)

# Show name and description for up to the first 20 surviving rows.
out_df.loc[:, ['name', 'description']].head(20)

Unnamed: 0,name,description
6123,plain mini cat,"This diminutive feline roams quietly, a soft p..."
1471,mini fairy,This enchanting creature flits through gardens...
25795,small thunderous pixie,The small thunderous pixie resembles a shimmer...
1419,small fairie,"Delicate and whimsical, the small fairy flits ..."
9035,little ethereal ogre,The little ethereal ogre flits through the twi...
25771,crazy satanic fairie,"This mischievous creature resembles a tiny, tw..."
22916,little ethereal orc,This diminutive orc flits through the air like...
14864,sleeping tiny nymph,The sleeping tiny nymph resembles a delicate f...
15833,winged invisible fairy,This elusive creature flits through the moonli...
18433,tiny nice basilisk,"The tiny nice basilisk is a petite, whimsical ..."


In [28]:
# kNN pre-filter, then an LLM call that rates each candidate against the
# query, then a filter on that rating to keep only the top results.

# Reload the DataFrame so this cell does not depend on earlier cell state.
reloaded_df = pd.read_pickle('./data/monsters_with_embeddings.pkl')

# One query string shared by the kNN step and the rating prompt.
query_str = "a huge lumbering beast with a thick hide and sharp claws"

# Use a kNN filter on the "embedding" column, configured with k=20.
knn_filter = kNNFilterStep(
    query=query_str,
    k=20,
    embedding_column="embedding"
)

# LLM step that is given each row's "description" field and stores the reply
# under "knn_rating". The query is spliced in by concatenation, so the
# {record_details} placeholder is left untouched for the step itself.
rate_match = LLMCallStep(
    prompt_template="""The following is a description of a creature type for a fantasy game.
    Please rate the match of the description to the following statement: """ + query_str + """
    Rate the match on a scale from 1 to 10, where 1 is a very poor match and 10 is an excellent match.
    {record_details}""",
    fields=["description"],
    output_key="knn_rating"
)

# Filter predicate: keep only rows whose LLM rating is above 7.
def filter_function(df: pd.DataFrame) -> pd.Series:
    """Return a boolean mask selecting rows whose ``knn_rating`` exceeds 7.

    Args:
        df: DataFrame containing a ``knn_rating`` column.

    Returns:
        A boolean Series aligned with ``df``: True where the rating is
        greater than 7. Ratings that cannot be parsed as a number (or are
        missing) yield False, so those rows are filtered out.
    """
    # errors='coerce' turns unparseable replies into NaN instead of raising,
    # so one malformed rating does not abort the whole pipeline run.
    ratings = pd.to_numeric(df['knn_rating'], errors='coerce')
    # NaN > 7 evaluates to False, which drops the unparseable rows.
    return ratings > 7

# Wrap the rating predicate in a pipeline step.
filter_step = FilterStep(filter_function)


# Three-step pipeline: kNN filter, then LLM rating, then the rating filter.
pipeline = DataPipeline(
    steps=[knn_filter, rate_match, filter_step],
)

# Apply the pipeline to the reloaded records.
out_df = pipeline.run(reloaded_df)

# Show name and description for up to the first 20 surviving rows.
out_df.loc[:, ['name', 'description']].head(20)

Unnamed: 0,name,description
14321,armored ferocious sloth,The armored ferocious sloth lumbers through th...
5651,awful sloth,"The awful sloth is a lumbering beast, resembli..."
22271,humongous sleeping gorrila,This colossal sleeping gorilla sprawls across ...
9634,thunderous awful cyclops,The thunderous awful cyclops towers like a liv...
