### Notebook to create test data from the prod dataset

In [192]:
import pandas as pd
from langchain_ollama import ChatOllama
from langchain.prompts import ChatPromptTemplate
from langchain.schema import HumanMessage, SystemMessage
pd.set_option('display.max_colwidth', None)

In [193]:
# Read the prod CSV into a DataFrame and preview its first three rows
prod_csv_path = '../prod/JEOPARDY_CSV_TOP3_Categories.csv'
df = pd.read_csv(prod_csv_path)
df.head(n=3)

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer
0,4931,2/6/2006,Double Jeopardy!,SCIENCE,$400,"At sea level at 70 degrees this travels 1,129 feet per second; it speeds up over 1 foot per sec. for each rising degree",sound
1,4931,2/6/2006,Double Jeopardy!,SCIENCE,$800,"The largest tree, the General Sherman in California, is this type, also called a Sierra Redwood",a sequoia
2,4931,2/6/2006,Double Jeopardy!,SCIENCE,"$1,200","(<a href=""http://www.j-archive.com/media/2006-02-06_DJ_13.jpg"" target=""_blank"">Sarah of the Clue Crew reads from the pole vault at Duke University's track in Durham, NC.</a>) In bending an elastic solid, stress is the force causing deformation & this is the 6-letter term for <a href=""http://www.j-archive.com/media/2006-02-06_DJ_13a.jpg"" target=""_blank"">the deformation</a>",strain


In [194]:
# Drop rows whose Question links to a media file (image or video).
# The dot is escaped once for every extension, so only a literal ".wmv"
# matches; the previous pattern's bare `.wmv` matched ANY character followed
# by "wmv". The non-capturing group keeps str.contains from warning about
# match groups. na=False treats missing questions as "no match" (row kept).
df = df[~df['Question'].str.contains(r'\.(?:jpg|jpeg|png|gif|wmv)', case=False, na=False)]
df.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer
0,4931,2/6/2006,Double Jeopardy!,SCIENCE,$400,"At sea level at 70 degrees this travels 1,129 feet per second; it speeds up over 1 foot per sec. for each rising degree",sound
1,4931,2/6/2006,Double Jeopardy!,SCIENCE,$800,"The largest tree, the General Sherman in California, is this type, also called a Sierra Redwood",a sequoia
3,4931,2/6/2006,Double Jeopardy!,SCIENCE,"$1,600",6 elements once known as inert gases are now known by this aristocratic name,noble gases
5,1800,5/29/1992,Double Jeopardy!,SCIENCE,$200,"In metric measurement, 10 millimeters equal 1 of these",a centimeter
6,1800,5/29/1992,Double Jeopardy!,SCIENCE,$400,"Boyle's Law says normally if you double the pressure on a gas, the volume decreases by this amount",one-half


In [195]:
# Count the distinct categories present in the filtered frame
df['Category'].nunique()

3

In [196]:
# List the column labels of the filtered frame
df.columns

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [197]:
# Number of rows per category
df['Category'].value_counts()

Category
LITERATURE          493
SCIENCE             481
AMERICAN HISTORY    413
Name: count, dtype: int64

In [198]:
# Inspect column dtypes; AirDate is parsed into datetime values in a later cell
df.dtypes

ShowNumber     int64
AirDate       object
Round         object
Category      object
Value         object
Question      object
Answer        object
dtype: object

In [199]:
# Preview the first rows of the filtered frame
df.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer
0,4931,2/6/2006,Double Jeopardy!,SCIENCE,$400,"At sea level at 70 degrees this travels 1,129 feet per second; it speeds up over 1 foot per sec. for each rising degree",sound
1,4931,2/6/2006,Double Jeopardy!,SCIENCE,$800,"The largest tree, the General Sherman in California, is this type, also called a Sierra Redwood",a sequoia
3,4931,2/6/2006,Double Jeopardy!,SCIENCE,"$1,600",6 elements once known as inert gases are now known by this aristocratic name,noble gases
5,1800,5/29/1992,Double Jeopardy!,SCIENCE,$200,"In metric measurement, 10 millimeters equal 1 of these",a centimeter
6,1800,5/29/1992,Double Jeopardy!,SCIENCE,$400,"Boyle's Law says normally if you double the pressure on a gas, the volume decreases by this amount",one-half


In [200]:
# Number of rows per game round
df['Round'].value_counts()

Round
Double Jeopardy!    822
Jeopardy!           532
Final Jeopardy!      33
Name: count, dtype: int64

In [201]:
# Build an independent copy of df that carries an extra MockHumanAnswer
# column, initialised with the reference Answer values
df_updated = df.assign(MockHumanAnswer=df['Answer'])
print(df_updated.shape[0])

1387


In [202]:
# Preview the copy that now carries the MockHumanAnswer column
df_updated.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,MockHumanAnswer
0,4931,2/6/2006,Double Jeopardy!,SCIENCE,$400,"At sea level at 70 degrees this travels 1,129 feet per second; it speeds up over 1 foot per sec. for each rising degree",sound,sound
1,4931,2/6/2006,Double Jeopardy!,SCIENCE,$800,"The largest tree, the General Sherman in California, is this type, also called a Sierra Redwood",a sequoia,a sequoia
3,4931,2/6/2006,Double Jeopardy!,SCIENCE,"$1,600",6 elements once known as inert gases are now known by this aristocratic name,noble gases,noble gases
5,1800,5/29/1992,Double Jeopardy!,SCIENCE,$200,"In metric measurement, 10 millimeters equal 1 of these",a centimeter,a centimeter
6,1800,5/29/1992,Double Jeopardy!,SCIENCE,$400,"Boyle's Law says normally if you double the pressure on a gas, the volume decreases by this amount",one-half,one-half


In [203]:
# Function to create a filtered DataFrame based on criteria
def filter_dataframe(df, Categories, ShowNumber=None, StartDate=None, rows_per_group=5):
    """Filter rows by category (optionally show and date) and cap rows per group.

    Args:
        df: DataFrame with 'Category', 'Round' and 'ShowNumber' columns.
            'AirDate' is also read when StartDate is given and must already
            be datetime-typed for the comparison to work.
        Categories: Iterable of category names to keep.
        ShowNumber: When not None, keep only rows of this show. The check is
            against None rather than truthiness so show number 0 is honoured.
        StartDate: Optional 'MM/DD/YYYY' string; rows with an earlier AirDate
            are dropped. Falsy values (None, '') disable the date filter.
        rows_per_group: Maximum rows kept per (Category, Round) pair.
            Defaults to 5.

    Returns:
        DataFrame with at most rows_per_group rows for every
        (Category, Round) pair, keeping the original index labels.
    """
    # Initial filtering based on Categories and ShowNumber
    keep = df['Category'].isin(Categories)
    if ShowNumber is not None:
        keep = keep & (df['ShowNumber'] == ShowNumber)
    filtered_df = df[keep]

    # Further filtering based on StartDate
    if StartDate:
        StartDate = pd.to_datetime(StartDate, format='%m/%d/%Y')
        filtered_df = filtered_df[filtered_df['AirDate'] >= StartDate]

    # Debug: Print the shape of the DataFrame after filtering
    print(f"Shape after initial filtering: {filtered_df.shape}")

    # Keep the first rows_per_group rows of each (Category, Round) group
    filtered_df = filtered_df.groupby(['Category', 'Round']).head(rows_per_group)

    # Debug: Print the shape of the DataFrame after grouping
    print(f"Shape after grouping: {filtered_df.shape}")

    return filtered_df


# Parse AirDate from 'MM/DD/YYYY' strings into datetime values
df_updated['AirDate'] = pd.to_datetime(
    df_updated['AirDate'],
    format='%m/%d/%Y',
)

# Apply the filter across all three categories (no show or date restriction)
Categories = ['SCIENCE', 'AMERICAN HISTORY', 'LITERATURE']
filtered_df = filter_dataframe(df=df_updated, Categories=Categories)
filtered_df.head()

Shape after initial filtering: (1387, 8)
Shape after grouping: (45, 8)


Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,MockHumanAnswer
0,4931,2006-02-06,Double Jeopardy!,SCIENCE,$400,"At sea level at 70 degrees this travels 1,129 feet per second; it speeds up over 1 foot per sec. for each rising degree",sound,sound
1,4931,2006-02-06,Double Jeopardy!,SCIENCE,$800,"The largest tree, the General Sherman in California, is this type, also called a Sierra Redwood",a sequoia,a sequoia
3,4931,2006-02-06,Double Jeopardy!,SCIENCE,"$1,600",6 elements once known as inert gases are now known by this aristocratic name,noble gases,noble gases
5,1800,1992-05-29,Double Jeopardy!,SCIENCE,$200,"In metric measurement, 10 millimeters equal 1 of these",a centimeter,a centimeter
6,1800,1992-05-29,Double Jeopardy!,SCIENCE,$400,"Boyle's Law says normally if you double the pressure on a gas, the volume decreases by this amount",one-half,one-half


In [204]:
# Rows kept after the per-(Category, Round) cap
len(filtered_df)

45

In [205]:
# Rows available before the per-(Category, Round) cap
len(df_updated)

1387

## Create mock data by making typos on answers

In [206]:
 # Initialize the Ollama model
# temperature=0 is meant to keep generations as repeatable as possible
llm = ChatOllama(model="mistral:7b-instruct", temperature=0)

# Define a function to generate misspelled variations of an answer
def generate_variations(answer):
    """Ask the LLM for a variation of ``answer`` with one letter repeated.

    Args:
        answer: The reference answer text to misspell.

    Returns:
        list[str]: The non-empty lines of the model reply with surrounding
        whitespace removed. The prompt requests exactly one variation, but
        every non-blank line is returned; the list is empty when the reply
        contains no text.
    """
    prompt = ChatPromptTemplate.from_messages([
        SystemMessage(content="""You are an expert in creating text variations on existing Answers. 
        1. Generate exactly 1 variation by repeating 1 letter in the Answer for the given Answer.
        2. Do not include any explanations or additional text.
        3. Only return the 1 variation.
        """),
        HumanMessage(content=f"Answer: {answer}")
    ])
    response = llm.invoke(prompt.format_messages())
    # Strip each line and drop blanks so leading/trailing newlines or padding
    # in the reply never turn into empty or whitespace-only mock answers
    variations = [line.strip() for line in response.content.splitlines() if line.strip()]
    return variations

# Accumulate one mock row per generated variation
variations_list = []

# Walk every row of filtered_df and clone it once for each variation of its Answer
for row_label, source_row in filtered_df.iterrows():
    for variation in generate_variations(source_row['Answer']):
        mock_row = source_row.copy()
        mock_row['MockHumanAnswer'] = variation
        variations_list.append(mock_row)

# Build a DataFrame from the collected rows
variations_df = pd.DataFrame(variations_list)

## Create mock data by answering the question

In [None]:
# Initialize the Ollama model (same model and temperature as the typo-generation cell)
llm = ChatOllama(model="mistral:7b-instruct", temperature=0)

# Define a function to generate fake answers based on the question
def generate_fake_answer(question):
    """Ask the LLM for a short fake answer to ``question``.

    Args:
        question: The clue text sent to the model.

    Returns:
        str: The first three whitespace-separated words of the model reply,
        joined by single spaces; an empty string when the reply has no text.
    """
    prompt = ChatPromptTemplate.from_messages([
        SystemMessage(content="""You are an expert in generating fake answers. 
        1. Generate exactly 1 fake answer based on the given Question.
        2. The answer must be no more than 3 words long.
        3. Do not include any explanations or additional text.
        4. Only return the answer with at most 3 words.
        """),
        HumanMessage(content=f"Question: {question}")
    ])
    response = llm.invoke(prompt.format_messages())
    # Enforce the three-word limit the prompt asks for. Keeping only the first
    # word truncated multi-word replies, and indexing [0] raised IndexError
    # whenever the reply was empty.
    words = response.content.split()
    fake_answer = ' '.join(words[:3])
    return fake_answer

# Accumulate one mock row per question
fake_answers_list = []

# Walk every row of filtered_df and clone it with the LLM's fake answer
for row_label, source_row in filtered_df.iterrows():
    mock_row = source_row.copy()
    mock_row['MockHumanAnswer'] = generate_fake_answer(source_row['Question'])
    fake_answers_list.append(mock_row)

# Build a DataFrame from the collected rows
fake_answers_df = pd.DataFrame(fake_answers_list)

In [208]:
# Preview the rows whose MockHumanAnswer holds an LLM-generated fake answer
fake_answers_df.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,MockHumanAnswer
0,4931,2006-02-06,Double Jeopardy!,SCIENCE,$400,"At sea level at 70 degrees this travels 1,129 feet per second; it speeds up over 1 foot per sec. for each rising degree",sound,Speeds
1,4931,2006-02-06,Double Jeopardy!,SCIENCE,$800,"The largest tree, the General Sherman in California, is this type, also called a Sierra Redwood",a sequoia,Giant
3,4931,2006-02-06,Double Jeopardy!,SCIENCE,"$1,600",6 elements once known as inert gases are now known by this aristocratic name,noble gases,Noble
5,1800,1992-05-29,Double Jeopardy!,SCIENCE,$200,"In metric measurement, 10 millimeters equal 1 of these",a centimeter,Centimeters
6,1800,1992-05-29,Double Jeopardy!,SCIENCE,$400,"Boyle's Law says normally if you double the pressure on a gas, the volume decreases by this amount",one-half,Volume


In [220]:
# Stack the original rows, the fake-answer rows and the typo-variation rows
# (in that order) into one frame with a fresh 0..n-1 index
mock_frames = [filtered_df, fake_answers_df, variations_df]
allcats_mocks_df = pd.concat(mock_frames, ignore_index=True)

In [221]:
# Display the first rows of the combined DataFrame; filtered_df is concatenated
# first, so these rows have MockHumanAnswer equal to Answer
allcats_mocks_df.head(3)

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,MockHumanAnswer
0,4931,2006-02-06,Double Jeopardy!,SCIENCE,$400,"At sea level at 70 degrees this travels 1,129 feet per second; it speeds up over 1 foot per sec. for each rising degree",sound,sound
1,4931,2006-02-06,Double Jeopardy!,SCIENCE,$800,"The largest tree, the General Sherman in California, is this type, also called a Sierra Redwood",a sequoia,a sequoia
2,4931,2006-02-06,Double Jeopardy!,SCIENCE,"$1,600",6 elements once known as inert gases are now known by this aristocratic name,noble gases,noble gases


In [223]:
# Spot-check one question: show every mock row generated for it
specific_question = "The largest tree, the General Sherman in California, is this type, also called a Sierra Redwood"
is_specific_question = allcats_mocks_df['Question'].eq(specific_question)
filtered_rows = allcats_mocks_df.loc[is_specific_question]
filtered_rows

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,MockHumanAnswer
1,4931,2006-02-06,Double Jeopardy!,SCIENCE,$800,"The largest tree, the General Sherman in California, is this type, also called a Sierra Redwood",a sequoia,a sequoia
46,4931,2006-02-06,Double Jeopardy!,SCIENCE,$800,"The largest tree, the General Sherman in California, is this type, also called a Sierra Redwood",a sequoia,Giant
91,4931,2006-02-06,Double Jeopardy!,SCIENCE,$800,"The largest tree, the General Sherman in California, is this type, also called a Sierra Redwood",a sequoia,an sequoia


In [224]:
# Preview the first rows of the combined mock frame
allcats_mocks_df.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,MockHumanAnswer
0,4931,2006-02-06,Double Jeopardy!,SCIENCE,$400,"At sea level at 70 degrees this travels 1,129 feet per second; it speeds up over 1 foot per sec. for each rising degree",sound,sound
1,4931,2006-02-06,Double Jeopardy!,SCIENCE,$800,"The largest tree, the General Sherman in California, is this type, also called a Sierra Redwood",a sequoia,a sequoia
2,4931,2006-02-06,Double Jeopardy!,SCIENCE,"$1,600",6 elements once known as inert gases are now known by this aristocratic name,noble gases,noble gases
3,1800,1992-05-29,Double Jeopardy!,SCIENCE,$200,"In metric measurement, 10 millimeters equal 1 of these",a centimeter,a centimeter
4,1800,1992-05-29,Double Jeopardy!,SCIENCE,$400,"Boyle's Law says normally if you double the pressure on a gas, the volume decreases by this amount",one-half,one-half


In [225]:
# Count rows whose MockHumanAnswer is a missing value
mock_answer_is_missing = allcats_mocks_df['MockHumanAnswer'].isnull()
missing_mock_human_answer_count = mock_answer_is_missing.sum()
print(f"Number of rows with missing MockHumanAnswer: {missing_mock_human_answer_count}")

Number of rows with missing MockHumanAnswer: 0


In [226]:
# Count rows whose MockHumanAnswer is the literal text 'None'
mock_answer_is_none_text = allcats_mocks_df['MockHumanAnswer'].eq('None')
none_mock_human_answer_count = mock_answer_is_none_text.sum()
print(f"Number of rows with MockHumanAnswer of None: {none_mock_human_answer_count}")

Number of rows with MockHumanAnswer of None: 0


In [227]:
# Drop rows whose MockHumanAnswer is the literal text 'None'
allcats_mocks_df = allcats_mocks_df.loc[allcats_mocks_df['MockHumanAnswer'].ne('None')]
allcats_mocks_df.head(n=3)

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,MockHumanAnswer
0,4931,2006-02-06,Double Jeopardy!,SCIENCE,$400,"At sea level at 70 degrees this travels 1,129 feet per second; it speeds up over 1 foot per sec. for each rising degree",sound,sound
1,4931,2006-02-06,Double Jeopardy!,SCIENCE,$800,"The largest tree, the General Sherman in California, is this type, also called a Sierra Redwood",a sequoia,a sequoia
2,4931,2006-02-06,Double Jeopardy!,SCIENCE,"$1,600",6 elements once known as inert gases are now known by this aristocratic name,noble gases,noble gases


In [228]:
# Verify row counts at each stage of the pipeline

# df: rows remaining after the media-link filter
print(f'Length of df: {len(df)}. Expected: 1387.')

# filtered_df (labelled allcats_df in the message): up to 5 rows for each
# (Category, Round) pair -> 5 rows * 3 categories * 3 rounds = 45
print(f'Length of allcats_df: {len(filtered_df)}. Expected: 45.')

# allcats_mocks_df: filtered_df rows + one fake-answer row per question + the
# typo-variation rows; typically 3 * len(filtered_df), but the exact number
# depends on how many lines the LLM returns per answer
print(f'Length of allcats_mocks_df: {len(allcats_mocks_df)}. Expected: Varies based on llm generation.')

Length of df: 1387. Expected: 1387.
Length of allcats_df: 45. Expected: 45.
Length of allcats_mocks_df: 135. Expected: Varies based on llm generation.


In [229]:
# Write out the combined mock DataFrame (original rows, fake answers and typo
# variations) as a CSV for use as testing data; index=False omits the row index
allcats_mocks_df.to_csv('mock_data.csv', index=False)