In [1]:
from torch import cuda

# Select the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Ask PyTorch to release any cached GPU memory before the notebook starts working.
cuda.empty_cache()
print(device)

cuda


In [2]:
import os, glob
import pandas as pd
import tiktoken

# Tokenizer for the "cl100k_base" encoding; used later to count tokens in card texts and prompts.
tokenizer = tiktoken.get_encoding("cl100k_base")

In [3]:
import openai
# Send all OpenAI requests to the endpoint at openai.vocareum.com.
openai.api_base = "https://openai.vocareum.com/v1"
# Prefer a key supplied through the OPENAI_API_KEY environment variable so that no real
# credential has to be written into the notebook; the placeholder remains the fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOU API KEY")

In [32]:
# Model name passed to openai.Completion.create.
COMPLETION_MODEL="gpt-3.5-turbo-instruct"
# Token budget that create_prompt allows for the assembled prompt.
COMPLETION_TOKEN_LIMIT=3900
# Model name passed to openai.Embedding.create and get_embedding.
EMBEDDING_MODEL="text-embedding-ada-002"
# NOTE(review): not referenced anywhere in the visible notebook — confirm whether it is still needed.
EMBEDDING_TOKEN_LIMIT=8000

In [8]:
# Two test questions about recently printed cards; each is asked once without
# and once with retrieved card context so the answers can be compared.
q1 = "Can you list all creature cards of type Dragon that were printed in 2024?"
q2 = "It's 2025. What is the name of the last legendary sliver that was printed?"

In [9]:
def get_completion(question):
    """Ask the completion model a question without any retrieved context.

    The question is wrapped in a small "Question / Answer" prompt, and that
    prompt (not the bare question) is what gets sent to the model, so the
    returned transcript shows exactly the text the model received.

    Args:
        question: The question text to ask.

    Returns:
        The prompt followed by the model's stripped answer and a trailing
        carriage-return/newline pair. The same string is also printed.
    """
    prompt = """
Question: {question}
Answer:
""".format(question=question)
    answer = openai.Completion.create(
        model=COMPLETION_MODEL,
        # Send the formatted prompt; passing the raw question here would make
        # the transcript below misrepresent what the model was asked.
        prompt=prompt,
        max_tokens=150
    )["choices"][0]["text"].strip()

    completion = prompt + answer + '\r\n'
    print(completion)
    return completion

In [10]:
c1 = get_completion(q1)
c2 = get_completion(q2)


Question: Can you list all creature cards of type Dragon that were printed in 2024?
Answer:
As a AI, I don't have the data regarding future cards. Hence, I cannot provide a list of creature cards of type Dragon that were printed in 2024.


Question: It's 2025. What is the name of the last legendary sliver that was printed?
Answer:
Unfortunately, as a language model AI, I do not have access to information beyond the current year. I am unable to answer your question. Apologies for the inconvenience.



In [11]:
# NOTE(review): same encoding as the tokenizer created in an earlier cell; this re-creation looks redundant.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Load the card table; its "text" column is what gets tokenized and embedded in the following cells.
df = pd.read_csv("data/cards.csv")

def count_tokens(text):
    """Return how many tokens the module-level tokenizer produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Store each card's token count so that prompt construction can budget the context later.
df["token_count"] = df["text"].map(count_tokens)
df.head()

Unnamed: 0,text,release_date,set_code,card_id,token_count
0,\nName: Plague Rats\nColors: B\nManacost: {2}{...,1993-08-05,LEA,Plague Rats,74
1,\nName: Shivan Dragon\nColors: R\nManacost: {4...,1993-08-05,LEA,Shivan Dragon,67
2,\nName: Stone Giant\nColors: R\nManacost: {2}{...,1993-08-05,LEA,Stone Giant,83
3,\nName: Two-Headed Giant of Foriys\nColors: R\...,1993-08-05,LEA,Two-Headed Giant of Foriys,74
4,\nName: Uthden Troll\nColors: R\nManacost: {2}...,1993-08-05,LEA,Uthden Troll,59


In [12]:
# Summary statistics for the numeric columns (here: token_count).
df.describe()

Unnamed: 0,token_count
count,16807.0
mean,91.294044
std,22.885193
min,45.0
25%,74.0
50%,88.0
75%,107.0
max,217.0


In [13]:
batch_size = 1000
embeddings = []

# Embed the card texts in consecutive chunks of `batch_size` rows,
# issuing one embedding request per chunk.
for i in range(0, len(df), batch_size):
    print(f"Batch: {i}")
    texts = df["text"].iloc[i:i + batch_size].tolist()

    response = openai.Embedding.create(input=texts, engine=EMBEDDING_MODEL)

    # Keep only the vector of every returned item, in response order.
    for item in response["data"]:
        embeddings.append(item["embedding"])

# Attach the collected vectors to the frame, one per row.
df["embeddings"] = embeddings
df

Batch: 0
Batch: 1000
Batch: 2000
Batch: 3000
Batch: 4000
Batch: 5000
Batch: 6000
Batch: 7000
Batch: 8000
Batch: 9000
Batch: 10000
Batch: 11000
Batch: 12000
Batch: 13000
Batch: 14000
Batch: 15000
Batch: 16000


Unnamed: 0,text,release_date,set_code,card_id,token_count,embeddings
0,\nName: Plague Rats\nColors: B\nManacost: {2}{...,1993-08-05,LEA,Plague Rats,74,"[0.009661226533353329, -0.012912601232528687, ..."
1,\nName: Shivan Dragon\nColors: R\nManacost: {4...,1993-08-05,LEA,Shivan Dragon,67,"[0.011478732340037823, -0.03415033966302872, -..."
2,\nName: Stone Giant\nColors: R\nManacost: {2}{...,1993-08-05,LEA,Stone Giant,83,"[0.017620408907532692, -0.03125821426510811, -..."
3,\nName: Two-Headed Giant of Foriys\nColors: R\...,1993-08-05,LEA,Two-Headed Giant of Foriys,74,"[0.00279824435710907, -0.011686686426401138, -..."
4,\nName: Uthden Troll\nColors: R\nManacost: {2}...,1993-08-05,LEA,Uthden Troll,59,"[0.006812881212681532, -0.039526745676994324, ..."
...,...,...,...,...,...,...
16802,"\nName: Evereth, Viceroy of Plunder\nColors: B...",2024-11-15,J25,"Evereth, Viceroy of Plunder",140,"[-0.01681125909090042, -0.03981684893369675, 0..."
16803,"\nName: Aphelia, Viper Whisperer\nColors: B\nM...",2024-11-15,J25,"Aphelia, Viper Whisperer",135,"[-0.01605450175702572, 0.002617432503029704, -..."
16804,"\nName: Taeko, the Patient Avalanche\nColors: ...",2024-11-15,J25,"Taeko, the Patient Avalanche",119,"[0.0005341354408301413, -0.014380116946995258,..."
16805,"\nName: Daretti, Rocketeer Engineer\nColors: R...",2025-02-14,DFT,"Daretti, Rocketeer Engineer",105,"[-0.0015830239281058311, -0.022208718582987785..."


In [14]:
# Persist the full table, including the embeddings column, without writing the index.
df.to_csv("data/embeddings.csv", index=False)

In [15]:
import datetime

# Parse the release dates with a single vectorized call rather than invoking
# pd.to_datetime once per row.
df["release_date"] = pd.to_datetime(df["release_date"])

# Keep only the cards released after 2021-01-01 and store that subset separately.
df_s = df[df["release_date"] > datetime.datetime(2021, 1, 1)]
df_s.to_csv("data/embeddings_new_cards.csv", index=False)
df_s.head()

Unnamed: 0,text,release_date,set_code,card_id,token_count,embeddings
12084,\nName: Surtland Elementalist\nColors: U\nMana...,2021-02-05,KHM,Surtland Elementalist,100,"[0.006645162124186754, -0.0017845366382971406,..."
12085,"\nName: Lathril, Blade of the Elves\nColors: B...",2021-02-05,PKHM,"Lathril, Blade of the Elves",124,"[-0.012109244242310524, -0.018400194123387337,..."
12086,\nName: Cleaving Reaper\nColors: B\nManacost: ...,2021-02-05,KHM,Cleaving Reaper,94,"[-0.007777650840580463, -0.014852461405098438,..."
12087,\nName: Surtland Flinger\nColors: R\nManacost:...,2021-02-05,KHM,Surtland Flinger,111,"[0.0076932101510465145, -0.015223887749016285,..."
12088,\nName: Canopy Tactician\nColors: G\nManacost:...,2021-02-05,KHM,Canopy Tactician,71,"[-0.006393569055944681, -0.03395732492208481, ..."


In [16]:
from ast import literal_eval
# Reload the filtered cards from disk. The CSV holds each embedding as text,
# so literal_eval is applied to turn every cell back into a Python object.
df_s = pd.read_csv("data/embeddings_new_cards.csv")
df_s["embeddings"] = df_s["embeddings"].apply(literal_eval)
df_s.head()

Unnamed: 0,text,release_date,set_code,card_id,token_count,embeddings
0,\nName: Surtland Elementalist\nColors: U\nMana...,2021-02-05,KHM,Surtland Elementalist,100,"[0.006645162124186754, -0.0017845366382971406,..."
1,"\nName: Lathril, Blade of the Elves\nColors: B...",2021-02-05,PKHM,"Lathril, Blade of the Elves",124,"[-0.012109244242310524, -0.018400194123387337,..."
2,\nName: Cleaving Reaper\nColors: B\nManacost: ...,2021-02-05,KHM,Cleaving Reaper,94,"[-0.007777650840580463, -0.014852461405098438,..."
3,\nName: Surtland Flinger\nColors: R\nManacost:...,2021-02-05,KHM,Surtland Flinger,111,"[0.0076932101510465145, -0.015223887749016285,..."
4,\nName: Canopy Tactician\nColors: G\nManacost:...,2021-02-05,KHM,Canopy Tactician,71,"[-0.006393569055944681, -0.03395732492208481, ..."


In [18]:
from openai.embeddings_utils import get_embedding, distances_from_embeddings

def get_rows_sorted_by_relevance(question, df):
    """Rank the rows of ``df`` by how close their embeddings are to the question.

    Args:
        question: Text that is embedded with EMBEDDING_MODEL.
        df: DataFrame with an "embeddings" column.

    Returns:
        A copy of ``df`` with an added "distances" column (cosine distance
        between each row's embedding and the question embedding), ordered
        from the smallest to the largest distance. ``df`` itself is not
        modified.
    """
    query_vector = get_embedding(question, engine=EMBEDDING_MODEL)

    ranked = df.copy()
    ranked["distances"] = distances_from_embeddings(
        query_vector,
        ranked["embeddings"].values,
        distance_metric="cosine",
    )
    return ranked.sort_values("distances", ascending=True)

In [19]:
# Rank the cards by cosine distance to q1 and preview the closest matches.
# NOTE(review): the full `df` is searched here, not the 2021+ subset `df_s` — confirm this is intended.
p1_distances = get_rows_sorted_by_relevance(q1, df)
p1_distances.head()

Unnamed: 0,text,release_date,set_code,card_id,token_count,embeddings,distances
3612,\nName: Timeless Dragon\nColors: W\nManacost: ...,2002-06-24,PRM,Timeless Dragon,147,"[-0.0024310285225510597, -0.023831838741898537...",0.178517
13291,\nName: Draconic Muralists\nColors: G\nManacos...,2022-06-10,CLB,Draconic Muralists,89,"[-0.010951120406389236, -0.009599453769624233,...",0.181099
2870,\nName: Artificer's Dragon\nColors: \nManacost...,2002-06-24,PRM,Artificer's Dragon,125,"[0.014701064676046371, -0.02380487509071827, -...",0.182095
2966,"\nName: Dragonlord Dromoka\nColors: G, W\nMana...",2002-06-24,PRM,Dragonlord Dromoka,80,"[-0.0031070502009242773, -0.016883350908756256...",0.182899
13272,\nName: Chardalyn Dragon\nColors: \nManacost: ...,2022-06-10,CLB,Chardalyn Dragon,56,"[0.004365582950413227, -0.030372627079486847, ...",0.183069


In [20]:
# Rank the cards by cosine distance to q2 and preview the closest matches.
# NOTE(review): the full `df` is searched here, not the 2021+ subset `df_s` — confirm this is intended.
p2_distances = get_rows_sorted_by_relevance(q2, df)
p2_distances.head()

Unnamed: 0,text,release_date,set_code,card_id,token_count,embeddings,distances
5727,"\nName: Sliver Legion\nColors: B, G, R, U, W\n...",2007-05-04,FUT,Sliver Legion,81,"[0.005937935318797827, -0.010013777762651443, ...",0.178894
5725,\nName: Sliversmith\nColors: \nManacost: {2}\n...,2007-05-04,FUT,Sliversmith,76,"[-0.008756960742175579, -0.005619939882308245,...",0.186844
8381,"\nName: Sliver Hivelord\nColors: B, G, R, U, W...",2014-07-18,M15,Sliver Hivelord,91,"[0.0015946932835504413, -0.0015330156311392784...",0.190061
1301,"\nName: Sliver Queen\nColors: B, G, R, U, W\nM...",1998-03-02,STH,Sliver Queen,79,"[0.0011914916103705764, -0.007616425398737192,...",0.190189
1247,\nName: Metallic Sliver\nColors: \nManacost: {...,1997-10-14,TMP,Metallic Sliver,46,"[-0.00648983009159565, -0.01784372888505459, -...",0.191153


In [48]:
# Prompt skeleton for the context-augmented query: str.format fills {context} with the
# retrieved card texts and {question} with the user's question.
PROMPT_TEMPLATE = """
The year is 2025. You are given <context> that contains card information from 2021 until 2025.
Answer the following <question> based on the given context that contains information of newly printed cards.
Answer "I don't know" if an answer cannot be found.

Context: 

{context}

---

Question: {question}
Answer:"""

def create_prompt(question, df):
    """Build a context-augmented prompt that stays within the token budget.

    Rows of ``df`` are ranked by relevance to ``question`` and their texts are
    added to the context, most relevant first, until the next row would push
    the running token count past COMPLETION_TOKEN_LIMIT.

    The running count covers the template, the question, every added row's
    precomputed "token_count" and the separator placed between rows. It is an
    estimate: tokenizing the joined prompt may give a slightly different total
    than the sum of the parts.

    Args:
        question: The user question inserted into the prompt.
        df: DataFrame with "text", "token_count" and "embeddings" columns.

    Returns:
        PROMPT_TEMPLATE formatted with the selected texts (joined by
        "\\n###\\n") and the question. The context is empty when not even the
        most relevant row fits.
    """
    separator = "\n###\n"
    separator_tokens = len(tokenizer.encode(separator))
    current_token_count = len(tokenizer.encode(PROMPT_TEMPLATE)) + \
                            len(tokenizer.encode(question))

    context = []
    relevant_rows = get_rows_sorted_by_relevance(question, df)
    # Only two columns are needed, so iterate over them directly instead of
    # materializing a Series per row.
    for text, token_count in zip(relevant_rows["text"], relevant_rows["token_count"]):
        # Every row after the first also costs one separator.
        cost = token_count + (separator_tokens if context else 0)
        if current_token_count + cost > COMPLETION_TOKEN_LIMIT:
            break
        current_token_count += cost
        context.append(text)

    return PROMPT_TEMPLATE.format(context=separator.join(context), question=question)

# Sanity check: build the prompt for q2 and print its token count for comparison with COMPLETION_TOKEN_LIMIT.
# NOTE(review): the full `df` is passed here, not the 2021+ subset `df_s`, although the template
# describes the context as cards from 2021 until 2025 — confirm this is intended.
test_prompt = create_prompt(q2, df)
print(len(tokenizer.encode(test_prompt)))

3887


In [50]:
# Show the assembled prompt so the selected context cards can be inspected.
print(test_prompt)


The year is 2025. You are given <context> that contains card information from 2021 until 2025.
Answer the following <question> based on the given context that contains information of newly printed cards.
Answer "I don't know" if an answer cannot be found.

Context: 


Name: Sliver Legion
Colors: B, G, R, U, W
Manacost: {W}{U}{B}{R}{G}
Type: Legendary Creature — Sliver
Text: All Sliver creatures get +1/+1 for each other Sliver on the battlefield.
Set: Future Sight (FUT)
Release date: 2007-05-04

###

Name: Sliversmith
Colors: 
Manacost: {2}
Type: Artifact Creature — Spellshaper
Text: {1}, {T}, Discard a card: Create a 1/1 colorless Sliver artifact creature token named Metallic Sliver.
Set: Future Sight (FUT)
Release date: 2007-05-04

###

Name: Sliver Hivelord
Colors: B, G, R, U, W
Manacost: {W}{U}{B}{R}{G}
Type: Legendary Creature — Sliver
Text: Sliver creatures you control have indestructible. (Damage and effects that say "destroy" don't destroy them.)
Set: Magic 2015 (M15)
Release d

In [51]:
def answer_question(question, df, max_answer_tokens=150):
    """Answer a question using card context retrieved from ``df``.

    Args:
        question: The user question.
        df: DataFrame handed to create_prompt to select the context rows.
        max_answer_tokens: Upper bound on the number of tokens the model may
            generate for the answer.

    Returns:
        The model's answer with surrounding whitespace stripped, or an empty
        string when the completion request fails (the error is printed).
    """
    prompt = create_prompt(question, df)

    try:
        response = openai.Completion.create(
            model=COMPLETION_MODEL,
            prompt=prompt,
            max_tokens=max_answer_tokens
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        # `response` is unbound when create() itself raises, so report the
        # exception instead of masking it with an UnboundLocalError.
        print(e)
        return ""

In [55]:
# Compare the context-augmented answer (a1) with the context-free completion (c1) for q1.
# NOTE(review): the full `df` is passed as the context source, not the 2021+ subset `df_s` — confirm this is intended.
a1 = answer_question(q1, df)
print("Q: ", q1)
print("A custom: ", a1)
print("===")
print("A basic: ", c1)

Q:  Can you list all creature cards of type Dragon that were printed in 2024?
A custom:  Chardalyn Dragon, Bonehoard Dracosaur, Dragonhawk, Fate's Tempest, Dragon Trainer, The Powerful Dragon, Hoarding Dragon, Young Blue Dragon // Sand Augury, Astral Dragon, Volcanic Dragon, Brainstealer Dragon, Dragon Mage, Shimmer Dragon, Belltoll Dragon, Fledgling Dragon, and Ambitious Dragonborn were printed in 2024.
===
A basic:  
Question: Can you list all creature cards of type Dragon that were printed in 2024?
Answer:
As a AI, I don't have the data regarding future cards. Hence, I cannot provide a list of creature cards of type Dragon that were printed in 2024.



In [56]:
# Compare the context-augmented answer (a2) with the context-free completion (c2) for q2.
# NOTE(review): the full `df` is passed as the context source, not the 2021+ subset `df_s` — confirm this is intended.
a2 = answer_question(q2, df)
print("Q: ", q2)
print("A custom: ", a2)
print("===")
print("A basic: ", c2)

Q:  It's 2025. What is the name of the last legendary sliver that was printed?
A custom:  The last legendary sliver that was printed was The Fifteenth Doctor.
===
A basic:  
Question: It's 2025. What is the name of the last legendary sliver that was printed?
Answer:
Unfortunately, as a language model AI, I do not have access to information beyond the current year. I am unable to answer your question. Apologies for the inconvenience.



# CONCLUSION

Before the context was given, the OpenAI model was incapable of answering, citing its lack of information about the future as the reason.

After the context was given, it was able to give an answer,
even though the second answer is technically not correct.
We could probably improve this by:
- changing the way we build the context documents
- using few-shot inference in the prompt
- PEFT retraining of the base model