In [35]:
import pandas as pd

# Query ids evaluated throughout this notebook.
test_qids = [8, 15, 29, 39, 43]

# Pipeline results (used below via their qid / model / title columns) and the
# test-query table (used below via its qid / query columns).
df = pd.read_csv("results/results_12_2_query.csv")
df_test = pd.read_csv("data/test.csv")

# Keep only the result rows that belong to the evaluated queries.
df_selected = df.loc[df["qid"].isin(test_qids)]

In [36]:
# Map each test qid to the array of titles that the "pipeline_6" model
# returned for it, in the order they appear in df_selected.
is_pipeline_6 = df_selected["model"] == "pipeline_6"
ranker = {
    qid: df_selected.loc[(df_selected["qid"] == qid) & is_pipeline_6, "title"].values
    for qid in test_qids
}

In [37]:
# Show the distinct values of the `model` column in the selected results.
df_selected.model.unique()

array(['pipeline_0', 'pipeline_1', 'pipeline_2', 'pipeline_3',
       'pipeline_4', 'pipeline_5', 'pipeline_6'], dtype=object)

### Prompt generation

In [38]:
# Load the tab-separated movie table; later cells use its `id` and `title` columns.
movie_df = pd.read_csv("data/movie_df.csv", sep="\t")
movie_df.head()

Unnamed: 0,id,data,title
0,1,The movie Toy Story (1995) belongs to the foll...,Toy Story (1995)
1,2,The movie Jumanji (1995) belongs to the follow...,Jumanji (1995)
2,3,The movie Grumpier Old Men (1995) belongs to t...,Grumpier Old Men (1995)
3,4,The movie Waiting to Exhale (1995) belongs to ...,Waiting to Exhale (1995)
4,5,The movie Father of the Bride Part II (1995) b...,Father of the Bride Part II (1995)


In [39]:
# Re-ranking prompt, filled in with str.format: `{num}` is the number of
# candidate movies, `{query}` the search query text, and `{movie_infos}` the
# newline-separated list of "[i] title" entries.
prompt_template = """I will provide you with {num} movies, each indicated by a numerical identifier [].
Rank the movies based on their relevance to the search query: {query}. Here are the movies that you need to rank:
{movie_infos}
Please rank the {num} movies above based on their relevance to the search query. 
All the movies should be included and listed using identifiers, in descending order of relevance. 
The output format should be [] > [] > ... > [], e.g., [4] > [2] > [3] > [1] > [5] if 5 movies are given. 
Only respond with the ranking results, do not say any word or explain the reason for the ranking and keep the answer
as short as possible.
"""

In [40]:
# Build one re-ranking prompt per test query from the candidate titles in `ranker`.
prompts = []
for qid, movies in ranker.items():
    # The first df_test row with this qid supplies the query text.
    query = df_test.loc[df_test["qid"] == qid].iloc[0]["query"]

    numbered_titles = []
    for position, movie in enumerate(movies, start=1):
        # Look the title up in movie_df; raises IndexError if it is not there.
        movie_row = movie_df.loc[movie_df["title"] == movie].iloc[0]
        numbered_titles.append(f"[{position}] {movie_row['title']}\n")
    movie_infos = "".join(numbered_titles)

    prompts.append(
        prompt_template.format(num=len(movies), query=query, movie_infos=movie_infos)
    )

# Write every prompt to disk, joined with "\n\n".
with open("prompts.txt", "w") as f:
    f.write("\n\n".join(prompts))

### OpenAI Response

In [41]:
# from openai import OpenAI

# client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

# chat_answers = []
# for prompt in prompts:
#     completion = client.chat.completions.create(
#         model="local-model", # this field is currently unused
#         messages=[
#             {"role": "system", "content": "Suppose you are a movie critic and you are asked to rank a list of movies based on their relevance to a search query."},
#             {"role": "user", "content": prompt},
#         ],
#         temperature=0.7,
#     )
#     chat_answers.append(completion.choices[0].message.content)

# with open("chat_answers.txt", "w") as f:
#     f.write("\n\n\n".join(chat_answers))


In [42]:
# Load the saved model answers and split them on "\n\n\n", giving one entry
# per answer.
# NOTE(review): the commented-out cell above writes "chat_answers.txt" with the
# same separator; presumably "gpt4_answer.txt" was produced separately — confirm.
with open("gpt4_answer.txt", "r") as f:
    chat_answers = f.read().split("\n\n\n")

### Analysis of LLM

In [43]:
import re

# Placeholder doc id for a ranked entry that cannot be mapped to a movie.
# It keeps the entry's rank position in the list; it is assumed not to collide
# with any real movie id, so the evaluation treats it as an unjudged document.
UNRESOLVED_DOC_ID = -1


def _resolve_candidate(token, candidates):
    """Map one bracketed token from an LLM answer to a candidate title.

    Args:
        token: Text found between "[" and "]" in the answer, e.g. "3".
        candidates: Sequence of candidate titles, in the order they were
            numbered in the prompt (identifier i refers to candidates[i - 1]).

    Returns:
        The matching title, or None when the token cannot be resolved.
    """
    token = token.strip()
    try:
        position = int(token)
    except ValueError:
        position = None

    if position is not None:
        # Identifiers are 1-based. Reject 0 and negatives explicitly, because
        # Python would otherwise wrap them around to the end of the list.
        if 1 <= position <= len(candidates):
            return candidates[position - 1]
        return None

    # Fallback for answers that put (part of) a title inside the brackets:
    # match on the first four characters of the title.
    prefix = token[:4]
    if not prefix:
        return None
    for title in candidates:
        if title[:4] == prefix:
            return title
    return None


# For each test query, turn the first line of the LLM answer
# ("[4] > [2] > ...") into a ranked list of movie ids.
ranker1 = {}
for qid, answer in zip(test_qids, chat_answers):
    first_line = answer.split("\n")[0]
    candidates = ranker[qid]
    ranked_ids = []
    for token in re.findall(r"\[(.*?)\]", first_line):
        title = _resolve_candidate(token, candidates)
        if title is None:
            # Keep the position instead of crashing on a failed title lookup.
            ranked_ids.append(UNRESOLVED_DOC_ID)
        else:
            ranked_ids.append(movie_df[movie_df.title == title].id.values[0])
    ranker1[qid] = ranked_ids

In [44]:
ranker1

{8: [167832, 22, 97304, 3499, 3798, 5266, 1219, 1982, 1387, 213347],
 15: [953, 6936, 586, 317, 2423, 2804, 41573, 208939, 117887, 8607],
 29: [27397,
  92243,
  141890,
  53342,
  91880,
  165431,
  54607,
  61434,
  158272,
  127315],
 39: [3916, 1954, 72641, 33660, 524, 7263, 31225, 261131, 2082, 34153],
 43: [134130, 26124, 3981, 3354, 4942, 166526, 161592, 247150, 26398, 2662]}

### Relevance Helper

In [45]:
import numpy as np
import pandas as pd
from tqdm import tqdm


def map_score(search_result_relevances: list[int], cut_off=10) -> float:
    """Compute the average-precision score of one ranked result list.

    Precision is accumulated at every position (within the first `cut_off`
    results) whose relevance is greater than 0, and the sum is divided by
    `cut_off`.

    NOTE(review): the textbook definition of average precision divides by the
    number of relevant documents rather than by the cut-off; the cut-off
    normalisation is kept here so existing scores do not change.

    Args:
        search_result_relevances: Relevance of each returned document, in
            rank order; values > 0 count as relevant.
        cut_off: Number of top results to consider, also the normaliser.

    Returns:
        The score, or 0 when no relevant document is found in the top
        `cut_off` results (including when the list is empty).
    """
    # Kept from the original: echoes the input so the notebook output shows
    # the per-query relevance lists.
    print(search_result_relevances)
    # Lists shorter than cut_off must not be indexed past their end.
    depth = min(cut_off, len(search_result_relevances))
    hit_positions = [pos for pos in range(depth) if search_result_relevances[pos] > 0]
    # The i-th hit (0-based) found at rank pos + 1 has precision (i + 1) / (pos + 1).
    precision_at_hits = [(i + 1) / (pos + 1) for i, pos in enumerate(hit_positions)]
    return np.sum(precision_at_hits) / cut_off if len(precision_at_hits) > 0 else 0


def ndcg_score(search_result_relevances: list[float],
               ideal_relevance_score_ordering: list[float], cut_off=10):
    """Compute NDCG for one ranked result list.

    The discounted gain of a list is its first score plus, for every later
    position p (1-based, up to `cut_off`), score / log2(p). NDCG is the
    discounted gain of the results divided by that of the ideal ordering.

    Args:
        search_result_relevances: Relevance of each returned document, in
            rank order.
        ideal_relevance_score_ordering: Relevance scores in the ideal order.
        cut_off: Maximum number of positions considered in either list.

    Returns:
        DCG / IDCG, or 0 when the ideal discounted gain is not positive.
    """
    print(search_result_relevances)

    def discounted_gain(scores):
        # Empty input (or a zero cut-off) contributes no gain.
        depth = min(len(scores), cut_off)
        if depth == 0:
            return 0
        # enumerate starts at 0 for the second-ranked item, hence rank + 2.
        tail = [gain / np.log2(rank + 2) for rank, gain in enumerate(scores[1:depth])]
        return scores[0] + np.sum(tail)

    dcg = discounted_gain(search_result_relevances)
    idcg = discounted_gain(ideal_relevance_score_ordering)
    if idcg > 0:
        return dcg / idcg
    return 0


def run_relevance_tests(relevance_data_filename: str, ranker) -> dict[str, float]:
    """Score a ranker against relevance judgments with MAP and NDCG.

    Args:
        relevance_data_filename: CSV file with (at least) the columns `qid`,
            `docid` and `rel`.
        ranker: Mapping from qid to the ranked list of doc ids returned for
            that query; it is indexed with every qid found in the CSV.

    Returns:
        A dict with the mean scores under 'map' and 'ndcg', and the
        per-query lists under 'map_scores' and 'ndcg_scores'.
    """
    judgments = pd.read_csv(relevance_data_filename)
    per_query_map = []
    per_query_ndcg = []

    for qid, query_rows in tqdm(judgments.groupby('qid')):
        # Judged relevance of every document for this query.
        doc_to_rel = {row['docid']: row['rel'] for _, row in query_rows.iterrows()}

        # Returned documents without a judgment default to relevance 1.
        result_rel = [doc_to_rel.get(doc, 1) for doc in ranker[qid]]
        ideal_rel = sorted(doc_to_rel.values(), reverse=True)

        # MAP works on binary relevance: only scores above 3 count as relevant.
        binary_rel = [1 if rel > 3 else 0 for rel in result_rel]
        per_query_map.append(map_score(binary_rel))
        per_query_ndcg.append(ndcg_score(result_rel, ideal_rel))

    return {
        'map': np.mean(per_query_map),
        'ndcg': np.mean(per_query_ndcg),
        'map_scores': per_query_map,
        'ndcg_scores': per_query_ndcg,
    }

### Evaluation

In [46]:
# Evaluate the LLM re-ranked lists (ranker1) against the judgments in data/test.csv.
a = run_relevance_tests("data/test.csv", ranker1)
a

100%|██████████| 5/5 [00:00<00:00, 1074.53it/s]

[1, 1, 1, 1, 0, 0, 1, 0, 0, 0]
[5, 4, 4, 4, 3, 3, 5, 2, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[5, 5, 5, 5, 5, 5, 5, 5, 5, 4]
[1, 0, 0, 0, 0, 0, 0, 1, 0, 0]
[4, 3, 3, 1, 1, 1, 3, 4, 1, 2]
[1, 1, 1, 1, 1, 0, 1, 0, 1, 0]
[5, 4, 4, 4, 4, 1, 5, 1, 4, 1]
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
[5, 4, 3, 3, 1, 3, 1, 1, 3, 1]





{'map': 0.491984126984127,
 'ndcg': 0.8054725683587982,
 'map_scores': [0.4714285714285714, 1.0, 0.125, 0.6634920634920635, 0.2],
 'ndcg_scores': [0.7519161237423574,
  0.9885419998064646,
  0.6863756975224844,
  0.7526720307263396,
  0.8478569899963447]}

In [47]:
# Mean, standard deviation, minimum and maximum of the per-query NDCG scores.
for stat in (np.mean, np.std, np.min, np.max):
    print(stat(a['ndcg_scores']))

0.8054725683587982
0.10502486382314746
0.6863756975224844
0.9885419998064646


In [25]:
# Baseline: evaluate the original "pipeline_6" ordering, without LLM re-ranking.
# `ranker` holds movie titles, while the relevance lookup is keyed by doc id,
# so the titles are converted first. The result goes into a new dict rather
# than overwriting `ranker`: an in-place conversion cannot be re-run, and
# skipping it would score every title as an unjudged document.
ranker_ids = {
    qid: [movie_df[movie_df.title == title].id.values[0] for title in titles]
    for qid, titles in ranker.items()
}

run_relevance_tests("data/test.csv", ranker_ids)

100%|██████████| 5/5 [00:00<00:00, 1038.40it/s]

[1, 0, 0, 0, 1, 1, 1, 0, 1, 0]
[5, 3, 2, 1, 4, 4, 5, 1, 4, 3]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[5, 5, 5, 5, 5, 5, 5, 5, 5, 4]
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0]
[4, 2, 3, 3, 1, 4, 1, 1, 1, 3]
[1, 1, 1, 1, 1, 0, 1, 0, 0, 1]
[5, 4, 4, 4, 4, 1, 5, 1, 1, 4]
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
[5, 4, 1, 3, 3, 3, 1, 3, 1, 1]





{'map': 0.4583492063492064,
 'ndcg': 0.7850475744081123,
 'map_scores': [0.3026984126984127,
  1.0,
  0.13333333333333333,
  0.6557142857142857,
  0.2],
 'ndcg_scores': [0.683629066933712,
  0.9885419998064646,
  0.6736511294142714,
  0.7509841338979839,
  0.8284315419881301]}