In [2]:
import csv
import gzip
import os
from collections import defaultdict

# Base directory for every input/output file: the notebook's working directory.
path = os.getcwd()

# Input files, each resolved relative to `path`.
queries_file, rankings_file, docs_lookup_file = (
    os.path.join(path, filename)
    for filename in (
        "msmarco-doctrain-queries.tsv",
        "msmarco-doctrain-top100",
        "msmarco-docs-lookup.tsv",
    )
)



In [3]:

# Step 1: Load Queries
# Every row of the queries file must have exactly two tab-separated columns:
# the query id and the query text.
query_dict = {}
with open(queries_file, 'rt', encoding='utf8') as queries_fh:
    query_dict.update(
        (query_id, query_text)
        for query_id, query_text in csv.reader(queries_fh, delimiter="\t")
    )
print(len(query_dict))

# Step 2: Load Rankings
# Every line of the rankings file must have exactly six space-separated
# fields; the 1st, 3rd, 4th and 5th are kept as qid, docid, rank and score.
# Rank is converted to int, score is kept as the raw string.
ranking_dict = defaultdict(list)
with open(rankings_file, "rt", encoding="utf-8") as rankings_fh:
    for raw_line in rankings_fh:
        fields = raw_line.split(" ")
        query_id, _unused_second, doc_id, rank_text, score_text, _unused_last = fields
        entry = (int(rank_text), doc_id, score_text)
        ranking_dict[query_id].append(entry)

print(len(ranking_dict))

367013


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1117b7890>>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


367012


In [4]:
# Optional Step: Load Document Offsets (if document content is needed)
# Every row of the lookup file must have exactly three tab-separated columns;
# the first becomes the key and the third is stored as an integer offset.
doc_offsets = {}
with open(docs_lookup_file, "r", encoding="utf-8") as lookup_fh:
    doc_offsets.update(
        (lookup_docid, int(tsv_offset))
        for lookup_docid, _, tsv_offset in csv.reader(lookup_fh, delimiter="\t")
    )
        

In [34]:
# Path of the tab-separated document collection that is passed to
# get_doc_content as its `file_path` argument.
docs_file = os.path.join(path, "msmarco-docs.tsv")
def get_doc_content(docid, file_path, offsets=None):
    """
    Return the body text of a single document stored in a TSV docs file.

    The record is located by seeking to the offset registered for ``docid``
    and reading one line, which is expected to hold four tab-separated
    columns: docid, url, title and body.

    Parameters:
        docid (str): Identifier to look up in the offset table.
        file_path (str): Path of the TSV file that holds the documents.
        offsets (dict, optional): Mapping of docid to the position of its
            record in ``file_path``. When omitted, the notebook-level
            ``doc_offsets`` table is used.

    Returns:
        str or None: The body column without the record's trailing line
        terminator. ``None`` is returned (after printing a message) when the
        docid has no registered offset, the record does not have four
        columns, the bytes are not valid UTF-8, or seeking/reading fails.
    """
    if offsets is None:
        offsets = doc_offsets  # notebook-level table built from the lookup file

    # Binary mode on purpose: seek() on a text-mode file is only defined for
    # zero or values previously returned by tell(), and universal-newline
    # handling would cut a record short at a bare "\r" inside the text.
    # NOTE(review): assumes the stored offsets are byte positions — confirm
    # against how the lookup file was produced.
    with open(file_path, "rb") as f:
        try:
            # Jump to the start of this document's record and read it.
            f.seek(offsets[docid])
            line = f.readline().decode("utf-8").rstrip("\r\n")

            # Only the first three tabs separate columns; any further tabs
            # belong to the body.
            parts = line.split("\t", maxsplit=3)
            if len(parts) != 4:
                raise ValueError(f"Malformed line: {line}")

            # Separate local names so the `docid` argument is not shadowed.
            _record_docid, _url, _title, body = parts
            return body
        except (KeyError, ValueError, OSError) as e:
            # Best effort: report the problem and let the caller decide how
            # to treat the missing document.
            print(f"Error processing docid {docid}: {e}")
            return None


# Write all ranks and doc ids

In [9]:

# Step 4: Write Ranked Output
# One TSV row per query: qid, query text, then a (docid, rank) pair for every
# ranked document in ascending rank order. Note that docid precedes rank here.
output_file = os.path.join(path, "ranked_output.tsv")
# newline="" hands line-ending control to the csv module; without it the
# writer's "\r\n" terminator is translated again on platforms whose line
# separator is not "\n", producing blank lines between rows.
with open(output_file, "w", encoding="utf-8", newline="") as out:
    tsvwriter = csv.writer(out, delimiter="\t")

    for qid, ranked_docs in ranking_dict.items():
        row = [qid, query_dict[qid]]  # Start row with qid and query
        # sorted() orders by rank (first tuple element) without mutating the
        # lists held in ranking_dict, so re-running cells stays side-effect free.
        for rank, docid, _score in sorted(ranked_docs):
            row.extend([docid, rank])  # Add docid, then rank
        tsvwriter.writerow(row)

# Write top 5 ranks and doc ids

In [10]:
# Step 4: Write Ranked Output (top `limit` documents per query)
# One TSV row per query: qid, query text, then a (rank, docid) pair for each of
# the `limit` best-ranked documents.
limit = 5
output_file = os.path.join(path, "ranked_output_limited.tsv")
# newline="" hands line-ending control to the csv module; without it the
# writer's "\r\n" terminator is translated again on platforms whose line
# separator is not "\n", producing blank lines between rows.
with open(output_file, "w", encoding="utf-8", newline="") as out:
    tsvwriter = csv.writer(out, delimiter="\t")

    for qid, ranked_docs in ranking_dict.items():
        row = [qid, query_dict[qid]]  # Start row with qid and query
        # Sort by rank without mutating ranking_dict, then keep the first
        # `limit` entries (the cut-off now honours `limit` instead of a
        # hard-coded 5).
        for rank, docid, _score in sorted(ranked_docs)[:limit]:
            row.extend([rank, docid])  # Add rank, then docid
        tsvwriter.writerow(row)

# Write top 5 ranks and document contents

In [23]:
from tqdm import tqdm

# Limit the number of results
limit = 5
output_file = os.path.join(path, "ranked_output_content.tsv")

# One TSV row per query: qid, query text, then a (rank, document body) pair for
# each of the `limit` best-ranked documents.
# newline="" hands line-ending control to the csv module; without it the
# writer's "\r\n" terminator is translated again on platforms whose line
# separator is not "\n", producing blank lines between rows.
with open(output_file, "w", encoding="utf-8", newline="") as out:
    tsvwriter = csv.writer(out, delimiter="\t")

    # Wrap ranking_dict.items() with tqdm for a progress bar
    for qid, ranked_docs in tqdm(ranking_dict.items(), desc="Processing Queries", unit="query"):
        row = [qid, query_dict[qid]]  # Start row with qid and query

        # Sort by rank without mutating ranking_dict, then keep the top `limit`.
        for rank, docid, _score in sorted(ranked_docs)[:limit]:
            # get_doc_content returns None when the lookup fails; csv.writer
            # writes None as an empty field, so the column layout is preserved.
            doc_content = get_doc_content(docid, docs_file)
            row.extend([rank, doc_content])  # Add rank and document content

        tsvwriter.writerow(row)


Processing Queries: 100%|███████████| 367012/367012 [12:29<00:00, 489.44query/s]


# Output FIRST format:

In [35]:
import random
def get_Conversations(query, docs, rng=None):
    """
    Build a system/human/gpt conversation asking for a ranking of passages.

    Each entry of ``docs`` receives a random 0-based identifier. The passages
    are then listed in a second random order inside the human prompt, and the
    gpt message lists the identifiers in the order the entries had in ``docs``
    (so ``docs`` is taken to be ordered from most to least relevant).

    Parameters:
        query (str): Search query inserted into the human prompt.
        docs (list): Passage contents, ordered from most to least relevant.
            Each entry is formatted with an f-string, so non-string entries
            are rendered through ``str()``.
        rng (optional): Object with a ``shuffle(list)`` method, e.g. a seeded
            ``random.Random`` instance, used for both shuffles. Defaults to
            the module-level ``random`` functions.

    Returns:
        list: Three dicts with keys "from" and "value", for the "system",
        "human" and "gpt" turns, in that order.
    """
    shuffler = random if rng is None else rng

    system_turn = {
        "from": "system",
        "value": "You are RankLLM, an intelligent assistant that can rank passages based on their relevancy to the query."
    }

    # Draw a random identifier for every document position.
    labels = list(range(len(docs)))
    shuffler.shuffle(labels)

    # Target answer: identifiers in the original (relevance) order of `docs`.
    sample_ranking = " > ".join(f"[{label}]" for label in labels)

    # Tag every passage with its identifier, then shuffle the presentation
    # order so that position in the prompt carries no relevance signal.
    passages = [f"[{label}] {content}" for label, content in zip(labels, docs)]
    shuffler.shuffle(passages)

    human_message = (
        f"I will provide you with {len(passages)} passages, each indicated by a numerical identifier []. "
        f"Rank the passages based on their relevance to the search query: {query}.\n\n"
    )
    human_message += "\n".join(passages)
    human_message += (
        f"\n\nSearch Query: {query}\nRank the {len(passages)} passages above based on their relevance to the search query. "
        "All the passages should be included and listed using identifiers, in descending order of relevance. "
        "The output format should be [] > [], e.g., [2] > [1], Only respond with the ranking results, do not say any word or explain."
    )

    return [
        system_turn,
        {"from": "human", "value": human_message},
        {"from": "gpt", "value": sample_ranking},
    ]


In [37]:
import os
import json
import uuid
from tqdm import tqdm

# Path and file name
output_file = os.path.join(path, "ranked_first.jsonl")

# Limit the number of results
limit = 5

# Writes one JSON object per line: {"id": <random uuid4 string>, "conversation": [...]}.
# Relies on ranking_dict, query_dict, docs_file, get_doc_content and
# get_Conversations defined in earlier cells.
# The encoding is explicit so the file is written the same way it is read back.
with open(output_file, 'w', encoding="utf-8") as f:
    # Wrap ranking_dict.items() with tqdm for a progress bar
    for qid, ranked_docs in tqdm(ranking_dict.items(), desc="Processing Queries", unit="query"):

        list_doc = []
        # Sort by rank without mutating ranking_dict, then keep the top `limit`.
        for _rank, docid, _score in sorted(ranked_docs)[:limit]:
            doc_content = get_doc_content(docid, docs_file)
            if doc_content is None:
                # The lookup failed (get_doc_content already reported it).
                # Skip the document so the prompt never contains the literal
                # passage text "None".
                continue
            list_doc.append(doc_content)

        if not list_doc:
            # Nothing to rank for this query; a conversation without passages
            # would be an empty example.
            continue

        # Generate conversation data
        conversation = get_Conversations(query_dict[qid], list_doc)

        # Each row carries only a random unique id and the conversation.
        row = {
            "id": str(uuid.uuid4()),
            "conversation": conversation
        }

        # Write the row to the JSONL file
        f.write(json.dumps(row) + "\n")


Processing Queries: 100%|███████████| 367012/367012 [07:59<00:00, 765.78query/s]


# Read file

In [9]:
import os
import json

# Define the file path
output_file = os.path.join(path, "ranked_first.jsonl")

# Parse at most the first five JSONL records; zip() stops as soon as
# range(5) is exhausted, so the rest of the file is never parsed.
top_5 = []
with open(output_file, "r", encoding="utf-8") as jsonl_fh:
    top_5.extend(
        json.loads(record) for _, record in zip(range(5), jsonl_fh)
    )

# Now `top_5` contains only the first 5 dictionaries from the file


{'id': '9d2fef94-23b8-416b-a4c5-cb7ab35ea930', 'conversation': [{'from': 'system', 'value': 'You are RankLLM, an intelligent assistant that can rank passages based on their relevancy to the query.'}, {'from': 'human', 'value': 'I will provide you with 5 passages, each indicated by a numerical identifier []. Rank the passages based on their relevance to the search query: feeding rice cereal how many times per day.\n\n[1] "Home > Health Library Feeding Guide for the First Year<< Back to Pediatrics Making appropriate food choices for your baby during the first year of life is very important. More growth occurs during the first year than at any other time in your child\'s life. It\'s important to feed your baby a variety of healthy foods at the proper time. Starting good eating habits at this early stage will help set healthy eating patterns for life. Recommended feeding guide for the first year Don\'t give solid foods unless your child\'s healthcare provider advises you to do so. Solid fo