In [1]:
import json
import ir_measures
from pyserini.search.lucene import LuceneSearcher
from bs4 import BeautifulSoup as bs
import tqdm

In [2]:
# Every sample built by this notebook is collected in this list.
dataset = list()

# Read the annotated topics JSON (the base data).
with open('../../data/datasets/cast/year_4/annotated_topics.json') as f:
    annotated_topics = json.load(f)

# Read the qrels and materialise them as a list so they can be iterated
# more than once.
qrels = [*ir_measures.read_trec_qrels(
    '../../data/datasets/cast/year_4/cast2022.qrel')]

# Open the Lucene searcher over the trecweb index; it is used to fetch the
# raw documents.
searcher = LuceneSearcher('../../data/indexes/sparse/cast/trecweb_index')

In [3]:
# Parse original data.
def get_text(doc_id: str):
    """Returns the passage text for a given passage-level id.

    Args:
        doc_id: Identifier of the form "<document id>-<passage index>". It is
            split on the last "-" and the passage index is zero-based.

    Returns:
        The text of the requested passage, or None when the searcher has no
        such document or the document has no passage at that index.

    Raises:
        ValueError: If `doc_id` contains no "-" or the passage index is not
            an integer.
    """
    document_id, passage_id = doc_id.rsplit("-", 1)
    passage_index = int(passage_id)

    stored_document = searcher.doc(document_id)
    if stored_document is None:
        # The searcher has no document with this id; report it the same way
        # as a missing passage instead of failing on `.raw()`.
        return None

    passages = bs(stored_document.raw(), "lxml").find_all("passage")
    # Index directly rather than scanning every passage; the bounds check
    # keeps the "not found -> None" contract.
    if 0 <= passage_index < len(passages):
        return passages[passage_index].text
    return None
        
# Group the qrels by query id once, so that each turn needs a single
# dictionary lookup instead of a scan over the whole qrels list.
qrels_by_turn = dict()
for qrel in qrels:
    qrels_by_turn.setdefault(qrel.query_id, []).append(
        {'passage_id': qrel.doc_id, 'relevance': qrel.relevance})

# Information need and judged passages per turn, keyed by
# "<topic number>_<turn number>".
stash = dict()
# Turn ids that were already processed; a repeated turn id is skipped.
parsed_turns = set()
for topic in tqdm.tqdm(annotated_topics):
    for index, turn in enumerate(topic['turn']):
        turn_id = f"{topic['number']}_{turn['number']}"
        if turn_id in parsed_turns:
            continue
        # Record the turn so the duplicate check above can actually trigger.
        parsed_turns.add(turn_id)

        information_need = turn.get("information_need")

        # Conversation history: every earlier turn of the topic as a
        # User/System exchange, followed by the current user utterance alone.
        utterances = [
            {
                'User': previous_turn.get("utterance"),
                'System': previous_turn.get("response")
            }
            for previous_turn in topic['turn'][:index]
        ]
        utterances.append({'User': turn.get("utterance")})

        # Judged passages for this turn, in qrels order (empty when the turn
        # has no judgements).
        passage_ids_with_relevance = list(qrels_by_turn.get(turn_id, ()))
        passage_texts = [
            get_text(item['passage_id']) for item in passage_ids_with_relevance]

        stash[turn_id] = {
            'information_need': information_need,
            'passages': passage_texts,
            'passage_ids_with_relevance': passage_ids_with_relevance
        }

        dataset.append({
            'information_need': information_need,
            'utterances': utterances,
            'passages': passage_texts,
            'passage_ids_with_relevance': passage_ids_with_relevance
        })

100%|██████████| 50/50 [02:10<00:00,  2.61s/it]


In [4]:
import pathlib
import re
import hashlib
# Parse generated data.
def _pair_turns(conversation):
    """Folds a flat list of turns into {'User': ..., 'System': ...} exchanges.

    An exchange is closed every time a non-user turn is seen, so a user
    utterance that is never followed by another turn is not emitted.
    """
    exchanges = []
    current_exchange = {}
    for turn in conversation:
        if turn['participant'] == "User":
            current_exchange['User'] = turn['utterance']
        else:
            current_exchange['System'] = turn['utterance']
            exchanges.append(current_exchange)
            current_exchange = {}
    return exchanges


def load_cast_convsim_conversations(
        path: str = '../../data/transcripts/convsim_outputs',
        max_feedback_turns: int = 2,
        max_feedback_words: int = 25,
    ):
    """Appends the generated conversations found under `path` to `dataset`.

    The directory tree is walked two levels deep
    (`path/<directory>/<subdirectory>/<file>`). A file is used only if the
    part of its name before the first "." looks like "<digits>_<digits>-<digits>"
    and its content is valid JSON. Conversations are kept when they contain
    between one and `max_feedback_turns` feedback turns, none of which is
    longer than `max_feedback_words` whitespace-separated words. Conversations
    with identical content are only added once.

    The information need and passages of each sample are taken from the
    module-level `stash`, looked up by the file's base name; the result is
    appended to the module-level `dataset`.

    Args:
        path: Root directory containing the conversation JSON files.
        max_feedback_turns: Largest number of feedback turns a conversation
            may have to be kept.
        max_feedback_words: Largest number of words a single feedback
            utterance may have for the conversation to be kept.

    Raises:
        KeyError: If a kept conversation's base name is not a key of `stash`.
    """
    seen_conversations = set()

    # Directory listings come back in arbitrary order; sorting makes both the
    # order of `dataset` and the choice of which duplicate is kept repeatable.
    for directory in tqdm.tqdm(sorted(pathlib.Path(path).iterdir())):
        if not directory.is_dir():
            # Stray files next to the run directories cannot be descended into.
            continue
        for subdirectory in sorted(directory.iterdir()):
            if not subdirectory.is_dir():
                continue
            for file in sorted(subdirectory.iterdir()):
                # The base name doubles as the key into `stash`.
                basename = file.name.split('.')[0]
                if not re.match(r'^\d+_\d+-\d+$', basename):
                    continue

                with open(file, 'r', encoding='utf-8') as f:
                    try:
                        conversation = json.load(f)
                    except json.decoder.JSONDecodeError:
                        # Unparseable transcripts are skipped.
                        continue

                # Filter on the number and the length of the feedback turns.
                feedback_turns = [
                    turn for turn in conversation if turn['type'] == 'feedback']
                if not 0 < len(feedback_turns) <= max_feedback_turns:
                    continue
                if any(len(turn['utterance'].split()) > max_feedback_words
                       for turn in feedback_turns):
                    continue

                # Deduplicate on a hash of the serialised conversation.
                conversation_hash = hashlib.sha256(
                    json.dumps(conversation).encode('utf-8')).hexdigest()
                if conversation_hash in seen_conversations:
                    continue
                seen_conversations.add(conversation_hash)

                reference = stash[basename]
                dataset.append({
                    'information_need': reference['information_need'],
                    'utterances': _pair_turns(conversation),
                    'passages': reference['passages'],
                    'passage_ids_with_relevance': reference['passage_ids_with_relevance']
                })

In [5]:
# Append the generated conversations found under the function's default
# transcripts directory to `dataset`.
load_cast_convsim_conversations()

172it [01:42,  1.68it/s]


In [9]:
# Total number of passages over all samples in the dataset.
count = sum(len(sample['passages']) for sample in dataset)
print(count)

2699924
