In [1]:
import os
import json
from pathlib import Path
import itertools

# os.path.abspath('') is the process's current directory; resolve it and make
# it the working directory so the relative dataset/output paths below are
# interpreted against one well-defined, printed location.
notebook_dir = Path(os.path.abspath('')).resolve()
os.chdir(notebook_dir)
print(f"Current working directory: {os.getcwd()}")

# Language codes as they appear in the dataset file names. This list is the
# single place to edit when adding a language (e.g. append 'se' if Swedish
# data is available): every pair below is derived from it.
languages = ['en', 'de', 'es']

# Ordered (context_lang, question_lang) pairs: all cross-language
# combinations first, then the same-language pairs in `languages` order.
language_pairs = list(itertools.permutations(languages, 2))
language_pairs += [(lang, lang) for lang in languages]

print("Processing the following language pairs:")
for context_lang, question_lang in language_pairs:
    print(f"- Context: {context_lang}, Questions: {question_lang}")

# Input location and output roots, all relative to the working directory.
base_dir = Path('dataset/MLQA_V1/dev')
questions_base = Path('questions')  # All QA-pair JSON files go here
corpus_base = Path('corpus')        # Per-pair context markdown folders go here

# Report the resolved input location up front so a wrong working directory
# is visible before any file is processed.
print(f"Looking for files in: {base_dir.absolute()}")
print(f"Base directory exists: {base_dir.exists()}")

def _extract_qa_pairs(data):
    """Return one evaluation record per answerable question in *data*.

    Walks data -> paragraphs -> qas. A question is kept only when it has at
    least one answer and both the question text and the first answer's text
    are non-empty after whitespace clean-up. Only the first answer is used
    as the reference.
    """
    records = []
    for entry in data.get('data', []):
        for para in entry.get('paragraphs', []):
            for qa in para.get('qas', []):
                answers = qa.get('answers', [])
                if not answers:
                    continue
                question = qa.get('question', '').replace('\n', ' ').strip()
                answer_text = answers[0].get('text', '').strip()
                if question and answer_text:
                    records.append({
                        'user_input': question,
                        'reference': answer_text,
                        # Left empty here; nothing in this script fills them.
                        'response': "",
                        'retrieved_contexts': ""
                    })
    return records


def _group_contexts_by_title(data):
    """Return {title: [context, ...]} for every non-empty paragraph context.

    Newlines inside titles and contexts are flattened to spaces and the
    result is stripped; entries without a title are grouped as 'Untitled'.
    """
    grouped = {}
    for entry in data.get('data', []):
        title = entry.get('title', 'Untitled').replace('\n', ' ').strip()
        for para in entry.get('paragraphs', []):
            context = para.get('context', '').replace('\n', ' ').strip()
            if context:
                grouped.setdefault(title, []).append(context)
    return grouped


def _safe_filename(title, used):
    """Return a filesystem-safe, not-yet-used file stem for *title*.

    Keeps alphanumerics, spaces, '-' and '_' and turns spaces into '_'.
    Because that mapping is lossy, distinct titles can produce the same stem
    (or an empty one); *used* (a set, updated in place) records the stems
    already handed out so later duplicates get a numeric suffix instead of
    overwriting an earlier file. Comparison is case-insensitive so the
    result is also safe on case-insensitive filesystems.
    """
    kept = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_'))
    stem = kept.rstrip().replace(' ', '_') or 'Untitled'
    candidate = stem
    suffix = 2
    while candidate.casefold() in used:
        candidate = f'{stem}_{suffix}'
        suffix += 1
    used.add(candidate.casefold())
    return candidate


# Convert each (context language, question language) dev file into a QA-pair
# JSON file plus one markdown file per article title.
for context_lang, question_lang in language_pairs:
    source_file = f'dev-context-{context_lang}-question-{question_lang}.json'
    json_path = base_dir / source_file

    print(
        f"\nProcessing {context_lang} contexts with {question_lang} questions")
    print(f"Reading from: {source_file}")

    # A missing pair is reported and skipped rather than aborting the run.
    if not json_path.exists():
        print(f'File not found: {json_path}')
        continue

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    qa_pairs = _extract_qa_pairs(data)

    # Write QA pairs for this language pair to the shared questions folder
    questions_base.mkdir(parents=True, exist_ok=True)
    qa_file = questions_base / f'qa_pairs_{context_lang}-{question_lang}.json'
    with open(qa_file, 'w', encoding='utf-8') as f:
        json.dump(qa_pairs, f, ensure_ascii=False, indent=2)

    print(f'Extracted for {context_lang}-{question_lang}:')
    print(f'- {len(qa_pairs)} question-answer pairs saved to {qa_file}')

    # Write contexts - one file per title directly in corpus/{pair}
    corpus_pair_dir = corpus_base / f"{context_lang}-{question_lang}"
    corpus_pair_dir.mkdir(parents=True, exist_ok=True)

    contexts_by_title = _group_contexts_by_title(data)

    # Stems already written for this pair, so colliding titles get distinct
    # files instead of silently replacing one another.
    used_names = set()
    for title, contexts in contexts_by_title.items():
        safe_title = _safe_filename(title, used_names)
        context_file = corpus_pair_dir / f'{safe_title}.md'
        # Contexts are already newline-free and stripped, so a plain join
        # yields the single-line document body.
        all_text = " ".join(contexts)
        with open(context_file, 'w', encoding='utf-8') as f:
            f.write(f'{title}: {all_text}\n')

Current working directory: /Users/bdornauer/git-projects/rag-evaluation/lang-RAG-dataset
Processing the following language pairs:
- Context: en, Questions: de
- Context: en, Questions: es
- Context: de, Questions: en
- Context: de, Questions: es
- Context: es, Questions: en
- Context: es, Questions: de
- Context: en, Questions: en
- Context: de, Questions: de
- Context: es, Questions: es
Looking for files in: /Users/bdornauer/git-projects/rag-evaluation/lang-RAG-dataset/dataset/MLQA_V1/dev
Base directory exists: True

Processing en contexts with de questions
Reading from: dev-context-en-question-de.json
Extracted for en-de:
- 512 question-answer pairs saved to questions/qa_pairs_en-de.json

Processing en contexts with es questions
Reading from: dev-context-en-question-es.json
Extracted for en-es:
- 500 question-answer pairs saved to questions/qa_pairs_en-es.json

Processing de contexts with en questions
Reading from: dev-context-de-question-en.json
Extracted for de-en:
- 512 question-a