In [3]:
import json
import requests # to download some resources
import os # file operations
import numpy as np # linear algebra
import pandas as pd # data processing

In [4]:
# Consolidate the per-file JSON answers for the whole sample, join them with
# the annotations spreadsheet and export the merged table to Excel.

# Folder holding the JSON response files
folder_json = "../data/4-summary-responses-json/version_yesno_updated"

# One DataFrame per JSON file
dfs = []

# os.listdir() returns entries in arbitrary order. Sorting makes the row order
# (and therefore the 'first' aggregations and the order of the joined answers
# below) reproducible across machines and re-runs.
for file_name in sorted(os.listdir(folder_json)):
    if file_name.endswith(".json"):  # keep JSON files only
        file_path = os.path.join(folder_json, file_name)  # full path to the file

        # Load the JSON file as a DataFrame
        df = pd.read_json(file_path)
        dfs.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder contains no JSON files.
if not dfs:
    raise ValueError(f"No .json files found in {folder_json}")

# Stack all the DataFrames into a single one
combined_data = pd.concat(dfs, ignore_index=True)

# Collapse to one row per (country, question): non-null answers are joined into
# a comma-separated string; the other columns keep the value 'first' selects
# within each group.
combined_data = combined_data.groupby(['country', 'question'], as_index=False).agg({
    'answer': lambda x: ', '.join(x.dropna().astype(str).tolist()),
    'supporting_chunks': 'first',
    'answer_full': 'first',
    'supporting_text': 'first'
})

# Merge keys are compared as strings on both sides of the join
combined_data['question'] = combined_data['question'].astype(str)
combined_data['country'] = combined_data['country'].astype(str)
# Rename so the annotations' own 'answer' column keeps its name after the merge
combined_data = combined_data.rename(columns={"answer": "answer_chunking"})
# Drop the 'period_start' / 'period_end' rows before joining
combined_data = combined_data[~combined_data['question'].isin(['period_start', 'period_end'])]

# Load the annotations
annotations_df = pd.read_excel('../data/5-annotations/annotations_new.xlsx')
annotations_df['country'] = annotations_df['country'].astype(str)  # same dtype as the left-hand key
annotations_df['question'] = annotations_df['question'].astype(str)
annotations_df['answer'] = annotations_df['answer'].astype(str)

# Join with the annotations.
# NOTE: an inner join silently drops (country, question) pairs that are missing
# on either side.
combined_data = combined_data.merge(annotations_df, how='inner', on=['country', 'question'], suffixes=('', '_right'))

output_path = "../data/6-performance/combined_data_new.xlsx"  # Excel file to write
# Create the output folder if needed so the export does not fail on a fresh checkout
os.makedirs(os.path.dirname(output_path), exist_ok=True)
combined_data.to_excel(output_path, index=False)

print(f"Archivo Excel guardado en: {output_path}")


Archivo Excel guardado en: ../data/6-performance/combined_data_new.xlsx


In [5]:
# Preview the first five rows of the merged table as a quick sanity check.
combined_data.head(n=5)

Unnamed: 0,country,question,answer_chunking,supporting_chunks,answer_full,supporting_text,answer,comment_input_for_protocol,supporting_text_when_true_answer_is_yes_but_prediction_is_no
0,australia,1,yes,[the 2020 Strategy and OHMAP emphasise the imp...,yes,The NAP emphasises the importance of tackling ...,yes,,
1,australia,2,yes,[The 2020 Strategy focuses on seven Objectives...,yes,The NAP includes specific actions in the human...,yes,,
2,australia,3,yes,"[Develop, implement and/or update national bio...",yes,The NAP includes specific actions targeting an...,yes,,
3,australia,4,yes,[Create a sustainably funded national One Heal...,yes,The NAP includes actions related to environmen...,yes,,
4,australia,5,yes,"[Develop, implement and/or update national bio...",yes,The NAP includes actions aimed at improving ag...,yes,,
