In [5]:
import json
import os

res_file = "../attempts/qwen-2.5/results.json"  # The path of the prediction file (results.json)
ref_file = "../attempts/qwen-2.5/results.json"  # The path of the Ground True file (reference.json)
output_dir = ""  # The path of the output directory
scores_file = os.path.join(output_dir, "scores.json")  # The path of the output file (scores.json)

# Counters
EI_tp = 0
EI_gold_len = 0
EI_pred_len = 0
EC_tp = 0
EC_gold_len = 0
EC_pred_len = 0
RE_GEN_tp = 0
RE_STRICT_tp = 0
RE_gold_len = 0
RE_pre_len = 0
cnt = 0

def safe_div(a, b):
    return round(a / b * 100, 2) if b != 0 else 0.0

def safe_div_(a, b):
    return round(a / b, 2) if b != 0 else 0.0

def compute_f1(cnt, tp, pred_num, gold_num):
    return {
        "Total samples": cnt,
        "P": safe_div(tp, pred_num),
        "R": safe_div(tp, gold_num),
        "F1": safe_div_(2 * safe_div(tp, pred_num) * safe_div(tp, gold_num), safe_div(tp, pred_num) + safe_div(tp, gold_num))
    }

# Load Ground Truth
with open(ref_file, "r", encoding="utf-8") as gt_file:
    Ground_True = json.load(gt_file)

GT = {}
total_gt_entities = 0
total_gt_triples = 0

for doc_id, sample in Ground_True.items():
    mention_gt = sample["entities"]
    total_gt_entities += len(mention_gt)
    triple_gt = sample["triples"]
    total_gt_triples += len(triple_gt)
    
    mentions_gt_list = [set(m["mentions"]) for m in mention_gt]
    metion_type_list = [m["type"] for m in mention_gt]
    triple_gt_list = [(gt["head"], gt["relation"], gt["tail"]) for gt in triple_gt]
    
    GT[doc_id] = {"mentions_GT": mentions_gt_list, "relations_GT": triple_gt_list, "mention_type": metion_type_list}

del Ground_True

# Load Predictions
with open(res_file, "r", encoding="utf-8") as f:
    results = json.load(f)
    cnt = len(results)

total_pred_entities = 0
total_pred_triples = 0

for pre_id, sample in results.items():
    mention_gt_list = GT[pre_id]["mentions_GT"]
    relation_gt_list = GT[pre_id]["relations_GT"]
    type_gt_list = GT[pre_id]["mention_type"]
    
    mention_pred = sample["entities"]
    total_pred_entities += len(mention_pred)
    for i in range(len(mention_pred)):
        mention_pred[i]["mentions"] = set(mention_pred[i]["mentions"])
    
    EI_gold_len += len(mention_gt_list)
    EC_gold_len += len(mention_gt_list)
    EI_pred_len += len(mention_pred)
    EC_pred_len += len(mention_pred)

    for i in range(len(mention_pred)):
        for j in range(len(mention_gt_list)):
            if mention_pred[i]["mentions"] in mention_gt_list:
                EI_tp += 1
                type_idx = mention_gt_list.index(mention_pred[i]["mentions"])
                if mention_pred[i]["type"] == type_gt_list[type_idx]:
                    EC_tp += 1
                    break

    triple_pred = sample["triples"]
    total_pred_triples += len(triple_pred)
    RE_pre_len += len(triple_pred)
    RE_gold_len += len(relation_gt_list)

    for pred in triple_pred:
        pred_triple = (pred["head"], pred["relation"], pred["tail"])
        if pred_triple in relation_gt_list:
            RE_GEN_tp += 1
            RE_STRICT_tp += 1

# Print entity and triple counts
print(f"Ground Truth: {total_gt_entities} entities, {total_gt_triples} triples",ref_file)
print(f"Predictions: {total_pred_entities} entities, {total_pred_triples} triples",res_file)

# Compute F1 Scores
entity_identification_res = compute_f1(cnt, EI_tp, EI_pred_len, EI_gold_len)
entity_classification_res = compute_f1(cnt, EC_tp, EC_pred_len, EC_gold_len)
re_general_res = compute_f1(cnt, RE_GEN_tp, RE_pre_len, RE_gold_len)
re_strict_res = compute_f1(cnt, RE_STRICT_tp, RE_pre_len, RE_gold_len)

# Save results
with open(scores_file, "w", encoding="utf-8") as f:
    json.dump({
        "entity_ident": entity_identification_res["F1"],
        "entity_cla": entity_classification_res["F1"],
        "re_general": re_general_res["F1"],
        "re_strict": re_strict_res["F1"]
    }, f, ensure_ascii=False, indent=4)


KeyError: 'type'

In [8]:
import json

# Read both JSON files
with open('ensemble1/results.json', 'r') as f1, open('rel.json', 'r') as f2:
    data1 = json.load(f1)  # Source of entities (single dictionary)
    data2 = json.load(f2)  # Target to receive entities (single dictionary)

for key, value in data1.items():
    if "entities" in value:
        if data2.get(key):
            if "entities" not in data2[key]:
                data2[key]["entities"] = value["entities"]

with open("rel.json", "w", encoding="utf-8") as f:
    json.dump(data2, f, ensure_ascii=False, indent=4)



In [27]:
import pandas as pd
import json
import ast  # Import for safer JSON parsing

# Load dataset
data_test = pd.read_csv("val_final_processed.csv")
results = {}
# Iterate over the DataFrame
for i, row in data_test.iterrows():
    # Convert string to JSON (handle single & double quotes)
    sample = ast.literal_eval(row["output"].strip())  # More robust than json.loads
    # save the sample to a json file, each sample in a new line, seperate by comma
    results.update(sample)

with open("val.json", "a") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)
    


In [3]:
import json

def compare_json_structure(json1, json2):
    if type(json1) != type(json2):
        return False
    
    if isinstance(json1, dict):
        if json1.keys() != json2.keys():
            return False
        for key in json1.keys():
            if not compare_json_structure(json1[key], json2[key]):
                return False
    elif isinstance(json1, list):
        if len(json1) != len(json2):
            return False
        for item1, item2 in zip(json1, json2):
            if not compare_json_structure(item1, item2):
                return False
    return True

def check_json_files(file1, file2):
    try:
        with open(file1, 'r', encoding='utf-8') as f1, open(file2, 'r', encoding='utf-8') as f2:
            json1 = json.load(f1)
            json2 = json.load(f2)
            return compare_json_structure(json1, json2)
    except Exception as e:
        print(f"Error reading files: {e}")
        return False

# Example usage
file1 = 'val_result.json'  # Replace with your val_result2.json file path
file2 = 'val.json'           # Replace with your val.json file path

if check_json_files(file1, file2):
    print("The JSON files have the same structure.")
else:
    print("The JSON files do not have the same structure.")

The JSON files do not have the same structure.


In [14]:
import json

def check_dict_in_json(file_path):
    try:
        # Load the JSON data from the file
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(len(data))
        # Check if the loaded data is a dictionary
        if isinstance(data, dict):
            # Check if any value in the dictionary is a string
            for key, value in data.items():
                if isinstance(value, str):
                    print(f"The value for key '{key}' is a string: {value}")
                    return False
            print("The JSON file contains a dictionary with no string values.")
            return True
        else:
            print("The JSON file does not contain a dictionary.")
            return False
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

# Example usage
file_path = 'qwen-2.5/results.json'  # Replace with your JSON file path
check_dict_in_json(file_path)

248
The JSON file contains a dictionary with no string values.


True

In [1]:
import json

# Load the data
data = json.load(open('1/results.json', 'r', encoding='utf-8'))

# Process the data
for key, value in data.items():
    if 'entities' in value:
        entities = value['entities']
        # Ensure entities is a list
        if not isinstance(entities, list):
            entities = [entities]
        
        processed_entities = []
        for item in entities:
            # If item is a string, convert it to a dictionary with a 'mentions' key
            if isinstance(item, str):
                processed_entities.append({'mentions': item})
            # If item is a dictionary, ensure it has a 'mentions' key
            elif isinstance(item, dict):
                if 'mentions' not in item:
                    item['mentions'] = ''  # or some default value
                processed_entities.append(item)
            # If item is a list, handle each element (you might need to adjust this based on your data structure)
            elif isinstance(item, list):
                processed_entities.extend([{'mentions': x} if isinstance(x, str) else x for x in item])
        
        # Update the entities with processed data
        value['entities'] = processed_entities

# Save the processed data to a new JSON file
with open('qwen-2.5/results.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("Data processing complete. Saved to processed_results.json")

Data processing complete. Saved to processed_results.json


In [None]:
def get_mention_list(head_entity, mtlist):
    """
    head_entity: Could be a string or a list of strings
    mtlist: List of mentions to check against
    """
    # Convert head_entity to a list if it isn't already
    if not isinstance(head_entity, list):
        head_entity = [head_entity]
    
    # Convert mtlist to a set for faster lookups if it's large
    mtlist_set = set(mtlist)
    
    # Find all mentions that match any of the head_entity items
    matches = []
    for entity in head_entity:
        if entity in mtlist_set:
            matches.append(entity)
    
    return matches

In [7]:
import json
import os
from collections import defaultdict

def read_json_file(file_path):
    """Reads a JSON file and returns its content, handling errors."""
    if not os.path.exists(file_path):
        print(f"Warning: File not found - {file_path}. Skipping.")
        return None
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            if not isinstance(data, dict):
                print(f"Warning: File content is not a dictionary - {file_path}. Skipping.")
                return None
            return data
    except json.JSONDecodeError:
        print(f"Warning: Invalid JSON format - {file_path}. Skipping.")
        return None
    except Exception as e:
        print(f"Warning: Error reading file {file_path}: {str(e)}. Skipping.")
        return None

def combine_documents(input_files):
    """Combines entities and triples for each document ID across multiple files, deduplicating within each document."""
    combined_docs = defaultdict(lambda: {
        'title': None, 
        'entities': set(),  # Store hashable representations for deduplication
        'triples': set()    # Store hashable representations for deduplication
    })
    valid_input_files = []
    processed_doc_ids = set()

    print("Processing input files...")
    for file_path in input_files:
        data = read_json_file(file_path)
        if data:
            print(f"  - Processing {os.path.basename(file_path)}")
            valid_input_files.append(os.path.basename(file_path))
            for doc_id, doc in data.items():
                processed_doc_ids.add(doc_id)
                
                # Store title (use first encountered)
                if combined_docs[doc_id]['title'] is None:
                    combined_docs[doc_id]['title'] = doc.get('title', f'Title not found in source files for {doc_id}')

                # Add entities (deduplicated by set)
                entities = doc.get('entities', [])
                for entity in entities:
                    try:
                        # Use frozenset for mentions to handle order difference
                        mentions_key = frozenset(entity.get('mentions', []))
                        entity_type = entity.get('type', '')
                        entity_key = (mentions_key, entity_type)
                        combined_docs[doc_id]['entities'].add(entity_key)
                    except TypeError:
                        print(f"    Warning: Skipping entity in doc {doc_id} due to unhashable mention: {entity.get('mentions')}")
                
                # Add triples (deduplicated by set)
                triples = doc.get('triples', [])
                for triple in triples:
                    try:
                        head = triple.get('head', '')
                        relation = triple.get('relation', '')
                        tail = triple.get('tail', '')
                        triple_key = (head, relation, tail)
                        combined_docs[doc_id]['triples'].add(triple_key)
                    except TypeError:
                        print(f"    Warning: Skipping triple in doc {doc_id} due to unhashable part: {triple}")

    # Convert sets back to lists of dictionaries for the final output
    final_output_data = {}
    total_entities_after = 0
    total_triples_after = 0
    print("\nConsolidating results...")
    for doc_id, combined_info in combined_docs.items():
        final_entities = []
        for entity_key in combined_info['entities']:
            mentions_set, entity_type = entity_key
            final_entities.append({
                'mentions': sorted(list(mentions_set)),  # Store consistently sorted
                'type': entity_type
            })
        
        final_triples = []
        for triple_key in combined_info['triples']:
            head, relation, tail = triple_key
            final_triples.append({
                'head': head,
                'relation': relation,
                'tail': tail
            })
            
        final_output_data[doc_id] = {
            'title': combined_info['title'],
            'entities': sorted(final_entities, key=lambda x: (x['type'], x['mentions'][0] if x['mentions'] else '')),  # Sort for consistency
            'triples': sorted(final_triples, key=lambda x: (x['head'], x['relation'], x['tail']))  # Sort for consistency
        }
        total_entities_after += len(final_entities)
        total_triples_after += len(final_triples)

    report = {
        "processed_files": valid_input_files,
        "unique_documents_combined": len(processed_doc_ids),
        "total_unique_entities_across_docs": total_entities_after,
        "total_unique_triples_across_docs": total_triples_after
    }

    return report, final_output_data

def save_combined_data(data, output_file):
    """Saves the combined document data to a JSON file."""
    try:
        output_dir = os.path.dirname(output_file)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)
            print(f"Created output directory: {output_dir}")
            
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"\n✅ Combined document data successfully saved to: {output_file}")
    except Exception as e:
        print(f"\n❌ Error saving combined data to {output_file}: {str(e)}")

def print_report(report):
    """Prints the summary report for document combination."""
    print("\n=== Document Combination Report ===")
    print(f"Processed Files ({len(report['processed_files'])}):")
    for filename in report['processed_files']:
        print(f"  - {filename}")
    print(f"Unique Documents Combined: {report['unique_documents_combined']}")
    print(f"Total Unique Entities (across all docs): {report['total_unique_entities_across_docs']}")
    print(f"Total Unique Triples (across all docs): {report['total_unique_triples_across_docs']}")
    print("=================================")

# Example usage in Jupyter Notebook:
if __name__ == "__main__":
    # Replace these with your actual file paths
    input_files = [
        "deepseek/results.json",
        "llama-3.3-70b-versatile/results.json",
        "qwen-2.5/results.json",
        # Add more files as needed
    ]
    
    output_file = "ensemble1/results.json"  # Change this to your desired output path
    
    # Process the files
    report, combined_data = combine_documents(input_files)
    
    # Print the report
    print_report(report)
    
    # Save the output
    save_combined_data(combined_data, output_file)
    
    # The combined data is also available in the combined_data variable

Processing input files...
  - Processing results.json
  - Processing results.json
  - Processing results.json

Consolidating results...

=== Document Combination Report ===
Processed Files (3):
  - results.json
  - results.json
  - results.json
Unique Documents Combined: 248
Total Unique Entities (across all docs): 11906
Total Unique Triples (across all docs): 10247

✅ Combined document data successfully saved to: ensemble1/results.json


In [8]:
# read the json file
with open("ensemble1/results.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# print the data
print(len(data))



248
