In [259]:
# Author: Konstantin
# Modified by: Adrielli

import os
import re
import json
import argparse
from typing import List, Tuple
import pandas as pd, ast

# pd.set_option("display.max_colwidth", None)
# pd.set_option("display.max_rows", None)   
# pd.set_option("display.max_columns", None) 

In [260]:
# Run configuration: which text type, extraction model and corpus this notebook
# processes. The model name is part of the input-file pattern; all three values
# are used to name the exported CSV files.
allowed_text_type = 'wed'
allowed_text_model = 'relik-cie-small'
allowed_text_corpus = 'meco'

# Input files are named
#   output_step_<digits>_<model>_<text type>_<anything>.json
# Group 1 captures the step digits, group 2 the text type (no underscores).
# The model name is escaped so characters such as "-" or "." are matched literally.
model_name = re.escape(allowed_text_model)
# The trailing "$" anchors the extension: without it, `.match` only anchors the
# start, so names like "...json.bak" or "...json~" were accepted as step files.
pattern = rf"output_step_(\d+)_{model_name}_([^_]+)_.*\.json$"

filename_re = re.compile(pattern, re.IGNORECASE)

In [261]:
# Scan the working directory and group the step files by text type.
# Result shape: {text_type: [(step_number, file_name), ...]}; only files whose
# (lower-cased) text type equals `allowed_text_type` are kept.
files_by_type = {}
for file_name in os.listdir("."):
    parsed = filename_re.match(file_name)
    if parsed is None:
        # Not one of the step output files.
        continue
    step_num = int(parsed.group(1))
    text_type = parsed.group(2).lower()
    if text_type == allowed_text_type:
        bucket = files_by_type.setdefault(text_type, [])
        bucket.append((step_num, file_name))
print("Files by type:", files_by_type)

Files by type: {'wed': [(1, 'output_step_001_relik-cie-small_WED_0.1_128.json'), (2, 'output_step_002_relik-cie-small_WED_0.1_128.json'), (3, 'output_step_003_relik-cie-small_WED_0.1_128.json'), (4, 'output_step_004_relik-cie-small_WED_0.1_128.json'), (5, 'output_step_005_relik-cie-small_WED_0.1_128.json'), (6, 'output_step_006_relik-cie-small_WED_0.1_128.json'), (7, 'output_step_007_relik-cie-small_WED_0.1_128.json'), (8, 'output_step_008_relik-cie-small_WED_0.1_128.json'), (9, 'output_step_009_relik-cie-small_WED_0.1_128.json'), (10, 'output_step_010_relik-cie-small_WED_0.1_128.json'), (11, 'output_step_011_relik-cie-small_WED_0.1_128.json'), (12, 'output_step_012_relik-cie-small_WED_0.1_128.json'), (13, 'output_step_013_relik-cie-small_WED_0.1_128.json'), (14, 'output_step_014_relik-cie-small_WED_0.1_128.json'), (15, 'output_step_015_relik-cie-small_WED_0.1_128.json'), (16, 'output_step_016_relik-cie-small_WED_0.1_128.json'), (17, 'output_step_017_relik-cie-small_WED_0.1_128.json'),

In [262]:
def _entity_name(entity):
    """Return the display name used for a triplet head or tail.

    A list entity is read positionally: index 2 is used as the canonical name and
    index 3 as the surface form (missing positions fall back to ""). When the
    surface form is non-empty and differs from the canonical name the result is
    "<canonical> | <surface>", otherwise just the canonical name. Any non-list
    entity is converted with ``str``.
    NOTE(review): the meaning of list positions 0 and 1 is not visible here.
    """
    if not isinstance(entity, list):
        return str(entity)
    canonical = entity[2] if len(entity) > 2 else ""
    surface = entity[3] if len(entity) > 3 else ""
    if surface and surface != canonical:
        return f"{canonical} | {surface}"
    return canonical


# Walk the step files in step order and compare each step's triplets with the
# previous step's:
#   rows_all  - one record per step, flagged when the triplet set changed
#   rows_add  - steps that introduce triplets never seen at any earlier step
#   rows_drop - steps where triplets present in the previous step disappeared
rows_add = []
rows_drop = []
rows_all = []
for text_type, files in files_by_type.items():
    files.sort(key=lambda x: x[0])
    prev_set = set()
    prev_unique = []  # previous step's triplets, de-duplicated, in file order
    seen_ever = set()

    for step, file_name in files:
        with open(file_name, encoding="utf-8") as f:
            data = json.load(f)

        simplified_triplets = []
        triplet_scores = []
        for raw in data.get("triplets", []):
            # raw is [head, rel, tail, score]; the score is optional.
            head, rel, tail, *rest = raw
            score = rest[0] if rest else None
            simplified_triplets.append((_entity_name(head), rel, _entity_name(tail)))
            triplet_scores.append(score)

        current_text = data["text"]
        words = current_text.split()
        # Empty or whitespace-only text has no last word; avoid an IndexError.
        current_word = words[-1] if words else ""

        # De-duplicate while keeping file order. Iterating the sets directly
        # would make the order of the added/dropped lists vary between kernel
        # runs (string hashing is randomised), so the CSVs were not reproducible.
        curr_unique = list(dict.fromkeys(simplified_triplets))
        curr_set = set(curr_unique)
        impacted = curr_set != prev_set
        added_trips = [t for t in curr_unique if t not in prev_set]
        dropped_trips = [t for t in prev_unique if t not in curr_set]

        rows_all.append({
            "text_type": text_type,
            "output_step": step,
            "current_word": current_word,
            "triplet_impacted": "yes" if impacted else "no",
            "current_text": current_text,
            "total_triplets": simplified_triplets,
            "triplet_scores": triplet_scores
        })

        # A triplet that was dropped and later re-appears is "added" relative to
        # the previous step but is not reported again as new.
        new_trips = [t for t in added_trips if t not in seen_ever]

        if new_trips:
            seen_ever.update(new_trips)
            rows_add.append({
                "text_type": text_type,
                "output_step": step,
                "current_word": current_word,
                "current_text": current_text,
                "new_triplets": new_trips,
                "total_triplets": simplified_triplets,
                "triplet_scores": triplet_scores,
            })

        if dropped_trips:
            rows_drop.append({
                "text_type": text_type,
                "output_step": step,
                "current_word": current_word,
                "current_text": current_text,
                "dropped_triplets": dropped_trips,
                "total_triplets": simplified_triplets,
                "triplet_scores": triplet_scores,
            })

        prev_set = curr_set
        prev_unique = curr_unique

In [263]:
# Per-step overview table, ordered by text type and step, shown and exported.
# The columns are listed explicitly (same names and order as the row dicts) so
# that an empty `rows_all` still yields a frame with these columns; otherwise
# sort_values raises KeyError on the column-less empty frame.
df_full = (
    pd.DataFrame(
        rows_all,
        columns=[
            "text_type",
            "output_step",
            "current_word",
            "triplet_impacted",
            "current_text",
            "total_triplets",
            "triplet_scores",
        ],
    )
    .sort_values(["text_type", "output_step"])
    .reset_index(drop=True)
)
display(df_full)
df_full.to_csv(f"full_{allowed_text_model}_{allowed_text_corpus}_{allowed_text_type}.csv", index=False)

Unnamed: 0,text_type,output_step,current_word,triplet_impacted,current_text,total_triplets,triplet_scores
0,wed,1,World,no,World,[],[]
1,wed,2,Environment,no,World Environment,[],[]
2,wed,3,Day,no,World Environment Day,[],[]
3,wed,4,(WED),no,World Environment Day (WED),[],[]
4,wed,5,is,no,World Environment Day (WED) is,[],[]
...,...,...,...,...,...,...,...
159,wed,160,plastic,yes,World Environment Day (WED) is celebrated on t...,"[(United Nations | United Nation, statement is...","[0.15000000596046448, 0.10000000149011612, 0.1..."
160,wed,161,in,yes,World Environment Day (WED) is celebrated on t...,"[(United Nations | United Nation, statement is...","[0.15000000596046448, 0.10000000149011612, 0.1..."
161,wed,162,India,yes,World Environment Day (WED) is celebrated on t...,"[(United Nations | United Nation, statement is...","[0.15000000596046448, 0.10000000149011612, 0.1..."
162,wed,163,by,no,World Environment Day (WED) is celebrated on t...,"[(United Nations | United Nation, statement is...","[0.15000000596046448, 0.10000000149011612, 0.1..."


In [264]:
# Steps at which never-before-seen triplets first appear, ordered by step,
# shown and exported. Explicit columns (same names and order as the row dicts)
# keep sort_values from raising KeyError when no step added a new triplet and
# `rows_add` is empty.
additions_df = (
    pd.DataFrame(
        rows_add,
        columns=[
            "text_type",
            "output_step",
            "current_word",
            "current_text",
            "new_triplets",
            "total_triplets",
            "triplet_scores",
        ],
    )
    .sort_values("output_step")
    .reset_index(drop=True)
)
display(additions_df)
additions_df.to_csv(f"additions_{allowed_text_model}_{allowed_text_corpus}_{allowed_text_type}.csv", index=False)

Unnamed: 0,text_type,output_step,current_word,current_text,new_triplets,total_triplets,triplet_scores
0,wed,19,principal,World Environment Day (WED) is celebrated on t...,"[(World Environment Day | WED, conferred by, U...","[(World Environment Day, conferred by, United ...","[0.18000000715255737, 0.17000000178813934]"
1,wed,20,vehicle,World Environment Day (WED) is celebrated on t...,"[(World Environment Day | WED, conferred by, U...","[(World Environment Day, conferred by, United ...","[0.5299999713897705, 0.550000011920929]"
2,wed,24,and,World Environment Day (WED) is celebrated on t...,"[(World Wildlife Day | WED, conferred by, Unit...","[(World Environment Day, conferred by, United ...","[0.3799999952316284, 0.36000001430511475]"
3,wed,51,"pollution,",World Environment Day (WED) is celebrated on t...,"[(Marine pollution | marine pollution, stateme...","[(World Environment Day, conferred by, United ...","[0.47999998927116394, 0.25999999046325684, 0.1..."
4,wed,53,"overpopulation,",World Environment Day (WED) is celebrated on t...,"[(Human overpopulation | human overpopulation,...","[(World Environment Day, conferred by, United ...","[0.6299999952316284, 0.4000000059604645, 0.109..."
5,wed,57,WED,World Environment Day (WED) is celebrated on t...,"[(United Nations | United Nation, significant ...","[(United Nations | United Nation, significant ...","[0.1599999964237213, 0.10999999940395355, 0.12..."
6,wed,61,become,World Environment Day (WED) is celebrated on t...,"[(United Nations | United Nation, statement is...","[(United Nations | United Nation, statement is...","[0.10000000149011612, 0.15000000596046448, 0.1..."
7,wed,78,WED,World Environment Day (WED) is celebrated on t...,"[(World Environment Day | WED, instance of, Wo...","[(Human overpopulation | human overpopulation,...","[0.1899999976158142, 0.3100000023841858, 0.119..."
8,wed,79,chooses,World Environment Day (WED) is celebrated on t...,"[(--NME-- | WED, instance of, --NME-- | WED)]","[(Climate change | global warming, statement i...","[0.20000000298023224, 0.23000000417232513]"
9,wed,87,"organizations,",World Environment Day (WED) is celebrated on t...,"[(--NME-- | marine pollution, statement is sub...","[(--NME-- | marine pollution, statement is sub...","[0.10999999940395355, 0.14000000059604645, 0.6..."


In [265]:
# Steps at which triplets present in the previous step disappeared, ordered by
# step, shown and exported. Explicit columns (same names and order as the row
# dicts) keep sort_values from raising KeyError when nothing was ever dropped
# and `rows_drop` is empty.
deletions_df = (
    pd.DataFrame(
        rows_drop,
        columns=[
            "text_type",
            "output_step",
            "current_word",
            "current_text",
            "dropped_triplets",
            "total_triplets",
            "triplet_scores",
        ],
    )
    .sort_values("output_step")
    .reset_index(drop=True)
)
display(deletions_df)
deletions_df.to_csv(f"deletions_{allowed_text_model}_{allowed_text_corpus}_{allowed_text_type}.csv", index=False)

Unnamed: 0,text_type,output_step,current_word,current_text,dropped_triplets,total_triplets,triplet_scores
0,wed,20,vehicle,World Environment Day (WED) is celebrated on t...,"[(World Environment Day | WED, conferred by, U...","[(World Environment Day, conferred by, United ...","[0.5299999713897705, 0.550000011920929]"
1,wed,21,for,World Environment Day (WED) is celebrated on t...,"[(World Environment Day | WED, conferred by, U...","[(World Environment Day, conferred by, United ...","[0.4000000059604645, 0.3199999928474426]"
2,wed,24,and,World Environment Day (WED) is celebrated on t...,"[(World Environment Day | WED, conferred by, U...","[(World Environment Day, conferred by, United ...","[0.3799999952316284, 0.36000001430511475]"
3,wed,25,action,World Environment Day (WED) is celebrated on t...,"[(World Wildlife Day | WED, conferred by, Unit...","[(World Environment Day, conferred by, United ...","[0.28999999165534973, 0.25999999046325684]"
4,wed,26,for,World Environment Day (WED) is celebrated on t...,"[(World Environment Day | WED, conferred by, U...","[(World Environment Day, conferred by, United ...","[0.30000001192092896, 0.3100000023841858]"
...,...,...,...,...,...,...,...
60,wed,157,eliminate,World Environment Day (WED) is celebrated on t...,"[(Marine plastic pollution | marine pollution,...","[(United Nations | United Nation, statement is...","[0.15000000596046448, 0.10000000149011612, 0.1..."
61,wed,158,all,World Environment Day (WED) is celebrated on t...,"[(Climate change | global warming, has cause, ...","[(United Nations | United Nation, statement is...","[0.15000000596046448, 0.10000000149011612, 0.1..."
62,wed,159,single-use,World Environment Day (WED) is celebrated on t...,"[(Climate change | global warming, has cause, ...","[(United Nations | United Nation, statement is...","[0.15000000596046448, 0.10000000149011612, 0.1..."
63,wed,160,plastic,World Environment Day (WED) is celebrated on t...,[(--NME-- | WED chooses a new theme that major...,"[(United Nations | United Nation, statement is...","[0.15000000596046448, 0.10000000149011612, 0.1..."
