In [5]:
import os
import re
import json
import pandas as pd


# -------------------------------
# Helper Functions
# -------------------------------

# Directory that the main script lists to find its ``*.jsonl`` input files.
# The analysis-text and CSV output paths are derived from each input path by
# replacing the "/enriched_sample_subset" substring, so renaming this
# directory also requires updating those ``.replace(...)`` calls.
input_directory = "../enriched_sample_subset"

def parse_jsonl(jsonl_path):
    """Load a JSON Lines file into a list of decoded objects.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are ignored. A line that is not valid JSON is reported
    on stdout and skipped, so one bad line does not abort the whole file.

    Args:
        jsonl_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        The decoded JSON values, in file order.
    """
    parsed = []
    with open(jsonl_path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        # filter(None, ...) drops the lines that were blank after stripping.
        for payload in filter(None, stripped_lines):
            try:
                decoded = json.loads(payload)
            except json.JSONDecodeError as e:
                print(f"Skipping a line in {jsonl_path} due to JSON error: {e}")
            else:
                parsed.append(decoded)
    return parsed

def _parse_analysis_block(chunk):
    """Build the record dict for one non-empty, already-stripped text chunk."""
    header = re.search(r'Record\s+(\d+).*?\(Version:\s*([^)]+)\)', chunk, re.IGNORECASE)
    if header is None:
        number, version = None, None
    else:
        number, version = int(header.group(1)), header.group(2).strip()

    verdict = re.search(r'\*\*Judgment:\*\*\s*(.*)', chunk, re.IGNORECASE)
    explanation = re.search(r'\*\*Explanation:\*\*\s*(.*)', chunk, re.IGNORECASE)

    return {
        "record": number,
        "version": version,
        # Without an explanation marker the whole chunk serves as the analysis.
        "analysis": explanation.group(1).strip() if explanation else chunk,
        "judgment": verdict.group(1).strip() if verdict else "Not Found",
        "text": chunk,
    }


def parse_analysis_txt_list(analysis_path):
    """Split an analysis text file into one dict per record block.

    The file is split on runs of five or more dashes. Chunks that are empty
    after stripping are dropped; every other chunk yields a dict with keys:

    * ``record``   - integer after ``Record`` in the header, or ``None``.
    * ``version``  - text inside ``(Version: ...)``, or ``None``.
    * ``analysis`` - text following ``**Explanation:**``; the whole chunk
      when that marker is missing.
    * ``judgment`` - text following ``**Judgment:**``, or ``"Not Found"``.
    * ``text``     - the stripped chunk itself.

    Note: the judgment and explanation patterns use ``(.*)`` without
    ``re.DOTALL``, so each captures only up to the end of the line on which
    the captured text starts.

    Args:
        analysis_path: Path of the UTF-8 encoded analysis text file.

    Returns:
        The record dicts, in file order.
    """
    with open(analysis_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    chunks = (piece.strip() for piece in re.split(r'-{5,}', raw_text))
    return [_parse_analysis_block(chunk) for chunk in chunks if chunk]

# -------------------------------
# Main Script
# -------------------------------

# Every ``*.jsonl`` file found directly inside ``input_directory``.
jsonl_files = [
    os.path.join(input_directory, f)
    for f in os.listdir(input_directory)
    if f.endswith(".jsonl")
]


def _join_items(record, key):
    """Join the list stored under ``key`` with " | "; missing or null gives ""."""
    # ``or []`` covers a key that is present but null, which a plain
    # ``.get(key, [])`` default would not; ``map(str, ...)`` keeps ``join``
    # from raising on non-string items.
    return " | ".join(map(str, record.get(key) or []))


# For each JSONL input, pair its records with the blocks of the matching
# analysis text file (by position) and write one CSV per input file.
for json_file in jsonl_files:
    analysis_file = json_file.replace("/enriched_sample_subset", "/txt_results").replace("_enriched_subsampled.jsonl", "_subsampled_analysis.txt")
    if not os.path.exists(json_file):
        print(f"Skipping {json_file} - file not found.")
        continue
    if not os.path.exists(analysis_file):
        print(f"Skipping {analysis_file} - file not found.")
        continue

    json_records = parse_jsonl(json_file)
    analysis_records = parse_analysis_txt_list(analysis_file)

    # Records are matched purely by position, so a count mismatch means the
    # surplus entries are dropped (and pairs may be misaligned). Report it
    # instead of truncating silently.
    num_pairs = min(len(json_records), len(analysis_records))
    if len(json_records) != len(analysis_records):
        print(
            f"Warning: {json_file} has {len(json_records)} JSONL records but "
            f"{analysis_file} has {len(analysis_records)} analysis blocks; "
            f"pairing only the first {num_pairs} by position."
        )

    csv_rows = []
    # zip stops at the shorter sequence, i.e. after ``num_pairs`` pairs.
    for rec, analysis_data in zip(json_records, analysis_records):
        analysis = analysis_data.get("analysis", "")
        comment = rec.get("Comment", "")
        if comment:
            final_text = f"Analysis of the edit: {analysis} Comment by its editor: '{comment}'"
        else:
            final_text = f"Analysis of the edit: {analysis}"

        csv_rows.append({
            "Source": json_file,
            "Timestamp": rec.get("Timestamp", ""),
            "User": rec.get("User", ""),
            "Comment": comment,
            "Diff": rec.get("Diff", ""),
            "Added_Lines": _join_items(rec, "Added_Lines"),
            "Removed_Lines": _join_items(rec, "Removed_Lines"),
            "Added_Words": _join_items(rec, "Added_Words"),
            "Removed_Words": _join_items(rec, "Removed_Words"),
            "Judgment": analysis_data.get("judgment", ""),
            "Analysis": analysis,
            "final_text": final_text,
        })

    # Create and save DataFrame per file
    df = pd.DataFrame(csv_rows)
    output_csv = json_file.replace("/enriched_sample_subset", "/csv_files").replace("_enriched_subsampled.jsonl", "_subsampled_output.csv")
    # to_csv does not create missing parent directories, so make sure the
    # output directory exists before writing.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"Wrote {len(csv_rows)} rows to {output_csv}")

Wrote 865 rows to ../csv_files/Armenia_subsampled_output.csv
Wrote 547 rows to ../csv_files/Yerevan_subsampled_output.csv
Wrote 542 rows to ../csv_files/Nagorno-Karabakh_subsampled_output.csv
Wrote 513 rows to ../csv_files/Adana_subsampled_output.csv
Wrote 472 rows to ../csv_files/Armenians_subsampled_output.csv
Wrote 358 rows to ../csv_files/Armenian_language_subsampled_output.csv
Wrote 338 rows to ../csv_files/Mount_Ararat_subsampled_output.csv
Wrote 324 rows to ../csv_files/Armenian_genocide_recognition_subsampled_output.csv
Wrote 313 rows to ../csv_files/Shusha_subsampled_output.csv
Wrote 302 rows to ../csv_files/Dolma_subsampled_output.csv
Wrote 272 rows to ../csv_files/Urartu_subsampled_output.csv
Wrote 238 rows to ../csv_files/History_of_Armenia_subsampled_output.csv
Wrote 223 rows to ../csv_files/Armenian_Revolutionary_Federation_subsampled_output.csv
Wrote 220 rows to ../csv_files/Foreign_relations_of_Armenia_subsampled_output.csv
Wrote 187 rows to ../csv_files/Armed_Forces_of