In [1]:
import json
import re

import re

def get_token_indices(text, ann_start, ann_end):
    """
    Map a character span onto whitespace-token indices.

    Tokens are the maximal runs of non-whitespace characters in ``text``,
    numbered from 0 in order of appearance.

    Args:
        text: The string the character offsets refer to.
        ann_start: Start offset of the annotated span (inclusive).
        ann_end: End offset of the annotated span (exclusive).

    Returns:
        The ascending list of indices of every token that shares at least
        one character with the span:
          - For a single-token span: [i]
          - For a multi-token span: [i, j, k, ...]
        The list is empty when the span touches no token (for example when
        it covers only whitespace or ``text`` is empty).
    """
    # Overlap, not full containment: whitespace tokens keep their attached
    # punctuation ("phone,"), so a span drawn around just "phone" ends before
    # the token does. Requiring containment would drop that token and leave
    # the span with no indices at all.
    return [
        i
        for i, token in enumerate(re.finditer(r'\S+', text))
        if token.start() < ann_end and token.end() > ann_start
    ]

def _collect_results(item):
    """Gather the "result" entries of every annotation, then of every draft."""
    results = []
    for source_key in ("annotations", "drafts"):
        # `or []` tolerates a key that is present but holds None.
        for entry in item.get(source_key) or []:
            results.extend(entry.get("result") or [])
    return results


def _split_results(results):
    """Split result entries into span annotations (keyed by id) and relations."""
    label_dict = {}
    relation_items = []
    for res in results:
        res_type = res.get("type")
        if res_type == "relation":
            # Polarity comes from the first label of the form "sent:<value>";
            # it stays None when the relation carries no such label.
            polarity = None
            for lab in res.get("labels") or []:
                if lab.startswith("sent:"):
                    polarity = lab.split("sent:")[-1]
                    break
            relation_items.append({
                "from_id": res.get("from_id"),
                "to_id": res.get("to_id"),
                "polarity": polarity,
            })
        elif res_type == "labels":
            value = res.get("value") or {}
            labels_list = value.get("labels") or []
            # "aspect" takes precedence when a span carries both labels.
            if "aspect" in labels_list:
                ann_type = "aspect"
            elif "opinion" in labels_list:
                ann_type = "opinion"
            else:
                continue
            ann_start = value.get("start")
            ann_end = value.get("end")
            if ann_start is None or ann_end is None:
                continue
            label_dict[res.get("id")] = {
                "type": ann_type,
                "start": ann_start,
                "end": ann_end,
                "text": value.get("text"),
            }
    return label_dict, relation_items


def process_tweet(item):
    """
    Convert one annotated item into a single Span-ASTE formatted line.

    Args:
        item: A dict with a "data" dict holding the text ("cleaned_full_text"
            is preferred, "full_text" is the fallback) and optional
            "annotations" / "drafts" lists whose entries carry a "result"
            list of span ("labels") and "relation" records.

    Returns:
        ``<text>#### #### ####<triplets>`` where ``<text>`` is the tweet with
        its whitespace-separated tokens joined by single spaces and
        ``<triplets>`` is the Python literal of a list of
        ``(aspect_token_indices, opinion_token_indices, polarity)`` tuples.

    Triplets come first from relation records, then from pairing the
    remaining aspects and opinions in order of their start offset with the
    default polarity "POS". Leftover spans without a partner are dropped.
    """
    data = item.get("data") or {}
    raw_text = data.get("cleaned_full_text") or data.get("full_text") or ""

    # Emit the tokens joined by single spaces: a tweet may contain line
    # breaks, which would otherwise split one record across several lines of
    # the output file. Token order and count are unchanged, so the token
    # indices computed below still address the emitted text.
    tweet_text = " ".join(re.findall(r"\S+", raw_text))

    label_dict, relation_items = _split_results(_collect_results(item))

    def span_tokens(ann):
        # Resolve offsets against the text exactly as stored: trimming it
        # first would shift every offset by the amount of leading whitespace.
        # NOTE(review): assumes the offsets were recorded against this same
        # text field - confirm against the labeling configuration.
        return get_token_indices(raw_text, ann["start"], ann["end"])

    used_ids = set()
    triplets = []

    def add_triplet(aspect_item, opinion_item, polarity):
        triplet = (span_tokens(aspect_item), span_tokens(opinion_item), polarity)
        # The same relation can appear in both an annotation and a draft;
        # keep only one copy of an identical triplet.
        if triplet not in triplets:
            triplets.append(triplet)

    # First, form triplets using relation items.
    for rel in relation_items:
        from_id = rel["from_id"]
        to_id = rel["to_id"]
        if from_id not in label_dict or to_id not in label_dict:
            continue
        used_ids.add(from_id)
        used_ids.add(to_id)
        aspect_item = label_dict[from_id]
        opinion_item = label_dict[to_id]
        # A relation drawn from the opinion to the aspect is the same pair in
        # the other direction; put each span in its proper slot.
        if aspect_item["type"] == "opinion" and opinion_item["type"] == "aspect":
            aspect_item, opinion_item = opinion_item, aspect_item
        add_triplet(aspect_item, opinion_item, rel["polarity"])

    # Next, pair the spans that no relation referenced, in text order.
    unpaired = [ann for key, ann in label_dict.items() if key not in used_ids]
    unpaired_aspects = sorted(
        (ann for ann in unpaired if ann["type"] == "aspect"),
        key=lambda ann: ann["start"],
    )
    unpaired_opinions = sorted(
        (ann for ann in unpaired if ann["type"] == "opinion"),
        key=lambda ann: ann["start"],
    )
    for aspect_item, opinion_item in zip(unpaired_aspects, unpaired_opinions):
        add_triplet(aspect_item, opinion_item, "POS")

    # Text and triplet literal are separated by "#### #### ####".
    return tweet_text + "#### #### ####" + str(triplets)

def convert_json_to_spanaste_format(json_file, output_file):
    """
    Convert a JSON file of annotated items into a Span-ASTE text file.

    Args:
        json_file: Path of the UTF-8 JSON file to read; its top-level value
            is iterated and each element is passed to ``process_tweet``.
        output_file: Path of the UTF-8 text file to write (overwritten if it
            exists); it receives one formatted line per element.
    """
    with open(json_file, "r", encoding="utf-8") as source:
        tasks = json.load(source)

    with open(output_file, "w", encoding="utf-8") as sink:
        # The generator is consumed lazily, so lines are written one by one.
        sink.writelines(process_tweet(task) + "\n" for task in tasks)

if __name__ == "__main__":
    # Source export and destination file for this conversion run.
    input_json, output_txt = (
        "TJ - self_training/batch 5/batch 5.json",
        "TJ - self_training//batch 5/spanaste_format.txt",
    )
    convert_json_to_spanaste_format(input_json, output_txt)
    print("Conversion complete. Output saved to", output_txt)


Conversion complete. Output saved to TJ - self_training//batch 5/spanaste_format.txt
