In [None]:
!pip install transformers accelerate bitsandbytes

In [None]:
import json
import os
import re
from typing import Any, Dict, List, Optional, Tuple

import torch
import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, GenerationConfig

In [None]:
# Colab-only: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def load_data(file_path):
    """
    Load one JSON dataset file and prepare it for entity extraction.

    The file is expected to hold a JSON array of records (dicts).

    Args:
        file_path: Path to the JSON file.

    Returns:
        A tuple ``(texts, unique_labels, file_name)`` where
        - ``texts`` is a list of ``{"id": ..., "text": ...}`` dicts; ``id`` is the
          record's "title" (or ``doc_<index>`` when absent) and ``text`` is its
          "doc" field (or "" when absent),
        - ``unique_labels`` is the set of all values found in the records'
          "entity_label_set" lists,
        - ``file_name`` is the file's base name without its extension.
    """
    # Read as UTF-8 explicitly: the notebook writes its JSON outputs as UTF-8
    # with ensure_ascii=False, so relying on the platform default encoding
    # could mis-decode non-ASCII text.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    texts = [
        {"id": record.get("title", f"doc_{i}"), "text": record.get("doc", "")}
        for i, record in enumerate(data)
    ]

    unique_labels = {
        label
        for record in data
        for label in record.get("entity_label_set", [])
    }

    print(f"Loaded {len(texts)} texts.")
    file_name = os.path.splitext(os.path.basename(file_path))[0]

    return texts, unique_labels, file_name

In [None]:
# Build the text-generation pipeline once and run a short smoke-test prompt.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Call the factory through the `transformers` module rather than through the
# bare imported name: the result is stored in the global `pipeline` (which
# extract_entities() reads), so `pipeline = pipeline(...)` would call the
# already-built Pipeline object instead of the factory when this cell is
# executed a second time.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# Smoke test: a tiny chat to confirm the model loads and generates.
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Stop on either the tokenizer's EOS token or the "<|eot_id|>" token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
# The last element of "generated_text" is the newly generated message.
print(outputs[0]["generated_text"][-1])

In [None]:
def extract_entities(text: str, labels: list) -> dict:
    """
    Prompt the LLaMA chat pipeline to extract the entities of interest from `text`.

    Args:
        text: The document text; sent verbatim as the user message.
        labels: Entity labels interpolated into the system prompt as the schema.
            A set is sorted into a list first, because set iteration order is
            not stable between interpreter sessions and would otherwise make
            the prompt non-reproducible. Any other iterable keeps its order.

    Returns:
        The last message of the generated conversation, exactly as returned by
        the pipeline. The model's answer is NOT parsed here; callers read the
        raw answer text from the message's "content" key.
    """
    if isinstance(labels, (set, frozenset)):
        schema = sorted(labels, key=str)
    else:
        schema = list(labels)

    messages = [
      {"role": "system", "content": f"You are an expert in Named Entity Recognition. Please extract entities that match the schema definition from the input. Return an empty list if the entity type does not exist. Please respond in the format of a JSON string.\", \"schema\": {schema}"},
      {"role": "user", "content": text},
    ]

    # Stop on either the tokenizer's EOS token or the "<|eot_id|>" token.
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    # NOTE(review): do_sample=True means repeated runs can yield different
    # answers for the same text; no seed is set in this function.
    outputs = pipeline(
        messages,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    return outputs[0]["generated_text"][-1]

In [None]:
def convert_model_output_to_json(
    title: str,
    model_output: str,
    output_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Extracts a JSON object from a model output string and returns it as a Python dict.
    If `output_path` is provided, also appends ``{title: <parsed JSON>}`` to the
    JSON list stored in that file (creating the file if needed).

    Args:
        title: Key under which the parsed JSON is stored in the output file.
        model_output: The raw string returned by the model, containing a JSON snippet.
        output_path: Optional path (including '.json') to save the extracted JSON.

    Returns:
        A Python dict representing the JSON data.

    Raises:
        ValueError: If no ``{...}`` span can be located in `model_output`, or
            if the located span is not valid JSON.
    """
    # Prefer a fenced ``` / ```json block; otherwise fall back to the span
    # from the first '{' to the last '}'.
    fence_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", model_output, re.DOTALL)
    if fence_match:
        json_str = fence_match.group(1)
    else:
        start = model_output.find('{')
        end = model_output.rfind('}')
        # `end < start` also covers end == -1 (no '}' at all) and a '}' that
        # only appears before the first '{'.
        if start == -1 or end < start:
            raise ValueError("No JSON object found in the model output.")
        json_str = model_output[start:end + 1]

    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to parse JSON: {e}") from e

    new_entry = {title: data}

    if output_path:
        if os.path.exists(output_path):
            with open(output_path, 'r', encoding='utf-8') as f:
                try:
                    existing_data = json.load(f)
                    # A file holding a single object is promoted to a list.
                    if not isinstance(existing_data, list):
                        existing_data = [existing_data]
                except json.JSONDecodeError:
                    # Unreadable existing content is discarded and replaced.
                    existing_data = []
            existing_data.append(new_entry)
            data_to_write = existing_data
        else:
            data_to_write = [new_entry]

        print(f"Writing JSON to {output_path}")
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data_to_write, f, ensure_ascii=False, indent=4)

    return data



In [None]:
# Run entity extraction over every JSON file under `base_folder` and append
# each document's parsed result to one output file per input file.
base_folder = "/content/drive/MyDrive/project_files/data/raw/dev"
output_folder = "/content/drive/MyDrive/project_files/data/processed/baseline_output"
os.makedirs(output_folder, exist_ok=True)

for root, dirs, files in os.walk(base_folder):
    for filename in files:
        # load_data() parses the file as JSON, so skip anything else
        # (e.g. hidden files the OS drops into the folder).
        if not filename.lower().endswith(".json"):
            continue
        path = os.path.join(root, filename)

        texts, label_set, file_name = load_data(path)
        print(file_name)

        # Same target file for every document of this input file.
        output_file = f"{output_folder}/{file_name}.json"

        for item in texts:
            doc_id = item["id"]
            text = item["text"]

            result = extract_entities(text, label_set)

            try:
                convert_model_output_to_json(doc_id, result["content"], output_file)
            except Exception as exc:
                # Best effort: keep the raw answer when it cannot be parsed.
                # `Exception` (not a bare except) lets Ctrl-C interrupt the run.
                print(f"Could not convert {doc_id}. Saving raw conetent")
                print(f"  reason: {exc}")
                # Titles may contain path separators or other characters that
                # are not valid in a file name.
                safe_name = re.sub(r'[\\/:*?"<>|]', "_", str(doc_id))
                # NOTE(review): the raw text is stored with a .json extension
                # in the same folder as the parsed outputs; anything that
                # json.load()s every file in that folder will trip over it.
                with open(f"{output_folder}/{safe_name}.json", "w", encoding="utf-8") as f:
                    f.write(result["content"])


In [None]:
class NEREvaluator:
    """
    Exact-match scorer for the entity mentions of a single document.

    Ground-truth and predicted entities are both flattened into
    ``(mention_text, entity_type)`` pairs; a prediction counts as correct only
    when an identical pair is still unmatched in the ground truth.
    """

    def __init__(
        self,
        text: str,
        gt_entities: List[Dict[str, Any]],
        predicted_entities: List[Dict[str, str]],
    ):
        """
        Args:
            text: The document text (stored, not used for scoring).
            gt_entities: Ground-truth entities; each has a "type" and an
                optional "mentions" list of surface strings.
            predicted_entities: Predicted entities; each has "text" and "type".
        """
        self.text = text

        # One pair per ground-truth mention string.
        self.gt_mentions: List[Tuple[str, str]] = [
            (mention, ent["type"])
            for ent in gt_entities
            for mention in ent.get("mentions", [])
        ]

        self.pred_mentions: List[Tuple[str, str]] = [
            (ent["text"], ent["type"]) for ent in predicted_entities
        ]

    def evaluate(self, verbose: bool = False) -> Dict[str, float]:
        """
        Compute precision, recall and F1 over the mention pairs.

        Args:
            verbose: When True, print both mention lists before scoring
                (debug aid; off by default to keep notebook output small).

        Returns:
            A dict with "precision", "recall" and "f1"; each is 0.0 when its
            denominator would be zero.
        """
        if verbose:
            print("check mention sets")
            print(self.gt_mentions)
            print(self.pred_mentions)

        # Multiset matching: every ground-truth pair can be consumed by at
        # most one prediction, so duplicate predictions are not double-counted.
        remaining = self.gt_mentions.copy()
        tp = 0
        for m in self.pred_mentions:
            if m in remaining:
                tp += 1
                remaining.remove(m)

        fp = len(self.pred_mentions) - tp
        fn = len(self.gt_mentions) - tp

        precision = tp / (tp + fp) if tp + fp > 0 else 0.0
        recall    = tp / (tp + fn) if tp + fn > 0 else 0.0
        f1        = (2 * precision * recall / (precision + recall)
                     if precision + recall > 0 else 0.0)

        return {"precision": precision, "recall": recall, "f1": f1}

In [None]:
# Score every prediction file against the ground-truth file of the same name
# and write per-document metrics plus their average.
base_folder = "project_files/data/processed/baseline_output"
gt_folder = "project_files/data/raw/dev"

for root, dirs, files in os.walk(base_folder):
    for file_name in files:
        gt_path = os.path.join(gt_folder, file_name)
        if not os.path.exists(gt_path):
            # Files in the prediction folder without a same-named
            # ground-truth file cannot be scored.
            print(f"No ground-truth file for {file_name}; skipping")
            continue

        with open(gt_path, "r", encoding="utf-8") as f:
            gt = json.load(f)

        try:
            with open(os.path.join(root, file_name), encoding="utf-8") as f:
                preds = json.load(f)
        except json.JSONDecodeError as exc:
            print(f"{file_name} is not valid JSON ({exc}); skipping")
            continue

        # Index ground truth by title once; setdefault keeps the first entry
        # when a title occurs more than once.
        gt_by_title = {}
        for entry in gt:
            gt_by_title.setdefault(entry.get("title"), entry)

        results = []

        for p in preds:
            try:
                # Each prediction is expected to be {doc_id: {"entities": [...]}}.
                doc_id = list(p.keys())[0]
                pred = p[doc_id]["entities"]

                gt_entry = gt_by_title.get(doc_id)
                if gt_entry is None:
                    print(f"No matching entry found for {doc_id}")
                    continue

                evaluator = NEREvaluator(
                    gt_entry["doc"],
                    gt_entry["entities"],
                    pred
                )
                metrics = evaluator.evaluate()
            except (AttributeError, IndexError, KeyError, TypeError) as exc:
                # Malformed model output: report it instead of dropping the
                # document silently, then carry on with the rest.
                print(f"Skipping malformed prediction entry: {exc!r}")
                continue

            results.append({doc_id: metrics})

        # Collect the per-document metrics before the average is appended.
        per_doc = [metrics for result in results for metrics in result.values()]
        count = len(per_doc)

        if count > 0:
            avg_metrics = {
                "average_precision": sum(m["precision"] for m in per_doc) / count,
                "average_recall": sum(m["recall"] for m in per_doc) / count,
                "average_f1": sum(m["f1"] for m in per_doc) / count
            }
            results.append({"average_metrics": avg_metrics})

        with open(f"project_files/data/processed/baseline_{file_name}", "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=4)

        print(count)

check mention sets
[('Joseph Paul Forgas', 'PERSON'), ('Forgas', 'PERSON'), ('early 1990s', 'DATE'), ('mood', 'MISC'), ('negative affect', 'MISC'), ('affective states', 'MISC'), ('Cognitive capacity', 'MISC'), ('substantive processing', 'MISC'), ('Affect infusion model (AIM)', 'MISC'), ('human psychology', 'MISC'), ('affect infusion', 'MISC'), ('mood congruent', 'MISC'), ('Positive affect', 'MISC'), ('mood states', 'MISC'), ('affective processing', 'MISC'), ('cognition', 'MISC'), ('systematic processing', 'MISC'), ('Mood', 'MISC'), ('affect (mood and emotion)', 'MISC'), ('influence of mood on cognition', 'MISC'), ('Affect', 'MISC'), ('information', 'MISC'), ('affect', 'MISC'), ('processing strategy', 'MISC'), ('moods', 'MISC'), ('information processing', 'MISC'), ('Affect infusion model', 'MISC'), ('model', 'MISC'), ('process information', 'MISC'), ('Affect infusion', 'MISC'), ('mood incongruence', 'MISC'), ('cognitive processing', 'MISC'), ('mood congruence', 'MISC'), ('heuristic proc