In [None]:
%pip install jsonlines

clear_output()

In [None]:
import os
import subprocess
import threading
from IPython.display import clear_output
import jsonlines
import requests
import json

In [None]:
# Install necessary packages
# The curl command downloads Ollama's install script and pipes it straight into sh.
!sudo apt-get install -y pciutils
!curl -fsSL https://ollama.com/install.sh | sh  # Download Ollama API

# Launch the Ollama API server as a background process, kicked off from a helper thread
def ollama():
    """Export the server settings, then spawn ``ollama serve`` without waiting for it."""
    server_settings = {
        'OLLAMA_HOST': '0.0.0.0:11434',  # 0.0.0.0 is the wildcard address (not localhost), port 11434
        'OLLAMA_ORIGINS': '*',  # Allow all origins
    }
    for setting_name, setting_value in server_settings.items():
        os.environ[setting_name] = setting_value

    # The Popen handle is intentionally discarded; the server keeps running on its own.
    subprocess.Popen(["ollama", "serve"])

ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()

# clear_output()

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
pciutils is already the newest version (1:3.7.0-6).
0 upgraded, 0 newly installed, 0 to remove and 4 not upgraded.
>>> Cleaning up old version at /usr/local/lib/ollama
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
############################################################################################# 100.0%
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:
# Download the model weights used for inference, then hide the download log.
# NOTE(review): classify_entities requests the model "gemma2", which is not pulled
# here; pull it as well (or change the requested model) before running that step.
# !ollama pull llama3.1:70b
!ollama pull phi4
clear_output()

In [None]:
# Mount Google Drive at /content/drive so the dataset and result folders are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch to the project folder on the mounted Drive and list its contents.
%cd /content/drive/MyDrive/SemEval_Task10
%ls

/content/drive/MyDrive/SemEval_Task10
[0m[01;34mdev-documents_4_December[0m/                [01;34msubtask-1-results[0m/          training_data.jsonl
[01;34msemeval2025task10-scorers-baselines-v2[0m/  [01;34mtarget_4_December_release[0m/
SemEval-Task10-Subtask01.ipynb           [01;34mtestdata_ST12[0m/


In [None]:
def load_data(language_folder):
    """
    Build one record per annotated entity mention found in a language folder.

    The folder must contain:
      * ``subtask-1-documents/``: one text file per article. File names that
        contain '(' are skipped.
      * ``subtask-1-entity-mentions.txt``: tab-separated rows of
        ``article id, entity name, start index, end index``.

    Args:
        language_folder: Path of the language folder (e.g. ".../RU").

    Returns:
        A list of dicts with the keys "article_id" (the article's file name),
        "article_text", "entity_name", "start_index" and "end_index" (ints).
        Articles follow ``os.listdir`` order; mentions keep their file order.

    Raises:
        ValueError: If a mention row of a loaded article has fewer than four
            fields or non-integer indices.
    """
    articles_folder = os.path.join(language_folder, "subtask-1-documents")
    entity_file = os.path.join(language_folder, "subtask-1-entity-mentions.txt")

    def with_txt_suffix(name):
        # Ids are compared with a ".txt" suffix so "EN_1" and "EN_1.txt" match.
        return name if name.endswith(".txt") else name + ".txt"

    # Read the mentions file once and index its rows by article id. Matching on
    # the whole id (instead of a line prefix) keeps rows of "A10" out of "A1".
    mentions_by_article = {}
    with open(entity_file, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.strip().split('\t')
            if not fields[0]:
                continue  # blank line
            mentions_by_article.setdefault(with_txt_suffix(fields[0]), []).append(fields)

    data = []
    for article_file in os.listdir(articles_folder):
        if '(' in article_file:
            continue

        mentions = mentions_by_article.get(with_txt_suffix(article_file))
        if not mentions:
            continue  # nothing to classify for this article

        article_path = os.path.join(articles_folder, article_file)
        with open(article_path, 'r', encoding='utf-8') as file:
            article_text = file.read()

        for fields in mentions:
            # Rows are parsed only for loaded articles, so a malformed row that
            # belongs to no article cannot abort the run.
            if len(fields) < 4:
                raise ValueError(f"Malformed entity mention row for {article_file}: {fields!r}")
            data.append({"article_id": article_file,
                         "article_text": article_text,
                         "entity_name": fields[1],
                         "start_index": int(fields[2]),
                         "end_index": int(fields[3])})

    return data

In [None]:
def classify_entities(article_id, article_text, entity_name, api_url="http://0.0.0.0:11434",
                      model="gemma2", timeout=None):
    """
    Ask an Ollama model to classify one entity of one article.

    Sends a single non-streaming request to ``{api_url}/api/generate`` with a
    JSON schema that requires "primary_role" (string) and "fine_grained_roles"
    (array of strings).

    Args:
        article_id: Article identifier, quoted in the prompt and in log messages.
        article_text: Full article text, embedded in the prompt.
        entity_name: Entity to classify.
        api_url: Base URL of the Ollama server.
        model: Name of the Ollama model to query. It must be available on the server.
        timeout: Passed to ``requests.post``; None means wait indefinitely.

    Returns:
        The value of the reply's "response" field (the model output, expected to
        be a JSON string), or None when the request fails, the status code is
        not 200, or the reply cannot be read. Failures are printed, not raised.
    """
    # print(f"{article_id} {entity_name}")
    prompt = (
        f"Given the article with articleID {article_id}:\n'{article_text}'\n"
        f"and the entity: {entity_name}, classify the entity into one of the following **primary roles**:\n"
        f"- 'Protagonist'\n"
        f"- 'Antagonist'\n"
        f"- 'Innocent'\n\n"
        f"The classification must reflect the author's sentiment toward the entity as expressed in the article.\n\n"
        f"Next, assign one or more **fine-grained roles**, strictly chosen from the list associated with the assigned primary role:\n"
        f"- **Protagonist**: ['Guardian', 'Martyr', 'Peacemaker', 'Rebel', 'Underdog', 'Virtuous']\n"
        f"- **Antagonist**: ['Instigator', 'Conspirator', 'Tyrant', 'Foreign Adversary', 'Traitor', 'Spy', "
        f"'Saboteur', 'Corrupt', 'Incompetent', 'Terrorist', 'Deceiver', 'Bigot']\n"
        f"- **Innocent**: ['Forgotten', 'Exploited', 'Victim', 'Scapegoat']\n\n"
        f"**Important Requirements:**\n"
        f"1. Assign exactly one **primary role** ('Protagonist', 'Antagonist', or 'Innocent').\n"
        f"2. Assign one or more **fine-grained roles**, strictly from the associated list above.\n"
        f"3. Do not invent or use roles that are not listed above.\n"
        f"4. Do not leave the primary role or fine-grained roles empty or undefined.\n\n"
        f"**Failure Examples:**\n"
        f"- Assigning a primary role not listed (e.g., 'Neutral').\n"
        f"- Assigning fine-grained roles not listed (e.g., 'Aggressor', 'Fascist Leader', 'Extremist', 'Scam', 'Expansionist', 'Imperialist', 'Military', 'Propagandists', etc.\n"
        f"- Leaving the fine-grained roles empty.\n\n"
        f"Make sure that the classification strictly follows these rules. Your response should only include the assigned **primary role** and the corresponding **fine-grained roles**."
    )

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        # JSON schema the model output has to follow.
        "format": {
            "type": "object",
            "properties": {
                "primary_role": {"type": "string"},
                "fine_grained_roles": {"type": "array", "items": {"type": "string"}},
            },
            "required": ["primary_role", "fine_grained_roles"],
        },
    }

    try:
        response = requests.post(url=f"{api_url}/api/generate", json=payload, timeout=timeout)
    except requests.RequestException as e:
        # A dropped connection or timeout should cost one entity, not the whole run.
        print(f"API call failed for {article_id}, Error: {e}")
        return None

    if response.status_code != 200:
        print(f"API call failed for {article_id}, Status code: {response.status_code}")
        return None

    try:
        output = response.json()
        return output['response']
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body is not JSON; KeyError/TypeError: no "response" field.
        print(f"Failed for {article_id}")
        print("Error:", e)
    return None

In [None]:
def process_language(language_folder, base_output_path):
    """
    Classify every entity mention of one language and save the results.

    Loads the mentions with ``load_data``, sends one ``classify_entities``
    request per mention, and writes the successful classifications to
    ``<base_output_path>/<language_code>/<language_code>_results.jsonl``.

    Mentions whose request fails, or whose model output is not a JSON object,
    are reported and skipped so one bad reply does not discard the whole run.

    Args:
        language_folder: Folder of one language; its last path component is
            used as the language code.
        base_output_path: Root folder for results; created if missing.
    """
    # normpath drops a trailing separator, so ".../RU/" still yields "RU".
    language_code = os.path.basename(os.path.normpath(language_folder))
    language_output_path = os.path.join(base_output_path, language_code)
    os.makedirs(language_output_path, exist_ok=True)

    data = load_data(language_folder)  # Load articles and entities with indices
    results = []
    skipped = 0

    for item in data:
        article_id = item["article_id"]
        entity_name = item["entity_name"]

        # One request per entity mention.
        raw_result = classify_entities(article_id, item["article_text"], entity_name)
        if not raw_result:
            skipped += 1
            continue

        try:
            record = json.loads(raw_result)
        except json.JSONDecodeError as e:
            print(f"Invalid JSON from model for {article_id} / {entity_name}: {e}")
            skipped += 1
            continue
        if not isinstance(record, dict):
            print(f"Unexpected model output for {article_id} / {entity_name}: {record!r}")
            skipped += 1
            continue

        record["articleID"] = article_id
        record["entity_name"] = entity_name
        record["start_index"] = item["start_index"]
        record["end_index"] = item["end_index"]

        # Kept as a JSON string: jsonlines encodes each item again, so every
        # output line is a JSON-encoded string that readers must decode twice.
        results.append(json.dumps(record))

    # Save results to a JSON Lines file
    output_file = os.path.join(language_output_path, f"{language_code}_results.jsonl")
    with jsonlines.open(output_file, mode='w') as writer:
        writer.write_all(results)

    print(f"Processed {language_code}, results saved to {output_file}")
    if skipped:
        print(f"Skipped {skipped} of {len(data)} entity mentions")


In [None]:
def main():
    """Run the per-entity classification for each selected test language."""
    data_root = "/content/drive/MyDrive/SemEval_Task10/testdata_ST12/"
    results_root = "/content/drive/MyDrive/SemEval_Task10/testdata_ST12/results"

    # Full language set: ["EN", "BG", "HI", "PT", "RU"]
    selected_languages = ("RU",)

    for language_code in selected_languages:
        process_language(os.path.join(data_root, language_code), results_root)

main()

Processed RU, results saved to /content/drive/MyDrive/SemEval_Task10/testdata_ST12/results/RU/RU_results.jsonl


In [None]:
import json
import re

def clean_up_result(input_file, output_file):
    """
    Convert a JSON Lines results file into tab-separated lines.

    Each non-blank input line is expected to hold one classification with the
    keys "articleID", "entity_name", "start_index", "end_index",
    "primary_role" and "fine_grained_roles". A line may be either a JSON
    object or a JSON-encoded string containing that object (double-encoded).

    Each output line is
    ``article_id, entity_name, start_index, end_index, primary_role, role...``
    joined by tabs; ".txt" is appended to the article id when it is missing.

    Lines that cannot be decoded or lack a required key are skipped and
    counted. The number of written lines is printed at the end.

    Args:
        input_file: Path of the JSON Lines file to read.
        output_file: Path of the tab-separated file to write (overwritten).
    """
    numLines = 0   # lines written to output_file
    skipped = 0    # non-blank lines that could not be converted
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            line = line.strip()
            if not line:
                continue

            try:
                entity_data = json.loads(line)
                # Double-encoded lines decode to a string first; decode once more.
                if isinstance(entity_data, str):
                    entity_data = json.loads(entity_data)

                # Extract required fields
                raw_id = entity_data["articleID"]
                article_id = raw_id + ".txt" if ".txt" not in raw_id else raw_id
                entity_name = entity_data["entity_name"]
                start_index = entity_data["start_index"]
                end_index = entity_data["end_index"]
                primary_role = entity_data["primary_role"]
                fine_grained_roles = entity_data["fine_grained_roles"]
                if isinstance(fine_grained_roles, str):
                    # A bare string would otherwise be joined character by character.
                    fine_grained_roles = [fine_grained_roles]

                output_line = (
                    f"{article_id}\t{entity_name}\t{start_index}\t{end_index}\t{primary_role}\t"
                    + "\t".join(str(role) for role in fine_grained_roles)
                    + "\n"
                )
            except (json.JSONDecodeError, KeyError, TypeError):
                skipped += 1
                continue

            outfile.write(output_line)
            numLines += 1

    print(f"DONE with {numLines} lines")
    if skipped:
        print(f"Skipped {skipped} malformed lines")

# Convert the raw model output of one language into the tab-separated layout.
# The cleaned file is written next to the raw results file.
language = "RU"
input_file=f"/content/drive/MyDrive/SemEval_Task10/testdata_ST12/results/{language}/{language}_results.jsonl"
output_file=f"/content/drive/MyDrive/SemEval_Task10/testdata_ST12/results/{language}/{language}_cleaned.txt"
clean_up_result(input_file, output_file)


DONE with 214 lines


In [None]:
from collections import Counter

def clean_results_file(file_path):
    """
    Validate the roles in a tab-separated results file and rewrite it in place.

    Each line is ``article_id, entity_name, start_index, end_index,
    primary_role, fine_grained_role...`` separated by tabs.

    * Primary role: a value outside the valid set is repaired when it matches
      a valid role after dropping non-letters and ignoring case (for example
      "**antagonist**"); otherwise it is replaced by "Innocent".
    * Fine-grained roles: each value is matched the same way and written back
      in its canonical spelling, so ">Tyrant" becomes "Tyrant" and
      "Foreign Adversary" is kept. Values with no match are dropped and
      duplicates are removed.

    Every value that was not already an exact valid role counts as a
    correction. Blank lines are dropped. A summary is printed.

    Args:
        file_path: Path of the file to clean; it is overwritten.
    """
    fine_grained_roles = {
        'Guardian', 'Martyr', 'Peacemaker', 'Rebel', 'Underdog', 'Virtuous',
        'Instigator', 'Conspirator', 'Tyrant', 'Foreign Adversary', 'Traitor', 'Spy', 'Saboteur',
        'Corrupt', 'Incompetent', 'Terrorist', 'Deceiver', 'Bigot',
        'Forgotten', 'Exploited', 'Victim', 'Scapegoat'
    }
    valid_primary_roles = {'Protagonist', 'Antagonist', 'Innocent'}

    def letters_only(text):
        # Comparison key: lower-case letters only, so stray punctuation,
        # spacing and capitalisation do not prevent a match.
        return re.sub(r'[^a-z]', '', text.lower())

    # Keys are built with the same function as the lookups, so multi-word roles
    # such as 'Foreign Adversary' match themselves.
    canonical_fine = {letters_only(role): role for role in fine_grained_roles}
    canonical_primary = {letters_only(role): role for role in valid_primary_roles}

    corrections = 0  # Total number of corrections made
    filtered_roles_counter = Counter()  # Raw values that were repaired or dropped
    cleaned_lines = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.strip().split('\t')
            if fields == ['']:
                continue  # blank line

            # Check and correct invalid primary roles
            if len(fields) > 4 and fields[4] not in valid_primary_roles:
                fields[4] = canonical_primary.get(letters_only(fields[4]), "Innocent")
                corrections += 1

            # Check, repair and filter fine-grained roles
            filtered_roles = []
            for role in fields[5:]:
                if role not in fine_grained_roles:
                    corrections += 1
                    filtered_roles_counter[role] += 1
                canonical = canonical_fine.get(letters_only(role))
                if canonical is not None and canonical not in filtered_roles:
                    filtered_roles.append(canonical)

            # Create cleaned line
            cleaned_lines.append('\t'.join(fields[:5] + filtered_roles))

    # Save cleaned data back to the same file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(cleaned_lines) + '\n')

    # Display correction summary
    print(f"Cleaned results saved back to {file_path}")
    print(f"Total corrections made: {corrections}")
    print("Filtered roles and their counts:")
    for role, count in filtered_roles_counter.items():
        print(f"- {role}: {count}")

# Validate the roles in the cleaned file of one language; the file is rewritten in place.
language = "RU"
file_path = f"/content/drive/MyDrive/SemEval_Task10/testdata_ST12/results/{language}/{language}_cleaned.txt"
clean_results_file(file_path)

Cleaned results saved back to /content/drive/MyDrive/SemEval_Task10/testdata_ST12/results/RU/RU_cleaned.txt
Total corrections made: 16
Filtered roles and their counts:
- Innocent: 1
- Propagandists: 2
- ],[Tyrant: 1
- ]: 1
- >Tyrant: 1
- ,: 1
- Pressure: 1
- Agent: 1


In [None]:
# Show which models are available locally.
!ollama list

NAME            ID              SIZE      MODIFIED       
phi4:latest     ac896e5b8b34    9.1 GB    13 seconds ago    
llama3.1:70b    711a9e8463af    42 GB     23 minutes ago    


In [None]:
# !pip install ollama
# !pip install tiktoken
# clear_output()
# import os
# import json
# import jsonlines
# import requests
# import ollama

In [None]:
# import tiktoken
# import requests

# def count_tokens_tiktoken(prompt, model="gpt-4"):
#     """
#     Count tokens using OpenAI's tiktoken, compatible with Llama3.
#     """
#     enc = tiktoken.encoding_for_model(model)  # Use GPT-4 tokenizer for Llama3
#     return len(enc.encode(prompt))

In [None]:
def load_aggregated_results(file_paths):
    """
    Load predictions from multiple models and structure them for meta-classification.

    Every input file holds tab-separated lines of
    ``article_id, entity_name, start_index, end_index, primary_role,
    fine_grained_role...``. Lines with fewer than five fields (including blank
    lines) carry no prediction and are ignored.

    Args:
        file_paths: Iterable of paths, one prediction file per model.

    Returns:
        A dict keyed by ``(article_id, entity_name, start_index, end_index)``
        (all strings, as read from the files). Each value is a dict with
        "primary_roles" (one entry per file containing the key) and
        "fine_grained_roles" (all roles from all files, duplicates kept so
        that repeated votes stay visible).
    """
    aggregated_data = {}

    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as infile:
            for line in infile:
                fields = line.strip().split('\t')
                if len(fields) < 5:
                    continue  # blank or incomplete line

                article_id, entity_name, start_index, end_index, primary_role = fields[:5]
                key = (article_id, entity_name, start_index, end_index)

                entry = aggregated_data.setdefault(
                    key, {"primary_roles": [], "fine_grained_roles": []}
                )
                entry["primary_roles"].append(primary_role)
                entry["fine_grained_roles"].extend(fields[5:])

    return aggregated_data

In [None]:
def meta_classification(article_id, article_text, entity_name, votes, fine_grained_votes,
                        api_url="http://0.0.0.0:11434", model="phi4", timeout=None):
    """
    Ask an Ollama model for the final classification of one entity, given the
    predictions collected from several models.

    Sends a single non-streaming request to ``{api_url}/api/generate`` with a
    JSON schema that requires "primary_role" (string) and "fine_grained_roles"
    (array of strings).

    Args:
        article_id: Article identifier, quoted in the prompt and in log messages.
        article_text: Full article text, embedded in the prompt.
        entity_name: Entity to classify.
        votes: Primary roles predicted by the individual models.
        fine_grained_votes: Fine-grained roles predicted by the individual models.
        api_url: Base URL of the Ollama server.
        model: Name of the Ollama model to query. It must be available on the server.
        timeout: Passed to ``requests.post``; None means wait indefinitely.

    Returns:
        The value of the reply's "response" field (the model output, expected to
        be a JSON string), or None when the request fails, the status code is
        not 200, or the reply cannot be read. Failures are printed, not raised.
    """
    prompt = (
        f"Given the article with ID: {article_id}:\n'{article_text}'\n\n"
        f"The entity: {entity_name} has been classified by multiple models as:\n"
        f"Primary Role Options: {votes}\n"
        f"Fine-Grained Role Options: {fine_grained_votes}\n\n"
        f"Based on the given article and model predictions, determine the most appropriate primary_role and fine_grained_roles for each entity\n"
        f"Make sure to account for any role that occur multiple times in different models' predictions and give higher emphasis to those.\n\n"
        f"**Primary Roles:**\n"
        f"- **Protagonist**: ['Guardian', 'Martyr', 'Peacemaker', 'Rebel', 'Underdog', 'Virtuous']\n"
        f"- **Antagonist**: ['Instigator', 'Conspirator', 'Tyrant', 'Foreign Adversary', 'Traitor', 'Spy', "
        f"'Saboteur', 'Corrupt', 'Incompetent', 'Terrorist', 'Deceiver', 'Bigot']\n"
        f"- **Innocent**: ['Forgotten', 'Exploited', 'Victim', 'Scapegoat']\n\n"
        f"**Important Requirements:**\n"
        f"1. Assign exactly one **primary role** ('Protagonist', 'Antagonist', or 'Innocent').\n"
        f"2. Assign one or more **fine-grained roles**, strictly from the associated list above.\n"
        f"3. Do not invent or use roles that are not listed above.\n"
        f"4. Do not leave the primary role or fine-grained roles empty or undefined.\n\n"
        f"**Failure Examples:**\n"
        f"- Assigning a primary role not listed (e.g., 'Neutral').\n"
        f"- Assigning fine-grained roles not listed (e.g., 'Aggressor', 'Fascist Leader', 'Extremist', 'Scam', 'Expansionist', 'Imperialist', 'Military', 'Propagandist', etc.\n"
        f"- Leaving the fine-grained roles empty.\n\n"
        f"Make sure that the classification strictly follows these rules. Your response should only include the assigned **primary role** and the corresponding **fine-grained roles**."
    )

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        # JSON schema the model output has to follow.
        "format": {
            "type": "object",
            "properties": {
                "primary_role": {"type": "string"},
                "fine_grained_roles": {"type": "array", "items": {"type": "string"}}
            },
            "required": ["primary_role", "fine_grained_roles"]
        }
    }

    try:
        response = requests.post(url=f"{api_url}/api/generate", json=payload, timeout=timeout)
    except requests.RequestException as e:
        # A dropped connection or timeout should cost one entity, not the whole run.
        print(f"API call failed for {article_id}, Error: {e}")
        return None

    if response.status_code != 200:
        print(f"API call failed for {article_id}, Status code: {response.status_code}")
        return None

    try:
        output = response.json()
        return output['response']
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body is not JSON; KeyError/TypeError: no "response" field.
        print(f"Failed for {article_id}: {entity_name}")
        print("Error:", e)

    return None

In [None]:
def process_final_classification(language_folder, base_output_path, file_paths):
    """
    Processes all entities using meta-classification and saves the final results.

    Aggregates the per-model predictions with ``load_aggregated_results``,
    asks ``meta_classification`` for one final decision per entity mention,
    and writes the decisions as JSON objects (one per line) to
    ``<base_output_path>/<language_code>_final_results.jsonl``.

    Mentions whose request fails, or whose model output is not a JSON object,
    are reported and skipped.

    Args:
        language_folder: Folder of one language; it must contain
            "subtask-1-documents", and its last path component is used as the
            language code.
        base_output_path: Folder for the final results; created if missing.
        file_paths: Per-model prediction files to aggregate.
    """
    # normpath drops a trailing separator, so ".../EN/" still yields "EN".
    language_code = os.path.basename(os.path.normpath(language_folder))
    os.makedirs(base_output_path, exist_ok=True)

    aggregated_data = load_aggregated_results(file_paths)
    final_results = []
    article_cache = {}  # article_id -> text, so each article is read only once
    skipped = 0

    for (article_id, entity_name, start_index, end_index), data in aggregated_data.items():
        votes = data["primary_roles"]
        fine_grained_votes = data["fine_grained_roles"]

        # Load the article text
        if article_id not in article_cache:
            article_path = os.path.join(language_folder, "subtask-1-documents", article_id)
            with open(article_path, 'r', encoding='utf-8') as file:
                article_cache[article_id] = file.read()
        article_text = article_cache[article_id]

        # Use large model for final classification
        final_result = meta_classification(article_id, article_text, entity_name, votes, fine_grained_votes)
        if not final_result:
            skipped += 1
            continue

        try:
            final_result = json.loads(final_result)
        except json.JSONDecodeError as e:
            print(f"Invalid JSON from model for {article_id} / {entity_name}: {e}")
            skipped += 1
            continue
        if not isinstance(final_result, dict):
            print(f"Unexpected model output for {article_id} / {entity_name}: {final_result!r}")
            skipped += 1
            continue

        final_result["articleID"] = article_id
        final_result["entity_name"] = entity_name
        # The indices stay strings, exactly as read from the prediction files.
        final_result["start_index"] = start_index
        final_result["end_index"] = end_index
        final_results.append(final_result)

    # Save results to a JSON Lines file
    output_file = os.path.join(base_output_path, f"{language_code}_final_results.jsonl")
    with jsonlines.open(output_file, mode='w') as writer:
        writer.write_all(final_results)

    print(f"Final classifications for {language_code} saved to {output_file}")
    if skipped:
        print(f"Skipped {skipped} of {len(aggregated_data)} entity mentions")

In [None]:
def main():
    """Run the meta-classification for one language over the per-model prediction files."""
    data_root = "/content/drive/MyDrive/SemEval_Task10/testdata_ST12/"
    final_output_dir = "/content/drive/MyDrive/SemEval_Task10/testdata_ST12/Test_Results_Final"

    language = "EN"

    # One cleaned prediction file per model, in voting order.
    model_dirs = ("Gemma2", "Llama3.1-8b", "Mistral-7b", "Phi4")
    prediction_files = [
        f"/content/drive/MyDrive/SemEval_Task10/testdata_ST12/Test_Results/{model_dir}/{language}_cleaned.txt"
        for model_dir in model_dirs
    ]

    process_final_classification(
        os.path.join(data_root, language),
        final_output_dir,
        prediction_files,
    )

main()