In [42]:
! pip install -q ollama openai python-dotenv instructor jsonschema requests dotmap scikit-learn nltk
import os
from dotenv import load_dotenv
import nltk
from nltk.tokenize import sent_tokenize
from openai import OpenAI

import re
from typing import List, Literal
from pydantic import BaseModel, Field

from ollama import chat
import json

from sklearn.metrics import classification_report


# load OPENAI_API_KEY from environment variables
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
print('OpenAI API key loaded', openai_api_key[-5:])
gemini_api_key = os.getenv("GEMINI_API_KEY")
print('Gemini API key loaded', gemini_api_key[-5:])
llama_cloud_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
print('Llama Cloud API key loaded', llama_cloud_api_key[-5:])

nltk.download('punkt')
nltk.download('punkt_tab')

OpenAI API key loaded fJjIA
Gemini API key loaded _37Uw
Llama Cloud API key loaded HHuxU


[nltk_data] Downloading package punkt to /Users/rudie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/rudie/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Import ollama and verify it works (setup)


In [67]:
clients = {
  'openai': OpenAI(api_key=openai_api_key),
  'gemini': OpenAI(base_url='https://generativelanguage.googleapis.com/v1beta/openai/', api_key=gemini_api_key),
  'ollama': OpenAI(base_url='http://localhost:11434/v1', api_key='ollama'),
  'llamacloud': OpenAI(base_url='https://api.llamacloud.co', api_key=llama_cloud_api_key)
}
# models = ['llama3.1', 'gpt-4o-mini']
models = ['llama-3.2-8b', 'llama-3.3-70b', 'mistral-nemo','gpt-4o-mini', 'gpt-4o-2024-08-06']
# models = ['gpt-4o-mini']


model2client = {
  'aya': 'ollama',
  'llama-3.2-8b': 'llamacloud',
  'llama-3.3-70b': 'llamacloud',
  'mistral-nemo': 'llamacloud',
  'gpt-4o-mini': 'openai',
  'gpt-4o-2024-08-06': 'openai'
}

def get_client(model_name):
  return clients[model2client[model_name]]

In [64]:
def predict(model, messages, schema):
    if model2client[model] == 'ollama':
        response = chat(
            messages=messages,
            model=model,
            format=schema.model_json_schema()
        )
        response = schema.model_validate_json(response['message']['content'])
    elif model2client[model] == 'openai': 
        client = get_client(model)
        response = client.beta.chat.completions.parse(
            model=model,
            messages=messages,
            response_format=schema,
        )
        response = schema.model_validate_json(response.choices[0].message.content)
    elif model2client[model] == 'llamacloud':
        client = get_client(model)
        response = client.beta.chat.completions.parse(
            model=model,
            messages=messages,
            response_format=schema,
            max_tokens=4096,
        )
        response = schema.model_validate_json(response.choices[0].message.content)
    return response

# extract to utils.py
def resolve_fstrings(messages, context):
    pattern = re.compile(r'\{\{(\w+)\}\}')
    new_messages = []
    for msg in messages:
        content = msg.get("content", "")
        new_content = pattern.sub(lambda m: str(context[m.group(1)]), content)
        new_msg = msg.copy()
        new_msg["content"] = new_content
        new_messages.append(new_msg)
    return new_messages

assert resolve_fstrings([{"role": "system", "content": "Hello, {{name}}!"}], {"name": "world"}) == [{"role": "system", "content": "Hello, world!"}]


# Load the data

In [111]:
class Case(BaseModel):
    mask: str = Field(...)
    grammatical_cases: List[Literal[
        "nominative",
        "genitive",
        "dative",
        "accusative",
        "instrumental",
        "locative",
        "vocative"
    ]]


class Gender(BaseModel):
    grammatical_gender: Literal["masculine", "feminine", "any"]
    mask: str = Field(...)


class GendersWithCasesSchema(BaseModel):
    cases: List[Case]
    genders: List[Gender]


def parse_text_to_genders_with_cases(text: str) -> GendersWithCasesSchema:
    """
    Scans for patterns of the form:
      [<mask>|<grammatical_gender>|<grammatical_case>]
    in the given text, and returns a structure with:
      - a 'cases' list, each containing a 'mask' and a list of 'grammatical_cases'
      - a 'genders' list, each containing a 'mask' and a 'grammatical_gender'
    """

    # Regex that captures three groups:
    #   1) The mask (any text without '|' or ']')
    #   2) The gender (masculine|feminine)
    #   3) The case (nominative|genitive|...|vocative)
    pattern = r"\[([^\|\]]+)\|(masculine|feminine|any)\|(nominative|genitive|dative|accusative|instrumental|locative|vocative)\]"
    matches = re.findall(pattern, text)

    # We'll accumulate grammatical cases by mask
    mask_to_cases: dict[str, List[str]] = {}
    # We'll store a single gender per mask
    mask_to_gender: dict[str, str] = {}

    for mask, gender, grammatical_case in matches:
        # Append the grammatical case to the list for this mask
        if mask not in mask_to_cases:
            mask_to_cases[mask] = []
        mask_to_cases[mask].append(grammatical_case)

        # Record the gender if we haven't seen this mask yet.
        # (Assuming repeated masks always have the same gender.)
        if mask not in mask_to_gender:
            mask_to_gender[mask] = gender

    # Build Case objects from aggregated data
    all_cases: List[Case] = []
    for mask, cases in mask_to_cases.items():
        all_cases.append(
            Case(
                mask=mask,
                grammatical_cases=cases
            )
        )

    # Build Gender objects
    all_genders: List[Gender] = [
        Gender(
            mask=mask,
            grammatical_gender=gender,
        )
        for mask, gender in mask_to_gender.items()
    ]

    return GendersWithCasesSchema(cases=all_cases, genders=all_genders)


In [112]:
test_data_folder = "./court_cases/"
true_data_folder = "./court_cases_labeled/"

# Read test data
import os

test_data = []
for file in os.listdir(test_data_folder):
    with open(test_data_folder + file, 'r') as f:
        test_data.append(f.read())

# Read true data
true_data = []
for file in os.listdir(true_data_folder):
    with open(true_data_folder + file, 'r') as f:
        true_data.append(f.read())

assert len(test_data) == len(true_data)
assert len(test_data) > 0
print(true_data[0][:100])


# Parse true data
true_labels = {}

for idx, data in enumerate(true_data):
    true_labels[idx] = parse_text_to_genders_with_cases(data)

assert len(true_labels) == len(true_data)

print(true_labels[0])


Справа № 685/5/24
Провадження № 2/685/77/24 

У Х В А Л А
(про залишення позовної заяви без руху)

 
cases=[Case(mask='ОСОБА_1', grammatical_cases=['genitive', 'genitive', 'genitive']), Case(mask='ОСОБА_2', grammatical_cases=['genitive', 'genitive', 'genitive']), Case(mask='ОСОБА_3', grammatical_cases=['genitive']), Case(mask='ОСОБА_4', grammatical_cases=['nominative']), Case(mask='ОСОБА_5', grammatical_cases=['nominative'])] genders=[Gender(grammatical_gender='feminine', mask='ОСОБА_1'), Gender(grammatical_gender='feminine', mask='ОСОБА_2'), Gender(grammatical_gender='masculine', mask='ОСОБА_3'), Gender(grammatical_gender='feminine', mask='ОСОБА_4'), Gender(grammatical_gender='feminine', mask='ОСОБА_5')]


# Running predictions for both gender and cases at once

In [58]:
def run_prediction(template, schema, limit=None, verbose=False):
    y_pred = {}

    for model in models:
        y_pred[model] = {}

    for model in models:
        y_pred[model] = {}
        for idx, data in enumerate(test_data if limit is None else test_data[:limit]):
            entities = [entity.mask for entity in true_labels[idx].genders]
            
            data = {
                "entities": ', '.join(entities),
                "text": data
            }


            global messages
            print(data)
            messages = resolve_fstrings(template, data)

            if verbose:
                print(messages)

            response = predict(model, resolve_fstrings(template, data), schema)

            y_pred[model][idx] = response
    
    return y_pred

In [59]:
zeroshot_messages = [
    {
      "role": "system",
      "content": '''
        You are a professional linguist trained to determine the grammatical gender and grammatical case of entities in a Ukrainian text.

        You will receive:
        1. a list of entities and with number of their occurrences in the text.
        2. a text in Ukrainian containing an entity labeled as 'ОСОБА_'.

        Your task is to identify the grammatical gender and case of this entity, considering:
        1. The context of the text. E.g. a person can be a parent, verbs can have endings that indicates the gender.
        2. Interactions with other entities.
        3. Dependencies on other words and entities.
        4. Words surrounding the entity.

        Possible grammatical genders: 'masculine', 'feminine', 'any'. Any means that you can't determine but it can be either masculine or feminine.
        Possible cases:
        1. 'nominative' for the subject. Answers the question 'хто? що?'
        2. 'genitive' for possession. Answers the question 'кого? чого?'
        3. 'dative' for the indirect object. Answers the question 'кому? чому?'
        4. 'accusative' for the direct object. Answers the question 'кого? що?'
        5. 'instrumental' for the means. Answers the question 'ким? чим?'
        6. 'locative' for location. Answers the question 'на кому? на чому?'
        7. 'vocative' for addressing someone. 

        Rules:
        1. Make sure to cover all occurrences of the entity in the text.
        2. Group occurrences by their mask.
        
        Use these hints to determine the entity’s grammatical gender and case in all its occurrences.

        Ensure to cover all entities and their occurrences in the text.
      '''
    },
    {
      "role": "user",
      "content": '''
        Please, detect all entities from this list: {entities}.
        Court decision: 
        
        ---

        {text}

        ---
        
        Only output valid JSON with no additional commentary.

      '''
    }
  ]

In [60]:
oneshot_messages = [
    {
        "role": "system",
        "content": '''
        You are a professional linguist trained to determine the grammatical gender and grammatical case of entities in a Ukrainian text.

        You will receive:
        1. A list of entities and the number of their occurrences in the text.
        2. A text in Ukrainian containing an entity labeled as 'ОСОБА_'.

        Your task is to identify the grammatical gender and case of this entity, considering:
        1. The context of the text. E.g., a person can be a parent, verbs can have endings that indicate the gender.
        2. Interactions with other entities.
        3. Dependencies on other words and entities.
        4. Words surrounding the entity.

        Possible grammatical genders: 'masculine', 'feminine', 'any'. Any means that you can't determine but it can be either masculine or feminine.
        Possible cases:
        1. 'nominative' for the subject. Answers the question 'хто? що?'
        2. 'genitive' for possession. Answers the question 'кого? чого?'
        3. 'dative' for the indirect object. Answers the question 'кому? чому?'
        4. 'accusative' for the direct object. Answers the question 'кого? що?'
        5. 'instrumental' for the means. Answers the question 'ким? чим?'
        6. 'locative' for location. Answers the question 'на кому? на чому?'
        7. 'vocative' for addressing someone.

        Rules:
        1. Make sure to cover all occurrences of the entity in the text.
        2. Group occurrences by their mask.

        Here is an example response based on a similar case:
        For text:
        Справа № 303/12352/23
        2/303/1929/23

        УХВАЛА
        про повернення позовної заяви 

        Суддя Мукачівського міськрайонного суду Закарпатської області Заболотний А.М., розглянувши матеріали за позовом ОСОБА_1 до ОСОБА_2 про розірвання шлюбу,

        Response will be:
        {"cases":[{"mask":"ОСОБА_1","grammatical_cases":["nominative"]},{"mask":"ОСОБА_2","grammatical_cases":["genitive"]}],"genders":[{"grammatical_gender":"masculine","mask":"ОСОБА_1"},{"grammatical_gender":"feminine","mask":"ОСОБА_2"}]}

        Ensure your response follows this format and provides the gender and case for each occurrence of the entity.
        '''
    },
    {
        "role": "user",
        "content": '''
        Entities in this text: ОСОБА_1 occurres 5 time(s), ОСОБА_2 occurres 3 time(s).
        Court decision: 
        
        ---

        Справа № 303/12352/23
        2/303/1929/23

        УХВАЛА
        про повернення позовної заяви 

        02 січня 2024 року
        м. Мукачево

        Суддя Мукачівського міськрайонного суду Закарпатської області Заболотний А.М., розглянувши матеріали за позовом ОСОБА_1 до ОСОБА_2 про розірвання шлюбу,-

        в с т а н о в и в:

        20.12.2023 року ОСОБА_1 звернувся до суду з позовом до ОСОБА_2 про розірвання шлюбу.
        Ухвалою судді від 21.12.2023 року позовну заяву залишено без руху для усунення недоліків протягом п`яти днів з дня отримання цієї ухвали. 
        Копію ухвали від 21.12.2023 року представником позивача, адвокатом Баняс В.В., було отримано особисто 21.12.2023 року про, що свідчить наявна в матеріалах справи розписка.
        В подальшому, 26.12.2023 року від представника позивача  ОСОБА_1 , адвоката Баняс В.В. надійшла заява в якій він відкликає позовну заяву та просить повернути йому документи. 
        Враховуючи вищевикладене, а також те, що провадження у цій справі не відкрито, позовну заяву ОСОБА_1 до ОСОБА_2 про розірвання шлюбу слід повернути позивачу.
        
        Позовну заяву  ОСОБА_1 до  ОСОБА_2 про розірвання шлюбу повернути позивачу.

        ---

        Only output valid JSON with no additional commentary.
        '''
    },
    {
        "role": "system",
        "content": '{"cases":[{"mask":"ОСОБА_1","grammatical_cases":["genitive","nominative","genitive","genitive","genitive"]},{"mask":"ОСОБА_2","grammatical_cases":["genitive","genitive","genitive"]}],"genders":[{"grammatical_gender":"masculine","mask":"ОСОБА_1"},{"grammatical_gender":"feminine","mask":"ОСОБА_2"}]}'
    },
    {
      "role": "user",
      "content": '''
        Please, detect all entities from this list: {{entities}}.
        Court decision: 
        
        ---

        {{text}}

        ---

        Only output valid JSON with no additional commentary.
      '''
    }
]

In [61]:
y_pred = run_prediction(oneshot_messages, GendersWithCasesSchema, limit=1, verbose=True)

{'entities': 'ОСОБА_1, ОСОБА_2, ОСОБА_3, ОСОБА_4, ОСОБА_5', 'text': '\n\nСправа № 685/5/24\nПровадження № 2/685/77/24 \n\nУ Х В А Л А\n(про залишення позовної заяви без руху)\n\n\xa0 \xa0 \xa0 \xa002 січня 2024 року \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0\n\xa0 \xa0 \xa0 \xa0 Суддя Теофіпольського районного суду Хмельницької області Самойлович А.П., розглянувши матеріали цивільної справи за позовом ОСОБА_1 ( АДРЕСА_1 ) до ОСОБА_2 ( АДРЕСА_1 ), третя особа: Виконавчий комітет Теофіпольської селищної ради (30602, вул.Небесної Сотні, 19, смт Теофіполь, Хмельницького району, Хмельницької області)\xa0 про позбавлення батьківських прав, \n\n\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 В С Т А Н О В И В :\n\n02 сі

In [62]:
from typing import Dict, List


def _lcs_length(seq1: List[str], seq2: List[str]) -> int:
    m, n = len(seq1), len(seq2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(m):
        for j in range(n):
            if seq1[i] == seq2[j]:
                dp[i + 1][j + 1] = dp[i][j] + 1
            else:
                dp[i + 1][j + 1] = max(dp[i + 1][j], dp[i][j + 1])
    return dp[m][n]

def _evaluate_f1_ordered_metrics(true_map: Dict[str, List[str]], ai_map: Dict[str, List[str]]):
    # We gather all masks, ensuring we consider missing ones from either side
    all_masks = set(true_map.keys()) | set(ai_map.keys())
    precisions, recalls = [], []

    for mask in all_masks:
        tlist = true_map.get(mask, [])
        alist = ai_map.get(mask, [])

        # LCS length measures how many items match in correct order
        lcs = _lcs_length(tlist, alist)

        total_true = len(tlist)
        total_ai = len(alist)

        precision = lcs / total_ai if total_ai else 0.0
        recall = lcs / total_true if total_true else 0.0

        precisions.append(precision)
        recalls.append(recall)

    avg_precision = sum(precisions) / len(precisions) if precisions else 0.0
    avg_recall = sum(recalls) / len(recalls) if recalls else 0.0
    f1_score = (2 * avg_precision * avg_recall / (avg_precision + avg_recall)
                if (avg_precision + avg_recall) else 0.0)

    return {
        "precision": avg_precision,
        "recall": avg_recall,
        "f1_score": f1_score
    }

def evaluate_f1(true_data: GendersWithCasesSchema, ai_data: GendersWithCasesSchema):
    # Store the sequences for each mask so we keep duplicates AND order
    true_cases = {c.mask: list(c.grammatical_cases) for c in true_data.cases}
    ai_cases   = {c.mask: list(c.grammatical_cases) for c in ai_data.cases}

    # If you truly want to handle multiple occurrences of gender in order, store them as a list
    # Typically there's just one gender, but we'll do it the same way for consistency
    true_genders = {g.mask: [g.grammatical_gender] for g in true_data.genders}
    ai_genders   = {g.mask: [g.grammatical_gender] for g in ai_data.genders}

    cases_metrics   = _evaluate_f1_ordered_metrics(true_cases, ai_cases)
    genders_metrics = _evaluate_f1_ordered_metrics(true_genders, ai_genders)

    return {
        "cases": cases_metrics,
        "genders": genders_metrics
    }


In [63]:
# calculate avg f1 scores for cases and gender separately for each model
def calculate_f1_scores(models, test_data, true_labels, y_pred):
    """
    Returns a dictionary containing average F1 metrics for cases and genders, 
    computed across the provided set of models, test_data, and predictions.
    """
    f1_scores = {}
    for model in models:
        avg_f1_scores = []
        for idx in range(len(test_data)):
            true_data = true_labels[idx]
            ai_data = y_pred[model][idx]
            avg_f1_scores.append(evaluate_f1(true_data, ai_data))

        f1_scores[model] = {
            "cases": {
                "precision": sum(score["cases"]["precision"] for score in avg_f1_scores) / len(avg_f1_scores),
                "recall": sum(score["cases"]["recall"] for score in avg_f1_scores) / len(avg_f1_scores),
                "f1_score": sum(score["cases"]["f1_score"] for score in avg_f1_scores) / len(avg_f1_scores)
            },
            "genders": {
                "precision": sum(score["genders"]["precision"] for score in avg_f1_scores) / len(avg_f1_scores),
                "recall": sum(score["genders"]["recall"] for score in avg_f1_scores) / len(avg_f1_scores),
                "f1_score": sum(score["genders"]["f1_score"] for score in avg_f1_scores) / len(avg_f1_scores)
            },
        }

    return f1_scores


In [64]:
y_pred = run_prediction(zeroshot_messages, GendersWithCasesSchema, limit=10, verbose=False)
scores = calculate_f1_scores(models, test_data, true_labels, y_pred)
print(json.dumps(scores, indent=2))

{'entities': 'ОСОБА_1, ОСОБА_2, ОСОБА_3, ОСОБА_4, ОСОБА_5', 'text': '\n\nСправа № 685/5/24\nПровадження № 2/685/77/24 \n\nУ Х В А Л А\n(про залишення позовної заяви без руху)\n\n\xa0 \xa0 \xa0 \xa002 січня 2024 року \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0\n\xa0 \xa0 \xa0 \xa0 Суддя Теофіпольського районного суду Хмельницької області Самойлович А.П., розглянувши матеріали цивільної справи за позовом ОСОБА_1 ( АДРЕСА_1 ) до ОСОБА_2 ( АДРЕСА_1 ), третя особа: Виконавчий комітет Теофіпольської селищної ради (30602, вул.Небесної Сотні, 19, смт Теофіполь, Хмельницького району, Хмельницької області)\xa0 про позбавлення батьківських прав, \n\n\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 В С Т А Н О В И В :\n\n02 сі

KeyboardInterrupt: 

In [36]:
try:
    y_pred = run_prediction(oneshot_messages, GendersWithCasesSchema, limit=10, verbose=False)
    scores = calculate_f1_scores(models, test_data, true_labels, y_pred)
    print(json.dumps(scores, indent=2))
except KeyboardInterrupt:
    print("KeyboardInterrupt")


{'entities': 'ОСОБА_1, ОСОБА_2, ОСОБА_3, ОСОБА_4, ОСОБА_5', 'text': '\n\nСправа № 685/5/24\nПровадження № 2/685/77/24 \n\nУ Х В А Л А\n(про залишення позовної заяви без руху)\n\n\xa0 \xa0 \xa0 \xa002 січня 2024 року \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0\n\xa0 \xa0 \xa0 \xa0 Суддя Теофіпольського районного суду Хмельницької області Самойлович А.П., розглянувши матеріали цивільної справи за позовом ОСОБА_1 ( АДРЕСА_1 ) до ОСОБА_2 ( АДРЕСА_1 ), третя особа: Виконавчий комітет Теофіпольської селищної ради (30602, вул.Небесної Сотні, 19, смт Теофіполь, Хмельницького району, Хмельницької області)\xa0 про позбавлення батьківських прав, \n\n\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 В С Т А Н О В И В :\n\n02 сі

# Here we will split our task into two. A separate task for gender and a separate task for cases.

And also, we will do a few optimizations:
1. Don't send all court case text, instead only show sentence with the entity.
2. One prompt will only try to recognize single entity.

In [18]:
def extract_sentences_with_entity(text: str, entity: str) -> str:
    """
    Extract the sentence with the entity from the text.
    """
    sentences = sent_tokenize(text)
    sentences_with_entity = []
    for sentence in sentences:
        if entity in sentence:
            sentences_with_entity.append(sentence)
    return "\n".join(sentences_with_entity)


print(extract_sentences_with_entity("Це текст з двох речень. В ньому згадано ОСОБА_1 і ОСОБА_2.", "ОСОБА_1"))

В ньому згадано ОСОБА_1 і ОСОБА_2.


In [60]:
zeroshot_messages_gender = [
    {
        "role": "system",
        "content": '''
            You are a professional linguist.
            Your task will be to detect the grammatical gender of the entity in the text.
        '''
    },
    {
        "role": "user",
        "content": '''
            Here is the text:
            {{text}}

            Here is the entity that you need to detect the gender for:
            {{entity}}

            Explain by a quote from the text less than 50 words.
        '''
    }
]

In [33]:
gender_true_labels = []

for i in range(len(true_labels)):
    for gender in true_labels[i].genders:
        gender_true_labels.append(gender.grammatical_gender)

In [68]:
def evaluate_predictions(y_true_labels, y_pred_labels):
    print(classification_report(y_true_labels, y_pred_labels, zero_division=0))

def predict_gender(model, template, schema, limit=None):
    y_pred = {}

    for idx, data in enumerate(test_data if limit is None else test_data[:limit]):
        entities =[entity.mask for entity in true_labels[idx].genders]
        y_pred[idx] = {}
        for entity in entities:
            text = extract_sentences_with_entity(data, entity)
            rendered = {
                "text": text,
                "entity": entity
            }
            messages = resolve_fstrings(template, rendered)
            
            response = predict(model, messages, schema)
            y_pred[idx][entity] = response

    return y_pred

class GenderOnlySchema(BaseModel):
    grammatical_gender: Literal["masculine", "feminine", "any"]
    why: str

def collect_y_pred_labels(y_pred):
    y_pred_labels = []
    for idx in y_pred:
        for entity in y_pred[idx]:
            y_pred_labels.append(y_pred[idx][entity].grammatical_gender)
    return y_pred_labels

for model in models:
    print(model)
    gender_y_pred = predict_gender(model, zeroshot_messages_gender, GenderOnlySchema, limit=10)
    gender_y_pred_labels = collect_y_pred_labels(gender_y_pred)
    evaluate_predictions(gender_true_labels[:len(gender_y_pred_labels)], gender_y_pred_labels)


llama-3.2-8b
              precision    recall  f1-score   support

         any       0.00      0.00      0.00         6
    feminine       0.59      1.00      0.74        10
   masculine       0.75      0.67      0.71         9

    accuracy                           0.64        25
   macro avg       0.45      0.56      0.48        25
weighted avg       0.51      0.64      0.55        25

llama-3.3-70b
              precision    recall  f1-score   support

         any       1.00      0.17      0.29         6
    feminine       1.00      0.90      0.95        10
   masculine       0.60      1.00      0.75         9

    accuracy                           0.76        25
   macro avg       0.87      0.69      0.66        25
weighted avg       0.86      0.76      0.72        25

mistral-nemo
              precision    recall  f1-score   support

         any       0.38      0.50      0.43         6
    feminine       0.88      0.70      0.78        10
   masculine       0.78      0.78  

# Recognize cases separately

In order to recognizes grammatical cases, we are going to simplify the task for the model by:
1. Only recognize one entity at a time. If there are multiple occurrences of the entity, we are going to replace ОСОБА_1 with ОСОБА_1_001, ОСОБА_1_002, etc.
2. Only showing sentences with the entity.

In [72]:
def disambiguate_entities(data: str) -> tuple[str, dict[str, list[str]]]:
    # We'll match any token that looks like an entity: e.g. word_digits (e.g. ОСОБА_1)
    pattern = r'\bОСОБА_\d+\b'
    counters = {}
    occurrences = {}

    def replacement(match):
        ent = match.group(0)
        # Initialize counter for new entities.
        if ent not in counters:
            counters[ent] = 1
            occurrences[ent] = []
        new_ent = f"{ent}_{counters[ent]:03d}"
        occurrences[ent].append(new_ent)
        counters[ent] += 1
        return new_ent

    modified_text = re.sub(pattern, replacement, data)
    return modified_text, occurrences

def extract_sentences_for_entities(text: str, occurrences: dict[str, list[str]]) -> dict[str, list[str]]:
    sentences = sent_tokenize(text)
    # Flatten all disambiguated tokens into one list.
    all_tokens = [token for tokens in occurrences.values() for token in tokens]
    result = {token: [] for token in all_tokens}
    for sentence in sentences:
        for token in all_tokens:
            if token in sentence:
                result[token].append(sentence)
    return result

# Example usage:
data = test_data[0]
modified_text, occ = disambiguate_entities(data)
sentences_by_entity = extract_sentences_for_entities(modified_text, occ)

print("Modified Text:")
print(occ)
print(sentences_by_entity)

Modified Text:
{'ОСОБА_1': ['ОСОБА_1_001', 'ОСОБА_1_002', 'ОСОБА_1_003'], 'ОСОБА_2': ['ОСОБА_2_001', 'ОСОБА_2_002', 'ОСОБА_2_003'], 'ОСОБА_3': ['ОСОБА_3_001'], 'ОСОБА_4': ['ОСОБА_4_001'], 'ОСОБА_5': ['ОСОБА_5_001']}
{'ОСОБА_1_001': ['\n\nСправа № 685/5/24\nПровадження № 2/685/77/24 \n\nУ Х В А Л А\n(про залишення позовної заяви без руху)\n\n\xa0 \xa0 \xa0 \xa002 січня 2024 року \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0\n\xa0 \xa0 \xa0 \xa0 Суддя Теофіпольського районного суду Хмельницької області Самойлович А.П., розглянувши матеріали цивільної справи за позовом ОСОБА_1_001 ( АДРЕСА_1 ) до ОСОБА_2_001 ( АДРЕСА_1 ), третя особа: Виконавчий комітет Теофіпольської селищної ради (30602, вул.Небесної Сотні, 19, смт Теофіполь, Хмельницького району, Хмельницької області)\xa0 про позбавлення батьківських прав, \n\n\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0

In [None]:
detect_case_zeroshot_messages = [
    {
        "role": "system",
        "content": '''
            You are a professional linguist.
            Your task will be to detect the grammatical case of the entity hidden behind the mask.
            The text is going to be in Ukrainian.

            Nominative: синій кінь (blue horse)
            Genitive: синього коня (of the blue horse)
            Dative: синьому коню (to the blue horse)
            Accusative: синього коня (blue horse)
            Instrumental: синім конем (with the blue horse)
            Locative: на синьому коневі (on the blue horse)

        '''
    },
    {
        "role": "user",
        "content": '''
            Here is the text:
            {{text}}

            The mask of the entity that you need to detect the case for:
            {{entity}}

            Explain reasoning for your answer in 15 words or less.
        '''
    }
]

class CaseOnlySchema(BaseModel):
    grammatical_case: Literal["nominative", "genitive", "dative", "accusative", "instrumental", "prepositional"]
    why: str


def predict_cases(model, template, schema, limit=None):
    y_pred = {}

    for idx, data in enumerate(test_data if limit is None else test_data[:limit]):
        y_pred[idx] = {}
        modified_text, disambiguated = disambiguate_entities(data)
        sentences_by_entity = extract_sentences_for_entities(modified_text, disambiguated)
        for token, sentences in sentences_by_entity.items():
            text = "\n".join(sentences)
            rendered = {
                "text": text,
                "entity": token
            }
            messages = resolve_fstrings(template, rendered)
            response = predict(model, messages, schema)
            y_pred[idx][token] = response
    return y_pred


def evaluate_predictions(y_true_labels, y_pred_labels):
    print(classification_report(y_true_labels, y_pred_labels, zero_division=0))


def collect_y_pred_labels(y_pred):
    y_pred_labels = []
    for idx in y_pred:
        for entity in y_pred[idx]:
            y_pred_labels.append(y_pred[idx][entity].grammatical_case)
    return y_pred_labels

def collect_y_true_labels(y_true_labels):
    y_true_labels_labels = []
    for sample_idx in y_true_labels:
        for cases in y_true_labels[sample_idx].cases:
            for case in cases.grammatical_cases:
                y_true_labels_labels.append(case)
    return y_true_labels_labels


for model in models:
# model = "gpt-4o-mini"
    y_pred = predict_cases(model, detect_case_zeroshot_messages, CaseOnlySchema, limit=10)
    y_pred_labels = collect_y_pred_labels(y_pred)
    y_true_labels = collect_y_true_labels(true_labels)
    print(model)
    evaluate_predictions(y_true_labels[:len(y_pred_labels)], y_pred_labels)


In [None]:
            # 2. Agreement between Numerals and Nouns:

            # "Два", "три", "чотири" (two, three, four): These numerals require the noun in the nominative plural.

            # два стільці (two chairs)
            # три книги (three books)
            # чотири яблука (four apples)
            # "П'ять" and above: Use the genitive plural of the noun.

            # п'ять стільців (five chairs)
            # десять книг (ten books)
            # двадцять яблук (twenty apples)
            # Compound Numerals: The noun agrees with the last numeral.

            # двісті тридцять шість стільців (two hundred thirty-six chairs)
            # Fractional Numerals: The noun is in the genitive singular.

            # три п'ятих частини (three-fifths of the parts)
            # 3. Agreement between Pronouns and Nouns:

            # Gender and Number: Pronouns must match the noun in gender and number.

            # Masculine singular: його книга (his book)
            # Feminine singular: її книга (her book)
            # Neuter singular: його вікно (its window)
            # Plural: їхні книги (their books)
            # Case: Pronouns change according to the noun's case.

            # Nominative: його книга (his book)
            # Genitive: його книги (of his book)
            # Dative: його книзі (to his book)
            # Accusative: його книгу (his book)
            # Instrumental: його книгою (with his book)
            # Locative: на його книзі (on his book)
            # 4. Agreement between Participles and Nouns:

            # Gender and Number: Participles must match the noun in gender and number.

            # Masculine singular: читаючий студент (reading student)
            # Feminine singular: читаюча студентка (reading female student)
            # Neuter singular: читаюче вікно (reading window)
            # Plural: читаючі студенти (reading students)
            # Case: Participles change according to the noun's case.

            # Nominative: читаючий студент (reading student)
            # Genitive: читаючого студента (of the reading student)
            # Dative: читаючому студенту (to the reading student)
            # Accusative: читаючого студента (reading student)
            # Instrumental: читаючим студентом (with the reading student)
            # Locative: на читаючому студенті (on the reading student)
            # 5. Agreement between Verbs and Subjects:

            # Number: Verbs must match the subject in number.

            # Singular: студент читає (the student reads)
            # Plural: студенти читають (the students read)
            # Gender: For past tense verbs, they must match the subject in gender.

            # Masculine singular: студент читав (the student read)
            # Feminine singular: студентка читала (the female student read)
            # Plural: студенти читали (the students read)
            # 6. Agreement between Adverbs and Other Parts of Speech:

            # Adverbs do not change according to case, gender, or number, so they do not agree with other parts of speech.

            # 7. Agreement between Numerals and Nouns in Complex Constructions:

            # In compound numerals, only the last numeral affects the noun's form.

            # двісті тридцять шостий день (the two hundred thirty-sixth day)
            # 8. Agreement between Nouns and Other Nouns:

            # Nouns in the genitive plural do not change according to case.

            # без друзів (without friends)