<a href="https://colab.research.google.com/github/componavt/topkar-space/blob/main/src/ner/stanza_toponym_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🌍 Toponym Extraction & Evaluation Pipeline

This notebook extracts **geographical entities (toponyms)** from Russian text using **Stanza NER**, optionally normalizes them via **lemmatization** (`pymorphy3`), and saves results to `output.csv`. 📤  
It then **compares** extracted toponyms against a **reference annotation** (`reference_table_sample100.csv`) to compute **precision, recall, and F1-score**. 📊  
Includes detailed error analysis and exports a full comparison report. 🔍  

In [None]:
!pip install stanza
!pip install pandas
!pip install nltk
!pip install pymorphy3
!pip install re

In [None]:
import stanza

# Загружаем модель для русского языка
stanza.download('ru')  # Загрузите модель, если она еще не загружена
nlp = stanza.Pipeline('ru')

# Пример текста на русском языке
text = "Дом в дер. Койвусельга, фам. Михайлов. ="

# Обрабатываем текст
doc = nlp(text)

# Извлекаем именованные сущности
for sentence in doc.sentences:
    for entity in sentence.ents:
        print(f'Текст: {entity.text}, Тип: {entity.type}')


In [None]:
import pandas as pd

csv_files = [
#    "https://raw.githubusercontent.com/componavt/topkar-space/main/data/sample10.csv",
    "https://raw.githubusercontent.com/componavt/topkar-space/main/data/sample100.csv",
]

df = pd.concat([pd.read_csv(url, sep = ';') for url in csv_files], ignore_index=True)
df = df.reset_index()  # make sure indexes pair with number of rows
df.head()


In [None]:
import stanza

text = "Яблоко рассматривает возможность покупки стартапа в Великобритании за 1 миллиард долларов. Владимир Путин является президентом России. А ещё паста и Кижи"

doc = nlp(text)

for sentence in doc.sentences:
    for entity in sentence.ents:
        print(f'Текст: {entity.text}, Тип: {entity.type}')

In [7]:
"""
NER with lemmatizator
"""
from ast import Import
import nltk
import re
import pymorphy3
from nltk.tokenize import sent_tokenize, word_tokenize
lines = df['Text'].tolist()

morph = pymorphy3.MorphAnalyzer()

print()
print("Number of toponyms:", len(lines))
toponyms = []
df['Text'] = df['Text'].replace({float('nan'): ""})
num=0
output_csv = 'output.csv'
with open(output_csv, 'w', encoding='utf-8') as outfile:
  outfile.writelines(f"sentence_id; toponyms_list \n")
  if sentence:
    for sentence in lines:
        tex = nlp(sentence)
        print(f'\nSentence {num}: {sentence}')
        num+=1
        for entity in tex.ents:
          if entity.type == "LOC":
            print(f'Location: {entity.text}')
            parsed = morph.parse(entity.text)[0]
            normalized_word = parsed.normal_form
            # print(f'Location: {entity.text}, type: {entity.type}')
            toponyms.append(normalized_word)
            print(f'Location: {normalized_word}')
        strk = (f"{num}; {toponyms} \n")
        strk = strk.replace('[', '').replace(']', '').replace('\'', '').replace('\"', '')
        outfile.writelines(strk)
        toponyms = []

#print("Число найденных топонимов", len(toponyms))


Number of toponyms: 100

Sentence 0: Пахотная поляна в устье Пижейручья.
Location: Пижейручья
Location: пижейручей

Sentence 1: покосы на юге острова Галайский, см. Галайский Остров
Location: Галайский
Location: галайский
Location: Галайский Остров
Location: галайский остр

Sentence 2: Маленькое озерко за хутором Poh'd'ad'g', между оз. Sar'gär'v и Vougedg'är'v.
Location: Poh'd'ad'g'
Location: poh'd'ad'g'
Location: Sar'gär'v
Location: sar'gär'v
Location: Vougedg'är'v
Location: vougedg'är'v

Sentence 3: Поляны по дороге в дер. Сидорово.
Location: Поляны
Location: поляна
Location: Сидорово
Location: сидоровый

Sentence 4: Дом в дер. Койвусельга, фам. Михайлов.
Location: Михайлов
Location: михайлов

Sentence 5: Река Колпь, вытекает из оз. Jokšar'v, течет в Вологодск. обл.
Location: Колпь
Location: колпать
Location: Jokšar'v
Location: jokšar'v
Location: Вологодск
Location: вологодск

Sentence 6: Поле находится в 600-700 м на востоке д.Каскосельга.
Location: Каскосельга
Location: каскосельг

In [None]:
"""
NER without lemmatizator
"""
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
lines = df['Text'].tolist()
print()
print("Number of toponyms:", len(lines))
toponyms = []
df['Text'] = df['Text'].replace({float('nan'): ""})
num=0
output_csv = 'output.csv'
with open(output_csv, 'w', encoding='utf-8') as outfile:
  outfile.writelines(f"sentence_id; toponyms_list \n")
  for sentence in lines:
      tex = nlp(sentence)
      print(f'\nSentence {num}: {sentence}')
      num+=1
      for entity in tex.ents:
        if entity.type == "LOC":
          print(f'Location: {entity.text}')
          # print(f'Location: {entity.text}, type: {entity.type}')
          toponyms.append(entity.text)
      strk = (f"{num}; {toponyms} \n")
      strk = strk.replace('[', '').replace(']', '').replace('\'', '').replace('\"', '')
      outfile.writelines(strk)
      toponyms = []


In [9]:
"""
Comparison Analysis between output.csv and reference_table_sample100.csv
"""

import pandas as pd
import re
from collections import Counter

def clean_toponym_list(toponym_str):
    """Clean and normalize toponym lists for comparison"""
    if pd.isna(toponym_str) or toponym_str == '':
        return []

    # Split by comma and clean each toponym
    toponyms = [t.strip() for t in str(toponym_str).split(',')]
    # Remove empty strings
    toponyms = [t for t in toponyms if t]
    return toponyms

def compare_toponym_lists(list1, list2):
    """Compare two lists of toponyms and return differences"""
    set1 = set(list1)
    set2 = set(list2)

    only_in_1 = set1 - set2
    only_in_2 = set2 - set1
    common = set1 & set2

    return {
        'only_in_output': list(only_in_1),
        'only_in_reference': list(only_in_2),
        'common': list(common),
        'output_count': len(list1),
        'reference_count': len(list2),
        'common_count': len(common)
    }


# Read the datasets
print("Loading datasets...")

# Read output.csv
output_df = pd.read_csv('output.csv', sep=';')
print(f"Output dataset: {len(output_df)} rows")

# Read reference table
reference_df = pd.read_csv('reference_table_sample100.csv')
print(f"Reference dataset: {len(reference_df)} rows")

# Clean column names
output_df.columns = output_df.columns.str.strip()

print("\n" + "="*80)
print("DATASET STRUCTURE COMPARISON")
print("="*80)

print(f"\nOutput.csv columns: {list(output_df.columns)}")
print(f"Reference table columns: {list(reference_df.columns)}")

print("\n" + "="*80)
print("TOPONYM LIST COMPARISON")
print("="*80)

# Compare toponym lists
comparison_results = []

for idx in range(len(output_df)):
    output_row = output_df.iloc[idx]
    reference_row = reference_df.iloc[idx] if idx < len(reference_df) else None

    # Get toponyms from output.csv
    output_toponyms = clean_toponym_list(output_row['toponyms_list'])

    # Get toponyms from reference table
    if reference_row is not None:
        reference_toponyms = clean_toponym_list(reference_row['Toponim_list'])
    else:
        reference_toponyms = []

    # Compare the lists
    comparison = compare_toponym_lists(output_toponyms, reference_toponyms)
    comparison['sentence_id'] = output_row['sentence_id']
    comparison['output_text'] = output_row['toponyms_list']
    comparison['reference_text'] = reference_row['Toponim_list'] if reference_row is not None else ''

    comparison_results.append(comparison)

# Create summary statistics
total_only_in_output = sum(len(r['only_in_output']) for r in comparison_results)
total_only_in_reference = sum(len(r['only_in_reference']) for r in comparison_results)
total_common = sum(r['common_count'] for r in comparison_results)
total_output = sum(r['output_count'] for r in comparison_results)
total_reference = sum(r['reference_count'] for r in comparison_results)

print(f"\nSUMMARY STATISTICS:")
print(f"Total toponyms in output.csv: {total_output}")
print(f"Total toponyms in reference table: {total_reference}")
print(f"Common toponyms: {total_common}")
print(f"Only in output.csv: {total_only_in_output}")
print(f"Only in reference table: {total_only_in_reference}")

# Calculate accuracy metrics
if total_reference > 0:
    precision = total_common / total_output if total_output > 0 else 0
    recall = total_common / total_reference
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"\nACCURACY METRICS:")
    print(f"Precision: {precision:.3f} ({precision*100:.1f}%)")
    print(f"Recall: {recall:.3f} ({recall*100:.1f}%)")
    print(f"F1-Score: {f1_score:.3f} ({f1_score*100:.1f}%)")

# Find rows with significant differences
print(f"\n" + "="*80)
print("ROWS WITH SIGNIFICANT DIFFERENCES")
print("="*80)

significant_diffs = []
for r in comparison_results:
    if len(r['only_in_output']) > 0 or len(r['only_in_reference']) > 0:
        significant_diffs.append(r)

print(f"Found {len(significant_diffs)} rows with differences")

# Show first 10 significant differences
for i, diff in enumerate(significant_diffs[:10]):
    print(f"\nRow {diff['sentence_id']}:")
    print(f"  Output: '{diff['output_text']}'")
    print(f"  Reference: '{diff['reference_text']}'")
    if diff['only_in_output']:
        print(f"  Only in output: {diff['only_in_output']}")
    if diff['only_in_reference']:
        print(f"  Only in reference: {diff['only_in_reference']}")
    if diff['common']:
        print(f"  Common: {diff['common']}")

# Analyze most common differences
print(f"\n" + "="*80)
print("MOST COMMON DIFFERENCES")
print("="*80)

all_only_in_output = []
all_only_in_reference = []

for r in comparison_results:
    all_only_in_output.extend(r['only_in_output'])
    all_only_in_reference.extend(r['only_in_reference'])

print(f"\nMost frequent toponyms only in output.csv:")
output_counter = Counter(all_only_in_output)
for toponym, count in output_counter.most_common(10):
    print(f"  '{toponym}': {count} times")

print(f"\nMost frequent toponyms only in reference table:")
reference_counter = Counter(all_only_in_reference)
for toponym, count in reference_counter.most_common(10):
    print(f"  '{toponym}': {count} times")

# Save detailed comparison to CSV
comparison_df = pd.DataFrame(comparison_results)
comparison_df.to_csv('detailed_comparison.csv', index=False)
print(f"\nDetailed comparison saved to 'detailed_comparison.csv'")

Loading datasets...
Output dataset: 100 rows


FileNotFoundError: [Errno 2] No such file or directory: 'reference_table_sample100.csv'