## Terminal

In [1]:
# Cell 3: prepare the input text and convert it to CoNLL-U+
from pathlib import Path

# Transliterated sample text; the first line is a '#'-prefixed title line.
texto = """# Enūma Eliš
e-nu-ma e-liš la na-bu-u2 šamû
šap-liš am-ma-tum šu-ma la za-ak-rat
ZU.AB-ma reš-tu-u2 za-ru-šu-un
"""

# Make sure the target directory exists so the cell also works on a fresh
# checkout (open(..., 'w') does not create missing parent directories).
Path('demo').mkdir(parents=True, exist_ok=True)

with open('./demo/mi_texto.txt', 'w', encoding='utf-8') as f:
    f.write(texto)

# The conversion runs through the command-line script, so this cell needs no
# name from the txt2conllu module; the former `from txt2conllu import *`
# only polluted the notebook namespace and has been dropped.
!python txt2conllu.py --filename=demo/mi_texto.txt

> File converted to CoNLL-U+ and saved as demo/mi_texto.conllu


In [None]:


# Celda 4: Lemmatizar CON GPU (sin --use-cpu)
!python babylemmatizer.py \
    --lemmatize=lbtest2 \
    --filename=/demo/mi_texto.conllu
    # Sin --use-cpu para usar GPU P100

print("\n✓ Lemmatización con GPU completada")
!ls -lh demo/mi_texto_*.conllu




‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›

   BabyLemmatizer 2.2

   A. Aleksi Sahala 2023-2024
      + https://github.com/asahala

   University of Helsinki
      + Origins of Emesal Project
      + Centre of Excellence for Ancient Near-Eastern Empires

‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›‹<>›


✓ Lemmatización con GPU completada


Traceback (most recent call last):
  File "c:\Users\ochoa\projects\babel\BabyLemmatizer\babylemmatizer.py", line 124, in <module>
    lemmatizer = lemmatizer_pipeline.Lemmatizer(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ochoa\projects\babel\BabyLemmatizer\lemmatizer_pipeline.py", line 84, in __init__
    os.mkdir(step_path)
FileNotFoundError: [WinError 3] The system cannot find the path specified: '/demo\\steps'
'ls' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
# Cell 5: analyse results
import pandas as pd

# Collect every non-empty, non-comment line of the CoNLL-U file, stripped of
# surrounding whitespace.
lines = []
with open('./demo/mi_texto.conllu', 'r', encoding='utf-8') as f:
    for raw in f:
        stripped = raw.strip()
        if stripped and not raw.startswith('#'):
            lines.append(stripped)

# One record per token row that has at least five tab-separated fields.
# SCORE is read from field index 15 when the row is that wide, else '_'.
data = [
    {
        'FORM': cols[1],
        'LEMMA': cols[2],
        'XPOS': cols[4],
        'SCORE': cols[15] if len(cols) > 15 else '_',
    }
    for cols in (entry.split('\t') for entry in lines)
    if len(cols) >= 5
]

df = pd.DataFrame(data)
print(df)

           FORM LEMMA XPOS SCORE
0       e-nu-ma     _    _     _
1         e-liš     _    _     _
2            la     _    _     _
3      na-bu-u₂     _    _     _
4          šamû     _    _     _
5       šap-liš     _    _     _
6     am-ma-tum     _    _     _
7         šu-ma     _    _     _
8            la     _    _     _
9     za-ak-rat     _    _     _
10     ZU.AB-ma     _    _     _
11    reš-tu-u₂     _    _     _
12  za-ru-šu-un     _    _     _


---

    Ejemplos de uso de BabyLemmatizer como librería
    con soporte para archivos y objetos en memoria

In [4]:
from txt2conllu import txt_lines_to_conllu
from lemmatizer_pipeline import Lemmatizer

## Uso clásico (archivo → archivo)

## Librería pura (lista → objeto)

In [5]:
def ejemplo_libreria_pura():
    """Library-style usage: feed a list of lines, get a result object back.

    Converts a hard-coded three-line sample with ``txt_lines_to_conllu``,
    runs the ``lbtest2`` model on CPU through ``Lemmatizer.run_model`` and
    prints one tab-separated ``id / form / lemma / xpos`` row per entry
    yielded by ``get_contents``.

    Returns:
        The object returned by ``Lemmatizer.run_model``. (The function was
        previously annotated ``-> None`` although it returns this object.)

    NOTE(review): no input/output file is named here, but the recorded run
    shows the library still using a temporary directory internally.
    """
    print("\n=== EJEMPLO 2: Modo librería pura (lista → objeto) ===")

    # Input text as a list of transliterated lines
    texto = [
        "e-nu-ma e-liš la na-bu-u₂ šamû",
        "šap-liš am-ma-tum šu-ma la za-ak-rat",
        "ZU.AB-ma reš-tu-u₂ za-ru-šu-un"
    ]

    # Step 1: convert the text to a CoNLL-U object
    conllu_obj = txt_lines_to_conllu(lines=texto)
    print(f"✓ Creado objeto ConlluPlus con {conllu_obj.word_count} palabras")

    # Step 2: lemmatize, passing the object instead of a file path
    lemmatizer = Lemmatizer(input_file=conllu_obj)
    resultado = lemmatizer.run_model(model_name='lbtest2', cpu=True)

    # Step 3: read the results straight from the returned object
    for id_, form, lemma, xpos in resultado.get_contents('id', 'form', 'lemma', 'xpos'):
        print(f"{id_}\t{form}\t{lemma}\t{xpos}")

    return resultado

In [6]:
ejemplo_libreria_pura()


=== EJEMPLO 2: Modo librería pura (lista → objeto) ===
✓ Creado objeto ConlluPlus con 13 palabras
> Your model was trained with an old version of BabyLemmatizer.
> Using Tokenizer setting 0. Rebuild model using --tokenizer.
> Using tokenizer 0
> Your model was trained with an old version of BabyLemmatizer.
> Using default contexts
> Tagger context = 2
> Lemmatizer context = 1
> Normalizing CoNLL-U
> Updating field "formctx"
> Fetching contexts for "form"
> Generating input data for neural net (memory mode)
> Input file size: 13 words in 3 segments.
> Tagging with lbtest2
> Updating field "xpos"


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\ochoa\\AppData\\Local\\Temp\\babylem_jqyigftq\\temp.tag_pred'

## Híbrido (lista → objeto → archivo)

In [None]:
def ejemplo_hibrido():
    """Hybrid usage: build the object from a list, but also name output files.

    The sample lines are converted with ``txt_lines_to_conllu`` (passing
    ``output="mi_texto.conllu"``), then lemmatized with the ``lbtest2`` model
    on CPU by a ``Lemmatizer`` created with ``output_file="resultado.conllu"``.

    Returns:
        Whatever ``Lemmatizer.run_model`` returns.
    """
    print("\n=== EJEMPLO 3: Modo híbrido (lista → objeto → archivo) ===")

    lineas = [
        "e-nu-ma e-liš la na-bu-u₂ šamû",
        "šap-liš am-ma-tum šu-ma la za-ak-rat",
    ]

    # Step 1: text -> object, additionally asking for a file copy
    documento = txt_lines_to_conllu(lineas, output="mi_texto.conllu")
    print("✓ Creado objeto Y guardado mi_texto.conllu")

    # Step 2: lemmatize with an explicit output file
    salida = Lemmatizer(documento, output_file="resultado.conllu").run_model(
        'lbtest2', cpu=True
    )
    print("✓ Resultado guardado en resultado.conllu")

    return salida

In [10]:
ejemplo_hibrido()


=== EJEMPLO 3: Modo híbrido (lista → objeto → archivo) ===
> File converted to CoNLL-U+ and saved as mi_texto.conllu
['e-nu-ma e-liš la na-bu-u₂ šamû', 'šap-liš am-ma-tum šu-ma la za-ak-rat']
✓ Creado objeto Y guardado mi_texto.conllu
> Your model was trained with an old version of BabyLemmatizer.
> Using Tokenizer setting 0. Rebuild model using --tokenizer.
> Using tokenizer 0
> Your model was trained with an old version of BabyLemmatizer.
> Using default contexts
> Tagger context = 2
> Lemmatizer context = 1
> Normalizing CoNLL-U
> Updating field "formctx"
> Fetching contexts for "form"
> Generating input data for neural net (memory mode)
> Input file size: 10 words in 2 segments.
> Tagging with lbtest2
> Updating field "xpos"


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\ochoa\\AppData\\Local\\Temp\\babylem_q571ajcb\\temp.tag_pred'

## Pipeline completo en memoria

In [11]:
def ejemplo_pipeline_completo():
    """Full pipeline: text -> lemmatization -> small summary.

    Converts three sample lines, lemmatizes them with the ``lbtest2`` model on
    CPU (``Lemmatizer`` created with ``ignore_numbers=True``), then prints how
    many distinct lemmas and how many distinct forms with a POS tag were
    found. The placeholder ``'_'`` counts as "no value" for both.

    Returns:
        Whatever ``Lemmatizer.run_model`` returns.
    """
    print("\n=== EJEMPLO 4: Pipeline completo en memoria ===")

    lineas = [
        "e-nu-ma e-liš la na-bu-u₂ šamû",
        "šap-liš am-ma-tum šu-ma la za-ak-rat",
        "ZU.AB-ma reš-tu-u₂ za-ru-šu-un",
    ]

    # Steps 1 and 2: txt -> conllu -> lemmatized result
    documento = txt_lines_to_conllu(lineas)
    analizado = Lemmatizer(documento, ignore_numbers=True).run_model('lbtest2', cpu=True)

    # Step 3: read the rows once, then summarise them
    filas = list(analizado.get_contents('id', 'form', 'lemma', 'xpos'))
    lemas = {lemma for _, _, lemma, _ in filas if lemma != '_'}
    # Keyed by form: a later row with the same form replaces the earlier tag.
    etiquetas = {form: xpos for _, form, _, xpos in filas if xpos != '_'}

    print(f"\n✓ Total de lemas únicos: {len(lemas)}")
    print(f"✓ Formas con POS tag: {len(etiquetas)}")

    return analizado

In [12]:
ejemplo_pipeline_completo()


=== EJEMPLO 4: Pipeline completo en memoria ===
> Your model was trained with an old version of BabyLemmatizer.
> Using Tokenizer setting 0. Rebuild model using --tokenizer.
> Using tokenizer 0
> Your model was trained with an old version of BabyLemmatizer.
> Using default contexts
> Tagger context = 2
> Lemmatizer context = 1
> Normalizing CoNLL-U
> Updating field "formctx"
> Fetching contexts for "form"
> Generating input data for neural net (memory mode)
> Input file size: 13 words in 3 segments.
> Tagging with lbtest2
> Updating field "xpos"


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\ochoa\\AppData\\Local\\Temp\\babylem_e24y105q\\temp.tag_pred'

## Batch processing

In [13]:
def ejemplo_batch():
    """Process several independent texts one after another.

    Each text is a list of lines; every text is converted with
    ``txt_lines_to_conllu`` and lemmatized with its own ``Lemmatizer`` using
    the ``lbtest2`` model on CPU.

    Returns:
        A list with one ``run_model`` result per input text, in input order.
    """
    print("\n=== EJEMPLO 5: Batch processing ===")

    lotes = [
        ["e-nu-ma e-liš la na-bu-u₂ šamû"],
        ["šap-liš am-ma-tum šu-ma"],
        ["ZU.AB-ma reš-tu-u₂ za-ru-šu-un"],
    ]
    total = len(lotes)

    salidas = []
    for numero, lote in zip(range(1, total + 1), lotes):
        print(f"\nProcesando texto {numero}/{total}...")
        # A fresh converter output and Lemmatizer per text
        salidas.append(
            Lemmatizer(txt_lines_to_conllu(lote)).run_model('lbtest2', cpu=True)
        )

    print(f"\n✓ Procesados {len(salidas)} textos")
    return salidas


In [14]:
ejemplo_batch()


=== EJEMPLO 5: Batch processing ===

Procesando texto 1/3...
> Your model was trained with an old version of BabyLemmatizer.
> Using Tokenizer setting 0. Rebuild model using --tokenizer.
> Using tokenizer 0
> Your model was trained with an old version of BabyLemmatizer.
> Using default contexts
> Tagger context = 2
> Lemmatizer context = 1
> Normalizing CoNLL-U
> Updating field "formctx"
> Fetching contexts for "form"
> Generating input data for neural net (memory mode)
> Input file size: 5 words in 1 segments.
> Tagging with lbtest2
> Updating field "xpos"


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\ochoa\\AppData\\Local\\Temp\\babylem_tg_w0h1l\\temp.tag_pred'

## Integración con pandas

In [15]:
def ejemplo_pandas():
    """Export lemmatization results to a pandas DataFrame.

    Converts two sample lines, lemmatizes them with the ``lbtest2`` model on
    CPU, and builds a DataFrame with the columns ``id``, ``form``, ``lemma``
    and ``pos`` (``pos`` holds the ``xpos`` value). The frame is printed and
    returned.

    Returns:
        The ``pandas.DataFrame`` built from the result rows.
    """
    print("\n=== EJEMPLO 6: Integración con pandas ===")

    import pandas as pd

    lineas = [
        "e-nu-ma e-liš la na-bu-u₂ šamû",
        "šap-liš am-ma-tum šu-ma la za-ak-rat",
    ]

    # Process: txt -> conllu -> lemmatized result
    analizado = Lemmatizer(txt_lines_to_conllu(lineas)).run_model('lbtest2', cpu=True)

    # One record per row; built in a single pass and handed to pandas at once
    registros = [
        {'id': id_, 'form': form, 'lemma': lemma, 'pos': xpos}
        for id_, form, lemma, xpos in analizado.get_contents('id', 'form', 'lemma', 'xpos')
    ]
    tabla = pd.DataFrame(registros)

    print("\n✓ DataFrame creado:")
    print(tabla)

    return tabla

In [16]:
ejemplo_pandas()


=== EJEMPLO 6: Integración con pandas ===
> Your model was trained with an old version of BabyLemmatizer.
> Using Tokenizer setting 0. Rebuild model using --tokenizer.
> Using tokenizer 0
> Your model was trained with an old version of BabyLemmatizer.
> Using default contexts
> Tagger context = 2
> Lemmatizer context = 1
> Normalizing CoNLL-U
> Updating field "formctx"
> Fetching contexts for "form"
> Generating input data for neural net (memory mode)
> Input file size: 10 words in 2 segments.
> Tagging with lbtest2
> Updating field "xpos"


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\ochoa\\AppData\\Local\\Temp\\babylem_wj6kjhkw\\temp.tag_pred'