In [4]:
# Alternative sample sentence:
# text = "子供は公園で遊んだり、家で勉強したりします。"

# Notebook-wide sample input: three sentences joined by implicit literal concatenation.
text = (
    "食べさせられました。"
    "この車は速くありませんでした。"
    "怒ってしまった。"
)

In [None]:
from tabulate import tabulate

# SudachiPy

In [None]:
from sudachipy import dictionary, tokenizer
import pandas as pd

# Tokenizer backed by the "full" Sudachi dictionary, run in split mode C.
tokenizer_obj = dictionary.Dictionary(dict="full").create()
mode = tokenizer.Tokenizer.SplitMode.C

morphemes = tokenizer_obj.tokenize(text, mode)

# Every column is named after the zero-argument morpheme accessor that fills it,
# so the table can be driven by this list of accessor names.
morpheme_fields = (
    "surface",
    "dictionary_form",
    "normalized_form",
    "part_of_speech",
    "reading_form",
    "begin",
    "end",
    "is_oov",
    "dictionary_id",
    "part_of_speech_id",
    "word_id",
    "synonym_group_ids",
    "raw_surface",
)

# One record per morpheme, columns in the order listed above.
data = [
    {field: getattr(m, field)() for field in morpheme_fields}
    for m in morphemes
]

# Plain-text alternative to the rich DataFrame display:
# print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="github", showindex=False))
pd.DataFrame(data)

# Stanza

In [None]:
import pandas as pd
import stanza
from nltk import Tree

# Japanese Stanza pipeline; models are read from the project-local resources directory.
nlp = stanza.Pipeline(
    lang="ja",
    processors='tokenize, lemma, pos, constituency',
    model_dir="../toy_content/_stanza_resources",
    logging_level='WARN',
)

doc = nlp(text)

for sentence in doc.sentences:
    # token.to_dict() gives a list per token, so flatten everything into one
    # list of per-word dicts for the sentence.
    word_dicts = [entry for token in sentence.tokens for entry in token.to_dict()]
    print(word_dicts)

    display(pd.DataFrame(word_dicts))
    # print(tabulate(pd.DataFrame(word_dicts), headers="keys", tablefmt="github", showindex=False))

    # Re-parse the constituency tree's string form with NLTK to draw it as text.
    tree = Tree.fromstring(str(sentence.constituency))
    tree.pretty_print()


# MeCab

In [None]:
import subprocess

# Labels for the comma-separated feature fields, in the order MeCab prints them.
# Fields 5 and 6 are the conjugation type (活用型) followed by the conjugation
# form (活用形); the labels must follow that order.
# NOTE(review): this nine-field layout is the IPADIC one - confirm which
# dictionary the local `mecab` command is configured with.
MECAB_FEATURE_NAMES = (
    "pos",
    "pos_detail1",
    "pos_detail2",
    "pos_detail3",
    "conjugation_type",
    "conjugation_form",
    "base_form",
    "reading",
    "pronunciation",
)

# check=True: a failing mecab run raises CalledProcessError instead of silently
# producing an empty table.
result = subprocess.run(['mecab'], input=text, text=True, capture_output=True, check=True)
lines = result.stdout.strip().split('\n')

rows = []
for line in lines:
    # Skip blank lines and the end-of-sentence marker.
    if not line or line == 'EOS':
        continue
    cols = line.split('\t')
    # A token line is "<surface>\t<features>"; ignore anything else.
    if len(cols) < 2:
        continue
    surface = cols[0]
    features = cols[1].split(',')
    # Pad so that fields missing from a short feature list come out as "".
    padded = features + [""] * len(MECAB_FEATURE_NAMES)
    row = {"surface": surface, **dict(zip(MECAB_FEATURE_NAMES, padded))}
    rows.append(row)

display(pd.DataFrame(rows))
print(tabulate(pd.DataFrame(rows), headers="keys", tablefmt="github", showindex=False))

# Fugashi

In [None]:
import fugashi
import unidic
# Quote the dictionary directory so that an install path containing spaces is
# still handed to MeCab as a single -d argument.
tagger = fugashi.Tagger(f'-d "{unidic.DICDIR}"')

In [None]:
# The parse result was previously computed and thrown away (it was not the
# cell's last expression), so print it explicitly.
print(tagger.parse(text))

# One line per token: the token itself, its lemma and its part of speech.
for word in tagger(text):
    print(word, word.feature.lemma, word.pos, sep='\t')

In [None]:
def analyze_verb_conjugation(text):
    """Collect the verb and auxiliary-verb tokens of `text`.

    The text is tokenized with the notebook-level `tagger`. A token is recorded
    when its part-of-speech string contains '動詞'.

    Returns:
        dict with the keys
        - "input": the text as given
        - "dictionary_form": lemma of the first recorded token that has one,
          or None when there is none
        - "conjugation_chain": type / form / surface of each recorded token
          whose conjugation form is non-empty and not "*"
        - "segments": surface, lemma, pos, conjugation type and conjugation
          form of every recorded token
    """
    tokens = list(tagger(text))

    segments = []
    chain = []
    base_form = None

    for token in tokens:
        pos = token.pos
        features = token.feature
        lemma = features.lemma
        conj_type = features.cType  # 活用型
        conj_form = features.cForm  # 活用形

        # '助動詞' contains '動詞', so this single substring test keeps both
        # verbs and auxiliary verbs and skips everything else.
        if '動詞' not in pos:
            continue

        segments.append({
            "surface": token.surface,
            "lemma": lemma,
            "pos": pos,
            "conjugation_type": conj_type,
            "conjugation_form": conj_form
        })

        if base_form is None:
            base_form = lemma

        has_form = bool(conj_form) and conj_form != "*"
        if has_form:
            chain.append({
                "type": conj_type,
                "form": conj_form,
                "surface": token.surface
            })

    return {
        "input": text,
        "dictionary_form": base_form,
        "conjugation_chain": chain,
        "segments": segments
    }

# Test examples
test_inputs = [
    "食べさせられました",   # causative passive past polite
    "読まなかった",         # negative past
    "行っている",           # te-form + progressive
    "見られる",             # potential/passive
    "書かせていただきます" # causative + humble polite
]

# The loop deliberately avoids the names `text` and `result`: both are
# notebook-level variables set by other cells, and reusing them here would
# silently change what those cells see when re-run afterwards.
for test_input in test_inputs:
    print("=" * 50)
    analysis = analyze_verb_conjugation(test_input)
    print(f"Input: {analysis['input']}")
    print(f"Dictionary form: {analysis['dictionary_form']}")
    print("Conjugation chain:")
    for c in analysis["conjugation_chain"]:
        print(f"  - {c['surface']} ({c['type']} - {c['form']})")


In [None]:
import fugashi
import unidic

# Quote the dictionary directory so that an install path containing spaces is
# still handed to MeCab as a single -d argument.
tagger = fugashi.Tagger(f'-d "{unidic.DICDIR}"')
# NOTE: this rebinds the notebook-level `text`; cells run after this one
# analyze this phrase.
text = "食べさせられました"

words = list(tagger(text))

# Lemma of the first token, used as the dictionary form of the whole phrase.
# None when the tagger produced no tokens (avoids an IndexError on empty input).
dictionary_form = words[0].feature.lemma if words else None

# Map auxiliary lemmas to the grammatical function they contribute.
# せる / れる are the counterparts of させる / られる that attach to godan verbs
# (e.g. 書か+せる, 読ま+れる); without them those chains get no function label.
AUX_MEANINGS = {
    'させる': 'causative',
    'せる': 'causative',
    'られる': 'passive',
    'れる': 'passive',
    'ます': 'polite',
    'た': 'past',
    'ない': 'negative',
    'たい': 'desire',
    'う': 'volitional',
}

# Walk the tokens left to right, growing the cumulative surface string and
# recording one entry per token.
chain = []
conjugated = ""

for position, token in enumerate(words):
    token_feature = token.feature
    token_lemma = token_feature.lemma

    conjugated = conjugated + token.surface

    # Only tokens after the first (the main verb) are looked up as auxiliaries;
    # anything else gets an empty function label.
    if position > 0:
        label = AUX_MEANINGS.get(token_lemma, "")
    else:
        label = ""

    entry = {
        'conjugated': conjugated,
        'surface': token.surface,
        'lemma': token_lemma,
        'cType': token_feature.cType,
        'cForm': token_feature.cForm,
        'function': label,
    }
    chain.append(entry)

# Report: the input, its dictionary form, then one line per step of the chain.
print(f"Input: {text}")
print(f"Dictionary form: {dictionary_form}")
print("Conjugation chain with cumulative forms:")

for step in chain:
    details = f"{step['lemma']} - {step['cType']} - {step['cForm']}"
    # The function label is appended only when the step has one.
    if step['function']:
        suffix = f" - {step['function']}"
    else:
        suffix = ""
    print(f"  - {step['conjugated']} ({details}){suffix}")


# spaCy

In [None]:
import spacy
import pykakasi
# Load the small Japanese model with the NER and parser components switched off.
disabled_components = ['ner', 'parser']
japanese_nlp = spacy.load("ja_core_news_sm", disable=disabled_components)

# japanese_nlp.add_pipe("custom_sentence_splitter", first=True)
# The sentencizer is added so the document can be iterated sentence by sentence.
japanese_nlp.add_pipe('sentencizer')

# pykakasi converter, used to turn token text into hiragana readings.
hiraganaConverter = pykakasi.kakasi()

doc = japanese_nlp(text)

In [None]:
def _to_hiragana(converter, surface):
    """Return `surface` in hiragana: the 'hira' field of every chunk that
    `converter.convert` returns, joined into one string."""
    return ''.join(chunk['hira'] for chunk in converter.convert(surface))


# For every token print a separator, the token, its lemma, and the hiragana
# readings of both. (The former always-true `True or language == ...` guard,
# which referenced an undefined name, and the unused sentence index are gone.)
for sentence in doc.sents:
    for token in sentence:
        print("---")
        print(token)
        print(token.lemma_)

        reading = _to_hiragana(hiraganaConverter, token.text)
        lemma_reading = _to_hiragana(hiraganaConverter, token.lemma_)

        print(reading)
        print(lemma_reading)

# GiNZA via spaCy

In [None]:
import spacy
# NOTE: rebinds the notebook-level `nlp` and `doc` to the GiNZA pipeline and its parse.
nlp = spacy.load('ja_ginza')
doc = nlp(text)

# available token attributes https://spacy.io/api/token#attributes
for sent in doc.sents:
    for token in sent:
        # One space-separated line per token, fields in this order.
        columns = (
            token.i,
            token.orth_,
            token.lemma_,
            token.norm_,
            token.text,
            token.head,
            token.morph.get("Reading"),
            token.pos_,
            token.morph.get("Inflection"),
            token.tag_,
            token.dep_,
            token.head.i,
        )
        print(*columns)
    # Sentence terminator line.
    print('EOS')

In [None]:
# Show what the currently loaded pipeline consists of, one entry per line.
for pipe_entry in nlp.pipeline:
    print(pipe_entry)

In [None]:
# Visualize the entities of the current `doc` with displaCy's "ent" style.
spacy.displacy.render(docs=doc, style="ent")

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
from spacy.matcher import PhraseMatcher
# NOTE: rebinds the notebook-level `nlp` to the small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Show what the English pipeline consists of, one entry per line.
for pipe_entry in nlp.pipeline:
    print(pipe_entry)
    
def on_match(matcher, doc, id, matches):
    """Match callback: print the matches it is handed.

    `matcher`, `doc` and `id` are accepted but not used.
    """
    print(f"Matched! {matches!s}")

matcher = PhraseMatcher(nlp.vocab)

# Phrase patterns per match key, in registration order.
phrase_patterns = {
    "OBAMA": ["Barack Obama"],
    "OBAMAU": ["Barack Obama urges"],
    "FAREWELL": ["emotional farewell", "emotional farewells"],
}

# The patterns are matched on token text only, so tokenizing them with
# nlp.make_doc is enough; running the full pipeline on each pattern is wasted work.
for key, phrases in phrase_patterns.items():
    matcher.add(key, [nlp.make_doc(phrase) for phrase in phrases], on_match=on_match)

doc = nlp("Barack Obama lifts America one last time in emotional farewell")
matches = matcher(doc)

# Print the text of every matched span.
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)