In [2]:
import spacy
from sudachipy import dictionary, tokenizer
from spacy.tokens import Doc

# 1. Standard pipeline (for reliable POS/DEP):
std_nlp = spacy.load("ja_core_news_sm")

# 2. Alternate segmentation pipeline (no tagging):
sudachi = dictionary.Dictionary().create()
MODE = tokenizer.Tokenizer.SplitMode.A  # A=short, C=long

alt_nlp = spacy.blank("ja")

def sudachi_tokenizer_func(text):
    ms = sudachi.tokenize(text, MODE)
    words = [m.surface() for m in ms]
    spaces = [False]*len(words)
    return Doc(alt_nlp.vocab, words=words, spaces=spaces)

alt_nlp.tokenizer = sudachi_tokenizer_func

text = "彼は昨日本を買って読み始めました。"
doc_std = std_nlp(text)
doc_alt = alt_nlp(text)

print("STD tokens:", [t.text for t in doc_std])
print("ALT tokens (mode A):", [t.text for t in doc_alt])
print("STD POS:", [t.pos_ for t in doc_std])

STD tokens: ['彼', 'は', '昨', '日本', 'を', '買っ', 'て', '読み', '始め', 'まし', 'た', '。']
ALT tokens (mode A): ['彼', 'は', '昨', '日本', 'を', '買っ', 'て', '読み', '始め', 'まし', 'た', '。']
STD POS: ['PRON', 'ADP', 'NOUN', 'PROPN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'VERB', 'AUX', 'AUX', 'PUNCT']


In [1]:
import spacy
from sudachipy import tokenizer, dictionary
from spacy.tokens import Doc

sudachi = dictionary.Dictionary().create()
MODE = tokenizer.Tokenizer.SplitMode.C  # change to A for short proxy

nlp = spacy.blank("ja")

def sudachi_tokenizer_func(text):
    sudachi_tokens = sudachi.tokenize(text, MODE)
    words = [m.surface() for m in sudachi_tokens]
    spaces = [False]*len(words)
    return Doc(nlp.vocab, words=words, spaces=spaces)

# nlp.tokenizer = sudachi_tokenizer_func

doc = nlp("今年の干支は庚子です。東京オリンピックたのしみだなあ。")
print([t.text for t in doc])
print([(t.norm_, t.pos_, t.tag_) for t in doc])

['今年', 'の', '干支', 'は', '庚子', 'です', '。', '東京', 'オリンピック', 'たのし', 'み', 'だ', 'なあ', '。']
[('今年', 'NOUN', '名詞-普通名詞-副詞可能'), ('の', 'ADP', '助詞-格助詞'), ('干支', 'NOUN', '名詞-普通名詞-一般'), ('は', 'ADP', '助詞-係助詞'), ('庚子', 'NOUN', '名詞-普通名詞-一般'), ('です', 'AUX', '助動詞'), ('。', 'PUNCT', '補助記号-句点'), ('東京', 'PROPN', '名詞-固有名詞-地名-一般'), ('オリンピック', 'NOUN', '名詞-普通名詞-一般'), ('楽しい', 'ADJ', '形容詞-一般'), ('味', 'PART', '接尾辞-名詞的-一般'), ('だ', 'AUX', '助動詞'), ('な', 'PART', '助詞-終助詞'), ('。', 'PUNCT', '補助記号-句点')]


In [1]:
import spacy
nlp = spacy.load("ja_ginza")
doc = nlp("ひごろ 日ごろ 日頃 呑み 呑んで 飲んで 書きあらわす")
for tok in doc:
    print(tok.text, tok.lemma_)  # lemma_ ~ Sudachi dictionary form

  relative_imports = re.findall("^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE)
  relative_imports += re.findall("^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE)
  imports = re.findall("^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE)
  imports += re.findall("^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE)


ひごろ ひごろ
日ごろ 日ごろ
日頃 日頃
呑み 呑む
呑ん 呑む
で で
飲ん 飲む
で で
書きあらわす 書きあらわす


In [15]:
import spacy
from spacy.tokens import Doc
from fugashi import Tagger

# Initialize UniDic Tagger
tagger = Tagger()   # fugashi auto-loads UniDic if installed

nlp = spacy.blank("ja")        # blank Japanese pipeline

# Register custom token extensions
from spacy.tokens import Token
Token.set_extension("unidic_lemma", default=None, force=True)
Token.set_extension("unidic_reading", default=None, force=True)
Token.set_extension("unidic_pos", default=None, force=True)
Token.set_extension("unidic_feats", default=None, force=True)

def mecab_tokenizer(text):
    words = []
    spaces = []
    lemmas = []
    analyses = tagger(text)
    for m in analyses:
        surface = m.surface
        words.append(surface)
        spaces.append(False)  # Japanese generally no spaces
    doc = Doc(nlp.vocab, words=words, spaces=spaces)
    # Attach UniDic info
    for tok, m in zip(doc, analyses):
        # m.feature: tuple with UniDic columns. Structure depends on UniDic version.
        # Typical indices (verify!) e.g. lemma at feature[10], reading at feature[9].
        
        feats = m.feature
        # Safer: use fugashi attribute helpers
        tok._.unidic_lemma = m.feature[10]  # or m.feature[10]
        tok._.unidic_reading = m.feature[10]  # unified katakana reading
        tok._.unidic_pos = ",".join(m.feature[:4])  # hierarchical POS tuple
        tok._.unidic_feats = feats
    return doc

nlp.tokenizer = mecab_tokenizer

# (Optionally add your own components after this, e.g. a statistical tagger trained on this segmentation)
doc = nlp("日ごろ ひごろ 日頃 居る いる 書きあらわす")
for t in doc:
    print(t.text, t._.unidic_lemma, t._.unidic_reading, t._.unidic_pos)

日ごろ 日ごろ 日ごろ 名詞,普通名詞,副詞可能,*
ひごろ ひごろ ひごろ 名詞,普通名詞,副詞可能,*
日頃 日頃 日頃 名詞,普通名詞,副詞可能,*
居る 居る 居る 動詞,非自立可能,*,*
いる いる いる 動詞,非自立可能,*,*
書きあらわす 書きあらわす 書きあらわす 動詞,一般,*,*


In [9]:
print(dir(m))

NameError: name 'm' is not defined

In [16]:
import spacy
from fugashi import Tagger

nlp = spacy.load("ja_ginza")
tagger = Tagger()

# Register extension fields (only if not already)
from spacy.tokens import Token
for field in ["unidic_lemma","unidic_reading","unidic_pos"]:
    if not Token.has_extension(field):
        Token.set_extension(field, default=None)

def unicdic_enricher(doc):
    text = doc.text
    # Build a char->token index map (start offsets)
    char2token = {}
    for i, tok in enumerate(doc):
        for pos in range(tok.idx, tok.idx + len(tok.text)):
            char2token.setdefault(pos, i)
    # Collect MeCab tokens with offsets
    cursor = 0
    for m in tagger(text):
        surf = m.surface
        start = text.find(surf, cursor)
        cursor = start + len(surf)
        # Find spaCy token that *starts* here (approx.)
        if start in char2token:
            i = char2token[start]
            # Only annotate if exact surface match (avoid mid-token)
            if doc[i].text.startswith(surf):
                doc[i]._.unidic_lemma = m.dictionary_form
                doc[i]._.unidic_reading = m.reading
                doc[i]._.unidic_pos = ",".join(m.pos)
    return doc

nlp.add_pipe(unicdic_enricher, name="unidic_enricher", last=True)

doc = nlp("日ごろ ひごろ 日頃 居る いる 書きあらわす")
for t in doc:
    print(t.text, t.lemma_, t._.unidic_lemma)

ValueError: [E966] `nlp.add_pipe` now takes the string name of the registered component factory, not a callable component. Expected string, but got <function unicdic_enricher at 0x13ca313a0> (name: 'unidic_enricher').

- If you created your component with `nlp.create_pipe('name')`: remove nlp.create_pipe and call `nlp.add_pipe('name')` instead.

- If you passed in a component like `TextCategorizer()`: call `nlp.add_pipe` with the string name instead, e.g. `nlp.add_pipe('textcat')`.

- If you're using a custom component: Add the decorator `@Language.component` (for function components) or `@Language.factory` (for class components / factories) to your custom component and assign it a name, e.g. `@Language.component('your_name')`. You can then run `nlp.add_pipe('your_name')` to add it to the pipeline.