In [2]:
import spacy
from sudachipy import dictionary, tokenizer
from spacy.tokens import Doc

# 1. Standard pipeline (for reliable POS/DEP):
#    the installed "ja_core_news_sm" package, used here as the reference analysis.
std_nlp = spacy.load("ja_core_news_sm")

# 2. Alternate segmentation pipeline (no tagging):
#    a standalone SudachiPy tokenizer, created from a Dictionary constructed
#    with no arguments, plus the split mode handed to it on every call.
sudachi = dictionary.Dictionary().create()
MODE = tokenizer.Tokenizer.SplitMode.A  # A=short, C=long

# Blank Japanese pipeline; sudachi_tokenizer_func builds its Docs on this
# pipeline's vocab and is installed as its tokenizer further down.
alt_nlp = spacy.blank("ja")

def sudachi_tokenizer_func(text, mode=MODE):
    """Segment ``text`` with SudachiPy and wrap the surfaces in a spaCy ``Doc``.

    Meant to be assigned to ``alt_nlp.tokenizer``. spaCy invokes a tokenizer
    with the raw text only, so ``mode`` normally keeps its default.

    Args:
        text: The string to segment.
        mode: SudachiPy split mode passed to ``sudachi.tokenize``. The default
            is whatever ``MODE`` holds at the moment this ``def`` runs, so a
            later rebinding of the module-level ``MODE`` (another cell of this
            notebook assigns it again) cannot silently change how a pipeline
            that already uses this function segments text.

    Returns:
        A ``Doc`` built on ``alt_nlp.vocab`` with one token per morpheme
        surface, each flagged as not followed by a space. Only ``words`` and
        ``spaces`` are supplied, so no POS/tag/dependency values are set here.
    """
    morphemes = sudachi.tokenize(text, mode)
    words = [m.surface() for m in morphemes]
    # One flag per token: none of them is followed by whitespace.
    spaces = [False] * len(words)
    return Doc(alt_nlp.vocab, words=words, spaces=spaces)

# Replace the blank pipeline's tokenizer with the SudachiPy-backed callable.
alt_nlp.tokenizer = sudachi_tokenizer_func

# Feed the same sentence to both pipelines (standard first, then alternate).
text = "彼は昨日本を買って読み始めました。"
doc_std, doc_alt = std_nlp(text), alt_nlp(text)

# Show each pipeline's segmentation, then the standard pipeline's coarse POS.
# In the recorded run the two token lists came out identical for this sentence.
print("STD tokens:", [token.text for token in doc_std])
print("ALT tokens (mode A):", [token.text for token in doc_alt])
print("STD POS:", [token.pos_ for token in doc_std])

STD tokens: ['彼', 'は', '昨', '日本', 'を', '買っ', 'て', '読み', '始め', 'まし', 'た', '。']
ALT tokens (mode A): ['彼', 'は', '昨', '日本', 'を', '買っ', 'て', '読み', '始め', 'まし', 'た', '。']
STD POS: ['PRON', 'ADP', 'NOUN', 'PROPN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'VERB', 'AUX', 'AUX', 'PUNCT']


In [6]:
import spacy
from sudachipy import tokenizer, dictionary
from spacy.tokens import Doc

# Standalone SudachiPy tokenizer; in this cell only sudachi_tokenizer_func
# uses it. Note that `sudachi` and `MODE` were also assigned in an earlier
# cell, so running this cell rebinds both module-level names.
sudachi = dictionary.Dictionary().create()
# Split mode read by sudachi_tokenizer_func.
# NOTE(review): the line that would install that function as nlp.tokenizer is
# commented out below, so as written MODE has no effect on what nlp(...)
# returns; editing it only matters once that line is re-enabled.
MODE = tokenizer.Tokenizer.SplitMode.C  # change to A for short proxy

# Blank Japanese pipeline; it keeps the tokenizer spacy.blank("ja") gives it.
nlp = spacy.blank("ja")

def sudachi_tokenizer_func(text):
    """Build a spaCy ``Doc`` from SudachiPy's segmentation of ``text``.

    The text is split with the module-level ``sudachi`` tokenizer using the
    current module-level ``MODE``; each morpheme surface becomes one token on
    ``nlp.vocab``, and every token is marked as not followed by a space.

    Note: this function only influences ``nlp`` when it is assigned to
    ``nlp.tokenizer``; in this cell that assignment is commented out.
    """
    surfaces = []
    for morpheme in sudachi.tokenize(text, MODE):
        surfaces.append(morpheme.surface())
    no_trailing_space = [False for _ in surfaces]
    return Doc(nlp.vocab, words=surfaces, spaces=no_trailing_space)

# Custom tokenizer deliberately left disabled: with this line commented out,
# nlp keeps the tokenizer that spacy.blank("ja") created, and the recorded
# output below shows pos_/tag_ filled in for every token.
# nlp.tokenizer = sudachi_tokenizer_func

doc = nlp("夏が過ぎると御香典の時間。")
# Surface forms first, then (norm, coarse POS, fine-grained tag) per token.
print([token.text for token in doc])
print([(token.norm_, token.pos_, token.tag_) for token in doc])

['夏', 'が', '過ぎる', 'と', '御', '香典', 'の', '時間', '。']
[('夏', 'NOUN', '名詞-普通名詞-副詞可能'), ('が', 'ADP', '助詞-格助詞'), ('過ぎる', 'VERB', '動詞-非自立可能'), ('と', 'SCONJ', '助詞-接続助詞'), ('御', 'NOUN', '接頭辞'), ('香典', 'NOUN', '名詞-普通名詞-一般'), ('の', 'ADP', '助詞-格助詞'), ('時間', 'NOUN', '名詞-普通名詞-助数詞可能'), ('。', 'PUNCT', '補助記号-句点')]
