### Test of janome  

In [1]:
# Sample sentence used as input by every cell below; it mixes Japanese text
# with the Latin-script word "Guest".
str1 = "隣のGuestは良く柿食うGuestだ"

#### Basic usage  

In [2]:
from janome.tokenizer import Tokenizer

# Tokenizer constructed with no arguments; `t` is reused by the next cell.
t = Tokenizer()

# Printing a token shows its surface form, a tab, then comma-separated
# analysis fields (see the output below).
morphemes = t.tokenize(str1)
for morpheme in morphemes:
    print(morpheme)

隣	名詞,一般,*,*,*,*,隣,トナリ,トナリ
の	助詞,連体化,*,*,*,*,の,ノ,ノ
Guest	名詞,一般,*,*,*,*,Guest,*,*
は	助詞,係助詞,*,*,*,*,は,ハ,ワ
良く	形容詞,自立,*,*,形容詞・アウオ段,連用テ接続,良い,ヨク,ヨク
柿	名詞,一般,*,*,*,*,柿,カキ,カキ
食う	動詞,自立,*,*,五段・ワ行促音便,基本形,食う,クウ,クウ
Guest	名詞,一般,*,*,*,*,Guest,*,*
だ	助動詞,*,*,*,特殊・ダ,基本形,だ,ダ,ダ


#### If you want to get a list or a space-separated string  

In [3]:
# wakati=True yields plain surface strings instead of Token objects.
# Materialise the result with list(): newer janome releases (>= 0.4.0) return
# a generator from tokenize(), which would print as "<generator object ...>"
# and be exhausted after the join below. list() is a no-op copy on older
# releases that already return a list, so this works on both.
list_wakati = list(t.tokenize(str1, wakati=True))
print(list_wakati)

# Space-separated form of the same tokens.
str_wakati = ' '.join(list_wakati)
print(str_wakati)

['隣', 'の', 'Guest', 'は', '良く', '柿', '食う', 'Guest', 'だ']
隣 の Guest は 良く 柿 食う Guest だ


#### Add filters for preprocessing  

In [4]:
from janome.analyzer import Analyzer
from janome.tokenfilter import POSStopFilter, LowerCaseFilter

# Preprocessing filters handed to the Analyzer:
#   - POSStopFilter drops 助詞 (postpositional particles) and 助動詞 (auxiliary verbs)
#   - LowerCaseFilter converts the remaining tokens to lower case
token_filters = [
    POSStopFilter(['助詞', '助動詞']),
    LowerCaseFilter(),
]

# Wrap a tokenizer together with the filters above.
tokenizer = Tokenizer()
analyzer = Analyzer(tokenizer=tokenizer, token_filters=token_filters)

# Print every token that survives the filters.
filtered_tokens = analyzer.analyze(str1)
for token in filtered_tokens:
    print(token)

隣	名詞,一般,*,*,*,*,隣,トナリ,トナリ
guest	名詞,一般,*,*,*,*,guest,*,*
良く	形容詞,自立,*,*,形容詞・アウオ段,連用テ接続,良い,ヨク,ヨク
柿	名詞,一般,*,*,*,*,柿,カキ,カキ
食う	動詞,自立,*,*,五段・ワ行促音便,基本形,食う,クウ,クウ
guest	名詞,一般,*,*,*,*,guest,*,*


#### If you want to get the base form (`base_form`) of each word  

In [5]:
# Base form (`base_form`) of each token produced by the filtered analyzer.
list_wakati = list(map(lambda tok: tok.base_form, analyzer.analyze(str1)))
# Same words joined into one space-separated string.
str_wakati = ' '.join(list_wakati)

print(list_wakati)
print(str_wakati)

['隣', 'guest', '良い', '柿', '食う', 'guest']
隣 guest 良い 柿 食う guest


#### If you want to get a list or a space-separated string of the filtered surface forms  

In [6]:
# Surface form (`surface`) of each token produced by the filtered analyzer.
list_wakati = list(map(lambda tok: tok.surface, analyzer.analyze(str1)))
# Same words joined into one space-separated string.
str_wakati = ' '.join(list_wakati)

print(list_wakati)
print(str_wakati)

['隣', 'guest', '良く', '柿', '食う', 'guest']
隣 guest 良く 柿 食う guest
