### Install MeCab + the NEologd dictionary (run this FIRST)

In [1]:
# System packages: aptitude and swig first, then MeCab, its dev headers, the
# UTF-8 IPADIC dictionary and the command-line tools listed on the next line.
!apt install aptitude swig
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
# Python binding for MeCab (imported as `MeCab` in the next cell).
!pip install mecab-python3
# Shallow-clone the NEologd dictionary sources and run its installer script;
# `echo yes` is piped into the installer's stdin.
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -a
# NOTE(review): presumably installed as the dictionary mecab-python3 falls back to — confirm it is needed.
!pip install unidic-lite

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  aptitude-common libcgi-fast-perl libcgi-pm-perl libclass-accessor-perl
  libcwidget3v5 libencode-locale-perl libfcgi-perl libhtml-parser-perl
  libhtml-tagset-perl libhttp-date-perl libhttp-message-perl libio-html-perl
  libio-string-perl liblwp-mediatypes-perl libparse-debianchangelog-perl
  libsigc++-2.0-0v5 libsub-name-perl libtimedate-perl liburi-perl libxapian30
  swig3.0
Suggested packages:
  aptitude-doc-en | aptitude-doc apt-xapian-index debtags tasksel
  libcwidget-dev libdata-dump-perl libhtml-template-perl libxml-simple-perl
  libwww-perl xapian-tools swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  aptitude aptitude-common libcgi-fast-perl libcgi-pm-perl
  libclass-accessor-perl libcwidget3v5 libencode-locale-perl libfcgi-perl
  libhtml-parser-perl libhtml-tagset-perl libhttp

### Initialize the MeCab tagger with the NEologd dictionary (better than MeCab alone)

In [2]:
import MeCab
import subprocess

# Ask MeCab for its dictionary directory and point at the NEologd dictionary
# inside it. check_output raises if `mecab-config` is missing or fails, instead
# of silently producing a bogus path, and .strip() drops the trailing newline
# the command prints so the value handed to `-d` is a clean path.
cmd = ["mecab-config", "--dicdir"]
path = subprocess.check_output(cmd).decode('utf-8').strip() + "/mecab-ipadic-neologd"
m = MeCab.Tagger("-d {0}".format(path))

# To run without NEologd, build the tagger with the default dictionary instead:
# m = MeCab.Tagger()

In [3]:
# Smoke-test the tagger on four sample words; each parse result is printed
# followed by a newline, exactly as four separate print() calls would do.
print(
    m.parse("バオ"),
    m.parse("明治"),
    m.parse("食べる"),
    m.parse("ハノイ"),
    sep="\n",
)

バオ	名詞,固有名詞,人名,一般,*,*,バオ,バオ,バオ
EOS

明治	名詞,固有名詞,一般,*,*,*,明治,メイジ,メイジ
EOS

食べる	動詞,自立,*,*,一段,基本形,食べる,タベル,タベル
EOS

ハノイ	名詞,固有名詞,地域,一般,*,*,ハノイ,ハノイ,ハノイ
EOS



### Mount Google Drive (run AFTER installing MeCab)

In [4]:
# Mount Google Drive at /content/drive so the corpus and the exported JSON
# files live on Drive; force_remount=True remounts even if already mounted.
from google.colab import drive            
drive.mount('/content/drive',  force_remount=True)

Mounted at /content/drive


In [5]:
cd /content/drive/My Drive/test

/content/drive/My Drive/test


### Import from file 

List of files

In [6]:
import glob

# One .txt file per article, one sub-directory level below ./text.
# glob returns matches in arbitrary (filesystem) order; sorting makes the
# processing order - and therefore the first-seen order of the surfaces
# collected below - reproducible between runs.
paths = sorted(glob.glob("./text/*/*.txt"))

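Read every file (one file = one article), skipping the first two lines of each file, and collect the surface forms seen for each reading.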

In [9]:
def _addSurface(table, reading, surface):
    """Record `surface` under `reading` in `table`, keeping first-seen order and no duplicates."""
    surfaces = table.setdefault(reading, [])
    if surface not in surfaces:
        surfaces.append(surface)


# reading (feature field 8) -> list of distinct surface forms
homonymDict = {}      # nouns (except person names) plus verbs/adjectives
homonymDictNoun = {}  # nouns (except person names) only
for file in paths:
    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the corpus files are UTF-8 - confirm.
    with open(file, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            if idx < 2:
                # The first two lines of every file are skipped
                # (presumably per-article header lines - confirm).
                continue
            node = m.parseToNode(line)
            while node:
                features = node.feature.split(",")
                part = features[0]
                # Only entries with a full 9-field feature and a known base form
                # (field 6 != "*") are used; this avoids new/unknown words.
                if len(features) >= 9 and features[6] != "*":
                    if part in ["名詞", "動詞", "形容詞"]:
                        word = node.surface
                        read = features[8]
                        if part == "名詞" and features[2] != "人名":
                            _addSurface(homonymDict, read, word)
                            _addSurface(homonymDictNoun, read, word)
                        elif features[5] in ["連用形", "基本形"]:
                            # Verbs and adjectives: only these two conjugated forms.
                            _addSurface(homonymDict, read, word)
                node = node.next

In [10]:
# Display the full mapping: reading -> list of surface forms collected above.
homonymDict

{'ドクジョ': ['独女'],
 'ハハ': ['母'],
 'タチ': ['たち', '立ち', '達', '経ち', '断ち', '起ち', '建ち'],
 'サケビ': ['叫び', '叫'],
 'カンレキ': ['還暦'],
 'スギル': ['過ぎる', 'すぎる'],
 'ドーソーカイ': ['同窓会'],
 'ケンコー': ['健康'],
 'マゴ': ['孫'],
 'ワダイ': ['話題'],
 'モリアガル': ['盛り上がる'],
 'サイキン': ['最近', '細菌'],
 'ケッセキ': ['欠席'],
 'シ': ['し',
  '死',
  '士',
  '視',
  '師',
  '市',
  '誌',
  '詩',
  'シ',
  '資',
  '史',
  '紙',
  '子',
  '氏',
  '施',
  '思'],
 'イル': ['いる', '要る', '居る', '射る', 'イル'],
 'ノ': ['の', '野'],
 'ロクジュウサンサイ': ['63歳'],
 'ヒサシブリ': ['久しぶり', '久し振り'],
 'アツマル': ['集まる'],
 'ミンナ': ['みんな'],
 'キンキョー': ['近況'],
 'ホーコク': ['報告'],
 'スル': ['する', 'スル'],
 'スーネンマエ': ['数年前'],
 'ムスメ': ['娘', '娘。'],
 'コト': ['こと', '事', '言', 'コト', '異', '古都'],
 'ハナシ': ['話し', '話', 'ハナシ', '離し', '放し'],
 'ラレ': ['られ'],
 'ウチ': ['うち', '討ち', '内', 'ウチ', '打ち', '撃ち'],
 'ヒトリムスメ': ['ひとり娘', '一人娘'],
 'ライネン': ['来年'],
 'ヨンジッサイ': ['40歳'],
 'ケッコン': ['結婚'],
 'ケハイ': ['気配'],
 'ナイシ': ['ないし'],
 'オトコ': ['男', 'オトコ', 'おとこ', '漢（おとこ）'],
 'ドーキューセイ': ['同級生'],
 'ケイタイ': ['携帯', '形態', 'ケイタイ', '形体'],
 'ジブン': ['自分', '時分

### Remove words written in hiragana, katakana, digits, or romaji

In [11]:
def checkHira(c):
    """Return True when the single character `c` has a code point in 0x3040-0x309F (inclusive)."""
    first, last = 0x3040, 0x309F
    return first <= ord(c) <= last

def checkKata(c):
    """Return True when the single character `c` has a code point in 0x30A0-0x30FF (inclusive)."""
    first, last = 0x30A0, 0x30FF
    return first <= ord(c) <= last

def checkHalfFullWidth(c):
    """Return True when the single character `c` has a code point in 0xFF01-0xFFEE (inclusive)."""
    first, last = 0xFF01, 0xFFEE
    return first <= ord(c) <= last

In [12]:
# Sanity check of the three classifiers on sample characters, one result per line.
print(
    checkHira("を"),
    checkHira("ヲ"),
    checkKata("を"),
    checkKata("ヲ"),
    checkHalfFullWidth("１"),
    sep="\n",
)

True
False
False
True
True


In [13]:
def _keepsSurface(item):
    """Return True when the surface form `item` passes the onlyKanji filter."""
    if not item:
        # An empty surface has no first character to classify; drop it
        # instead of failing on item[0].
        return False
    if checkHira(item[0]) or checkKata(item[0]):
        return False
    # Reject the word if any character is in the range checked by
    # checkHalfFullWidth or has a code point below 256 (ASCII/Latin-1).
    return not any(checkHalfFullWidth(char) or ord(char) < 256 for char in item)


def onlyKanji(dic):
    """Filter a reading -> surface-forms mapping, returning a new dict.

    A surface form is dropped when it is empty, when its FIRST character is
    hiragana or katakana (per checkHira / checkKata), or when ANY of its
    characters is matched by checkHalfFullWidth or has a code point below 256.
    Only the first character is tested for kana, so a word that starts with
    another character and continues in kana is kept.

    Args:
        dic: dict mapping a reading to a list of surface-form strings.

    Returns:
        A new dict with the surviving surface forms in their original order;
        readings left with no surface form are omitted. `dic` is not modified.
    """
    newDic = {}
    for key, words in dic.items():
        newItems = [item for item in words if _keepsSurface(item)]
        if newItems:
            newDic[key] = newItems
    return newDic

In [14]:
# Apply the onlyKanji filter to the full dictionary and display the result.
homonymDictKanji = onlyKanji(homonymDict)
homonymDictKanji

{'ドクジョ': ['独女'],
 'ハハ': ['母'],
 'タチ': ['立ち', '達', '経ち', '断ち', '起ち', '建ち'],
 'サケビ': ['叫び', '叫'],
 'カンレキ': ['還暦'],
 'スギル': ['過ぎる'],
 'ドーソーカイ': ['同窓会'],
 'ケンコー': ['健康'],
 'マゴ': ['孫'],
 'ワダイ': ['話題'],
 'モリアガル': ['盛り上がる'],
 'サイキン': ['最近', '細菌'],
 'ケッセキ': ['欠席'],
 'シ': ['死', '士', '視', '師', '市', '誌', '詩', '資', '史', '紙', '子', '氏', '施', '思'],
 'イル': ['要る', '居る', '射る'],
 'ノ': ['野'],
 'ヒサシブリ': ['久しぶり', '久し振り'],
 'アツマル': ['集まる'],
 'キンキョー': ['近況'],
 'ホーコク': ['報告'],
 'スーネンマエ': ['数年前'],
 'ムスメ': ['娘', '娘。'],
 'コト': ['事', '言', '異', '古都'],
 'ハナシ': ['話し', '話', '離し', '放し'],
 'ウチ': ['討ち', '内', '打ち', '撃ち'],
 'ヒトリムスメ': ['一人娘'],
 'ライネン': ['来年'],
 'ケッコン': ['結婚'],
 'ケハイ': ['気配'],
 'オトコ': ['男'],
 'ドーキューセイ': ['同級生'],
 'ケイタイ': ['携帯', '形態', '形体'],
 'ジブン': ['自分', '時分'],
 'シャシン': ['写真'],
 'ミセル': ['見せる', '魅せる'],
 'モノ': ['物', '者'],
 'キョ': ['居'],
 'コトシ': ['今年'],
 'ツーチ': ['通知'],
 'ダシ': ['出し', '出汁'],
 'ジマン': ['自慢'],
 'ユージン': ['友人', '有人'],
 'ツライ': ['辛い'],
 'イウ': ['言う', '云う'],
 'セダイ': ['世代'],
 'フタリ': ['二人'],
 'シマイ': ['姉妹', 

In [15]:
# Apply the onlyKanji filter to the noun-only dictionary and display the result.
homonymDictNounKanji = onlyKanji(homonymDictNoun)
homonymDictNounKanji

{'ドクジョ': ['独女'],
 'ハハ': ['母'],
 'タチ': ['立ち', '達'],
 'サケビ': ['叫び', '叫'],
 'カンレキ': ['還暦'],
 'ドーソーカイ': ['同窓会'],
 'ケンコー': ['健康'],
 'マゴ': ['孫'],
 'ワダイ': ['話題'],
 'サイキン': ['最近', '細菌'],
 'ケッセキ': ['欠席'],
 'ノ': ['野'],
 'ヒサシブリ': ['久しぶり', '久し振り'],
 'キンキョー': ['近況'],
 'ホーコク': ['報告'],
 'スーネンマエ': ['数年前'],
 'ムスメ': ['娘', '娘。'],
 'コト': ['事', '言', '異', '古都'],
 'ウチ': ['内', '打ち'],
 'ヒトリムスメ': ['一人娘'],
 'ライネン': ['来年'],
 'ケッコン': ['結婚'],
 'ケハイ': ['気配'],
 'オトコ': ['男'],
 'ドーキューセイ': ['同級生'],
 'ケイタイ': ['携帯', '形態', '形体'],
 'ジブン': ['自分', '時分'],
 'シャシン': ['写真'],
 'モノ': ['物', '者'],
 'キョ': ['居'],
 'コトシ': ['今年'],
 'ツーチ': ['通知'],
 'ジマン': ['自慢'],
 'ユージン': ['友人', '有人'],
 'セダイ': ['世代'],
 'フタリ': ['二人'],
 'シマイ': ['姉妹', '仕舞'],
 'アネ': ['姉'],
 'ホー': ['方', '頬', '法', '峰', '報', '砲'],
 'イモート': ['妹'],
 'サキ': ['先', '崎'],
 'カイシャイン': ['会社員'],
 'ショクヒ': ['食費'],
 'イエ': ['家'],
 'ゴト': ['事', '毎'],
 'ユーキューキューカ': ['有給休暇'],
 'カイガイ': ['海外'],
 'アソビ': ['遊び'],
 'ソー': ['層', '惣', '創', '葬', '相', '荘', '草', '槽', '僧', '宋'],
 'トシ': ['年', '歳', '都市'],
 'コロ':

### Export

Full

In [16]:
import json

# Save both unfiltered dictionaries. ensure_ascii=False writes the Japanese
# text literally rather than as \uXXXX escapes, so the files are opened as
# UTF-8 explicitly instead of relying on the platform's default encoding.
with open('homonymDict.json', 'w', encoding='utf-8') as outfile:
    json.dump(homonymDict, outfile, ensure_ascii=False)
with open('homonymDictNoun.json', 'w', encoding='utf-8') as outfile:
    json.dump(homonymDictNoun, outfile, ensure_ascii=False)

Filtered versions (words in katakana, hiragana, digits, or romaji removed)

In [17]:
# Save the filtered dictionaries. The files are written as UTF-8 explicitly
# (ensure_ascii=False emits the Japanese text literally), matching the reload
# cell, which reads homonymDictKanji.json with encoding='utf-8'.
with open('homonymDictKanji.json', 'w', encoding='utf-8') as outfile:
    json.dump(homonymDictKanji, outfile, ensure_ascii=False)
with open('homonymDictNounKanji.json', 'w', encoding='utf-8') as outfile:
    json.dump(homonymDictNounKanji, outfile, ensure_ascii=False)

Number of readings in each dictionary

In [18]:
# Number of readings (keys) in each of the four dictionaries, one per line.
print(
    len(homonymDict),
    len(homonymDictNoun),
    len(homonymDictKanji),
    len(homonymDictNounKanji),
    sep="\n",
)

58925
54716
32470
28845


Reload

In [None]:
import json

# Load the filtered dictionary back from the JSON file written earlier.
with open('homonymDictKanji.json', mode='r', encoding='utf-8') as fh:
    data = json.load(fh)

In [None]:
# Display the dictionary reloaded from homonymDictKanji.json.
data

{'ドクジョ': ['独女'],
 'ハハ': ['母'],
 'タチ': ['立ち', '達', '経ち', '断ち', '起ち', '建ち'],
 'サケビ': ['叫び', '叫'],
 'カンレキ': ['還暦'],
 'スギル': ['過ぎる'],
 'ドーソーカイ': ['同窓会'],
 'ケンコー': ['健康'],
 'マゴ': ['孫'],
 'ワダイ': ['話題'],
 'モリアガル': ['盛り上がる'],
 'サイキン': ['最近', '細菌'],
 'ケッセキ': ['欠席'],
 'シ': ['死', '士', '視', '師', '氏', '市', '誌', '詩', '資', '史', '紙', '子', '施', '思'],
 'イル': ['要る', '居る', '射る'],
 'ノ': ['野'],
 'タエ': ['多恵', '耐え', '絶え', '多江'],
 'サン': ['産', '三', '讃', '酸', '参', '山', '賛'],
 'ヒサシブリ': ['久しぶり', '久し振り'],
 'アツマル': ['集まる'],
 'キンキョー': ['近況'],
 'ホーコク': ['報告'],
 'スーネンマエ': ['数年前'],
 'ムスメ': ['娘', '娘。'],
 'コト': ['事', '言', '異', '古都'],
 'ハナシ': ['話し', '話', '離し', '放し'],
 'ウチ': ['討ち', '内', '打ち', '撃ち'],
 'ヒトリムスメ': ['一人娘'],
 'ライネン': ['来年'],
 'ケッコン': ['結婚'],
 'ケハイ': ['気配'],
 'オトコ': ['男'],
 'ドーキューセイ': ['同級生'],
 'ケイタイ': ['携帯', '形態', '形体'],
 'ジブン': ['自分', '時分'],
 'シャシン': ['写真'],
 'ミセル': ['見せる', '魅せる'],
 'モノ': ['物', '者'],
 'キョ': ['居'],
 'コトシ': ['今年'],
 'ツーチ': ['通知'],
 'ダシ': ['出し', '出汁'],
 'ジマン': ['自慢'],
 'ユージン': ['友人', '有人'],
 'ツライ':

### Remove personal names (already removed on import, so there is no need to run this)

In [None]:
def _isPersonName(word):
    """Return True when the first token MeCab produces for `word` has "人名" in feature field 2."""
    # parse() output is one "surface<TAB>feature,feature,..." line per token,
    # followed by an EOS line. Only the first token's features are inspected.
    firstLine = m.parse(word).split("\n", 1)[0]
    # Split the surface off at the tab first, so a comma inside the surface
    # cannot shift the feature columns.
    features = firstLine.partition("\t")[2].split(",")
    # Output with fewer than three feature fields (e.g. only "EOS" for an
    # empty word) is treated as "not a name" instead of raising IndexError.
    return len(features) > 2 and features[2] == "人名"


def removeNames(dic):
    """Return a copy of a reading -> surface-forms mapping without person names.

    Each surface form is re-parsed with the module-level tagger `m`; it is
    dropped when its first token is tagged "人名" in feature field 2.

    Args:
        dic: dict mapping a reading to a list of surface-form strings.

    Returns:
        A new dict with the surviving surface forms in their original order;
        readings left with no surface form are omitted. `dic` is not modified.
    """
    newDic = {}
    for key, words in dic.items():
        newItems = [item for item in words if not _isPersonName(item)]
        if newItems:
            newDic[key] = newItems
    return newDic

In [None]:
# Strip person names from the reloaded dictionary, then spot-check one reading.
homonymDictKanjiNotName = removeNames(data)
homonymDictKanjiNotName["ケイコ"]

['稽古']

In [None]:
# Save the name-free dictionary. Written as UTF-8 explicitly because
# ensure_ascii=False emits the Japanese text literally and the platform's
# default encoding may not be UTF-8.
with open('homonymDictKanjiNotName.json', 'w', encoding='utf-8') as outfile:
    json.dump(homonymDictKanjiNotName, outfile, ensure_ascii=False)