In [None]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import pandas as pd

In [None]:
def get_page_source(url, timeout=30):
    """Download ``url`` and return it parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises. Without a timeout a stalled connection would block
            the calling scraping loop indefinitely.

    Returns:
        BeautifulSoup: the response body, decoded as UTF-8 and parsed with
        the built-in ``html.parser``.
    """
    response = requests.get(
        url,
        headers={
            'User-Agent': 'Popular browser\'s user-agent',
        },
        timeout=timeout,
    )
    # Decode the body as UTF-8 explicitly rather than using whatever encoding
    # requests picked for the response.
    response.encoding = 'utf-8'

    return BeautifulSoup(response.text, 'html.parser')

def get_word(a_element):
    """Return the headword text found inside ``a_element``.

    Looks up every ``<span class="xlarge text-normal me-4">`` below the
    element and returns the text of the last one, or ``""`` when there is no
    such span.
    """
    matches = a_element.findAll('span', attrs={'class': 'xlarge text-normal me-4'})
    if not matches:
        return ""
    # When several spans match, the last one wins.
    return matches[-1].text

def get_romaji(a_element):
    """Return the romanised reading found inside ``a_element``.

    Looks up every ``<i class="text-muted xsmall">`` below the element and
    returns the text of the last one, or ``""`` when none is present.
    """
    matches = a_element.findAll('i', attrs={'class': 'text-muted xsmall'})
    if not matches:
        return ""
    # When several tags match, the last one wins.
    return matches[-1].text

def get_explanation(a_element):
    """Join the text of every ``<li>`` below ``a_element`` with ``"; "``.

    Returns an empty string when the element contains no list items.
    """
    return "; ".join(item.text for item in a_element.findAll('li'))

def get_properties(a_element):
    """Join the text of every ``<span class="badge">`` below ``a_element``.

    The badge texts are concatenated with ``"; "``; an element without badges
    yields an empty string.
    """
    badges = a_element.findAll('span', attrs={'class': 'badge'})
    return "; ".join(badge.text for badge in badges)

def get_project_word_list(sp):
    """Collect the entry links from a parsed word-list page.

    Every ``<a class="three-col-url">`` whose text splits into exactly three
    parts on ``"・"`` contributes one ``[href, second part, third part]``
    list; links with any other number of parts are ignored.

    Args:
        sp: Parsed page to search.

    Returns:
        list[list[str]]: one three-element list per accepted link, in
        document order.
    """
    entries = []

    for anchor in sp.findAll('a', attrs={'class': 'three-col-url'}):
        parts = anchor.text.split("・")
        if len(parts) != 3:
            continue
        # The first part is not kept; only the last two are returned.
        _, second, third = parts
        entries.append([anchor['href'], second, third])
    return entries

def get_project_word_name(sp):
    """Return the text of the page's ``<h1 class="logo">`` heading.

    If several headings match, the last one is used; if none match, the
    result is ``""``.
    """
    headings = sp.findAll('h1', attrs={'class': 'logo'})
    if not headings:
        return ""
    return headings[-1].text

def get_project_example(sp):
    """Return the text of every ``<span class="standardtext">`` on the page.

    Args:
        sp: Parsed page to search.

    Returns:
        list[str]: the span texts in document order (empty if none match).
    """
    sentence_spans = sp.findAll('span', attrs={'class': 'standardtext'})
    return [span.text for span in sentence_spans]


In [None]:
url = "https://www.japandict.com/lists/misc/on-mim"
# The first page is the bare list URL; pages 2 through 119 use ?page=N.
urls = [url] + [f"{url}?page={i}" for i in range(2, 120)]

words = []

for u in tqdm(urls):
    sp = get_page_source(u)

    # Each matching <a> element is one dictionary entry on the list page.
    a_list = sp.findAll('a', attrs={'class':'list-group-item list-group-item-action my-2 mdshadow-1'})

    # One [word, romaji, explanation, properties] row per entry.
    words.extend(
        [get_word(a), get_romaji(a), get_explanation(a), get_properties(a)]
        for a in a_list
    )


100%|██████████| 119/119 [00:52<00:00,  2.27it/s]


In [None]:
# One row per scraped dictionary entry, in the order the rows were collected.
node_df = pd.DataFrame(
    data=words,
    columns=['word', 'romaji', 'explanation', 'properties'],
)

# Save the raw scrape to disk; the positional index is not written.
node_df.to_csv("words.csv", sep=",", index=False)

In [None]:
# Word-list page of the second source; its entry links are collected below.
project_url = "https://onomatoproject.com/list.html"

sp = get_page_source(project_url)

# Each item is [href, second text field, third text field] of one list link.
url_packs = get_project_word_list(sp)

In [None]:
# Note: this rebinds `words`, replacing the rows collected from japandict.
words = []

for href, hirakana, katakana in tqdm(url_packs):
    # The stored href is appended to the site root to form the entry URL.
    sp = get_page_source(f"https://onomatoproject.com{href}")

    examples = get_project_example(sp)

    # All example sentences of one entry are packed into a single "; "-joined cell.
    words.append([hirakana, katakana, "; ".join(examples)])

100%|██████████| 267/267 [01:05<00:00,  4.07it/s]


In [None]:
# One row per onomatoproject entry: both spellings plus the joined example sentences.
node_with_example_df = pd.DataFrame(
    data=words,
    columns=['hirakana', 'katakana', 'examples'],
)

# Save the example table to disk; the positional index is not written.
node_with_example_df.to_csv("word_examples.csv", sep=",", index=False)

In [None]:
# Preview the first rows of the dictionary frame.
node_df.head()

Unnamed: 0,word,romaji,explanation,properties
0,パチパチ,pachipachi,"crackling, cracking, snapping, popping, clicki...",popular; onomatopoeic or mimetic word; adverb;...
1,ハラハラ,harahara,"to feel anxious, to feel nervous, to be kept i...",popular; onomatopoeic or mimetic word; noun or...
2,あっさり,assari,"easily, readily, quickly, flatly (refuse); lig...",popular; JLPT N1; onomatopoeic or mimetic word...
3,あべこべ,abekobe,"contrary, opposite, inverse, reverse, back-to-...",popular; JLPT N1; onomatopoeic or mimetic word...
4,あやふや,ayafuya,"uncertain, vague, ambiguous",popular; JLPT N1; onomatopoeic or mimetic word...


In [None]:
# Despite the variable name, both merges are LEFT joins: every row of node_df
# is kept, and example sentences are attached wherever `word` equals either
# spelling stored in node_with_example_df.
hirakana_examples = node_with_example_df[["hirakana", "examples"]]
katakana_examples = node_with_example_df[["katakana", "examples"]]

inner_join = (
    node_df
    .merge(hirakana_examples, how='left', left_on='word', right_on='hirakana')
    # `examples` now exists on both sides, so pandas suffixes the columns:
    # examples_x (from the hirakana match) and examples_y (from the katakana match).
    .merge(katakana_examples, how='left', left_on='word', right_on='katakana')
)

# Prefer the katakana match and fall back to the hirakana match.
inner_join['examples'] = inner_join['examples_y'].fillna(inner_join['examples_x'])

inner_join.head()

Unnamed: 0,word,romaji,explanation,properties,hirakana,examples_x,katakana,examples_y,examples
0,パチパチ,pachipachi,"crackling, cracking, snapping, popping, clicki...",popular; onomatopoeic or mimetic word; adverb;...,,,パチパチ,囲炉裏の側にいるとぱちぱちという音が聞こえてくる。; 日本には口にいれるとパチパチとはじける...,囲炉裏の側にいるとぱちぱちという音が聞こえてくる。; 日本には口にいれるとパチパチとはじける...
1,ハラハラ,harahara,"to feel anxious, to feel nervous, to be kept i...",popular; onomatopoeic or mimetic word; noun or...,,,ハラハラ,偉い人が集まるパーティで自分が失礼なことしないかどうかハラハラする。; 紅葉の季節が終わって...,偉い人が集まるパーティで自分が失礼なことしないかどうかハラハラする。; 紅葉の季節が終わって...
2,あっさり,assari,"easily, readily, quickly, flatly (refuse); lig...",popular; JLPT N1; onomatopoeic or mimetic word...,あっさり,このラーメンはすごくあっさりした味。; 白大豆よりこの黒大豆で作られた味噌汁はあっさりしてい...,,,このラーメンはすごくあっさりした味。; 白大豆よりこの黒大豆で作られた味噌汁はあっさりしてい...
3,あべこべ,abekobe,"contrary, opposite, inverse, reverse, back-to-...",popular; JLPT N1; onomatopoeic or mimetic word...,,,,,
4,あやふや,ayafuya,"uncertain, vague, ambiguous",popular; JLPT N1; onomatopoeic or mimetic word...,,,,,


In [None]:

# Remove the helper columns left over from the two merges, keeping only the
# combined `examples` column.
# The frame is rebound instead of mutated with inplace=True, and
# errors="ignore" skips columns that are already gone, so re-running this cell
# no longer raises KeyError.
inner_join = inner_join.drop(
    columns=["hirakana", "katakana", "examples_x", "examples_y"],
    errors="ignore",
)
inner_join.head()

Unnamed: 0,word,romaji,explanation,properties,examples
0,パチパチ,pachipachi,"crackling, cracking, snapping, popping, clicki...",popular; onomatopoeic or mimetic word; adverb;...,囲炉裏の側にいるとぱちぱちという音が聞こえてくる。; 日本には口にいれるとパチパチとはじける...
1,ハラハラ,harahara,"to feel anxious, to feel nervous, to be kept i...",popular; onomatopoeic or mimetic word; noun or...,偉い人が集まるパーティで自分が失礼なことしないかどうかハラハラする。; 紅葉の季節が終わって...
2,あっさり,assari,"easily, readily, quickly, flatly (refuse); lig...",popular; JLPT N1; onomatopoeic or mimetic word...,このラーメンはすごくあっさりした味。; 白大豆よりこの黒大豆で作られた味噌汁はあっさりしてい...
3,あべこべ,abekobe,"contrary, opposite, inverse, reverse, back-to-...",popular; JLPT N1; onomatopoeic or mimetic word...,
4,あやふや,ayafuya,"uncertain, vague, ambiguous",popular; JLPT N1; onomatopoeic or mimetic word...,


In [None]:
# Write the merged table to disk as CSV; the positional index is not written.
inner_join.to_csv("nodes_with_examples_final.csv", sep=",", index = False)