In [1]:
import http.cookiejar
import os
import re
import time

import mechanize
import pandas as pd
from bs4 import BeautifulSoup

# Functions

In [15]:
def load_browser():
    """Build a mechanize browser configured for scraping.

    The browser is given a new LWP cookie jar, has its equiv, gzip,
    redirect and referer handlers switched on, its robots handler switched
    off, a refresh handler installed with ``max_time=1``, and a single
    ``User-agent: Chrome`` header.

    Returns:
        The configured ``mechanize.Browser`` instance.
    """
    browser = mechanize.Browser()

    # Fresh cookie jar for this browser
    browser.set_cookiejar(http.cookiejar.LWPCookieJar())

    # On/off handler switches, applied in this order
    handler_switches = (
        ('set_handle_equiv', True),
        ('set_handle_gzip', True),
        ('set_handle_redirect', True),
        ('set_handle_referer', True),
        ('set_handle_robots', False),
    )
    for setter_name, enabled in handler_switches:
        getattr(browser, setter_name)(enabled)

    # Refresh handling takes an extra time limit, so it is configured separately
    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Replace the default headers with a single user-agent entry
    browser.addheaders = [('User-agent', 'Chrome')]

    return browser

def mongol_tol_scrape_page_list(br, url, detail=False, page_size=40):
    """Collect all words of a Mongol Tol Bichig category into a dataframe.

    The listing is read page by page: first ``url`` itself, then
    ``url/<page_size>``, ``url/<2 * page_size>`` and so on, until a page
    without any word entries is returned. The time spent on each page is
    printed.

    Args:
        br: Browser used for the listing pages; only ``br.open(u).read()``
            is called on it.
        url: URL of the first page of the category listing.
        detail: When True, every word's detail page is additionally read
            with ``mongol_tol_read_word_detail`` (through a separate browser
            from ``load_browser``), the result is stored in an
            ``'Examples'`` column, and each word and page URL is printed.
        page_size: Step added to the page offset after each page. The
            default of 40 is the step this function always used.

    Returns:
        pandas.DataFrame with the columns ``'Mongolian'`` (lower-cased word)
        and ``'ID'`` (first run of digits in the word's link, as a string),
        plus ``'Examples'`` when ``detail`` is True. Rows whose lower-cased
        word repeats an earlier row are dropped; the index is not reset.

    Raises:
        ValueError: If ``page_size`` is not positive (the offset would never
            advance and the loop would not terminate).
    """
    if page_size <= 0:
        raise ValueError('page_size must be a positive number')

    # CSS classes that mark one word entry in the listing
    entry_class = 'col-md-3 col-sm-6 list_ug'

    # Separate browser for the detail pages, only needed in detail mode
    br_detail = load_browser() if detail else None

    # Lists to save data
    word_list = []
    id_list = []
    example_list = []

    # Load and parse the first page; look up its entries once
    soup = BeautifulSoup(br.open(url).read(), 'html.parser')
    entries = soup.find_all(class_=entry_class)

    # Offset of the next page to request
    counter = 0

    # An empty page marks the end of the category
    while entries:

        # Time page scrape
        start_time = time.time()

        # Load words from page
        for element in entries:
            a = element.find('a')
            word = a.contents[0].strip()
            id_val = re.findall(r'[0-9]+', a['href'])[0]

            word_list.append(word)
            id_list.append(id_val)

            if detail:
                print(word)
                example_list.append(mongol_tol_read_word_detail(br_detail, id_val))

        # Move on to the next page
        counter += page_size
        new_url = url + '/' + str(counter)

        if detail:
            print(new_url)

        soup = BeautifulSoup(br.open(new_url).read(), 'html.parser')
        entries = soup.find_all(class_=entry_class)

        print(time.time() - start_time)

    # Lower-case in plain Python rather than through the `.str` accessor:
    # the accessor only works on string columns, and the dtype pandas infers
    # for an empty column (a category without words) is not guaranteed to be one.
    data = {'Mongolian': [word.lower() for word in word_list], 'ID': id_list}
    if detail:
        data['Examples'] = example_list
    df = pd.DataFrame(data)

    # Remove duplicates (first occurrence wins)
    return df.drop_duplicates(subset='Mongolian')

def mongol_tol_read_word_detail(br, id_val, audio_dir='audio'):
    """Read the examples on a word's detail page and save its audio, if any.

    Args:
        br: Browser used for the request; ``br.open(url).read()`` fetches
            the page and ``br.retrieve(src, path)`` downloads the audio.
        id_val: ID of the word; it is appended to the detail URL and used
            as the name of the saved ``.mp3`` file.
        audio_dir: Directory the audio file is written to. It is created
            when missing. Defaults to ``'audio'``.

    Returns:
        str: One entry per ``<em>`` element of the page, joined with
        newlines (empty string when the page has none).
    """
    # URL to load
    url = 'https://mongoltoli.mn/dictionary/detail/' + str(id_val)

    # Load and parse the page
    soup = BeautifulSoup(br.open(url).read(), 'html.parser')

    # Record examples
    more_id = re.compile('_more')
    examples = []
    for element in soup.find_all('em'):
        # An <em> that contains a link has its continuation in the next
        # element whose id matches '_more'; join the two parts.
        more = element.find_next(id=more_id) if element.a is not None else None
        if more is not None:
            examples.append(element.contents[0] + more.get_text())
        else:
            # No link, or no continuation element found: keep the <em> text
            examples.append(element.get_text())

    # Download and save audio, tolerating a missing <audio>, <source> or src
    audio = soup.find('audio')
    source = audio.source if audio is not None else None
    src = source.get('src') if source is not None else None
    if src:
        # The download fails if the target directory does not exist yet
        os.makedirs(audio_dir, exist_ok=True)
        br.retrieve(src, os.path.join(audio_dir, str(id_val) + '.mp3'))

    return "\n".join(examples)

In [3]:
# Sanity check: the first run of digits in a detail URL is the word ID
re.findall(
    '[0-9]+',
    'https://mongoltoli.mn/dictionary/detail/21247',
)[0]

'21247'

# Mongol Tol Bichig Complete Word List

In [16]:
# Browser
br = load_browser()

# Initial letters whose word lists are scraped. This is the Mongolian
# Cyrillic alphabet without Й, Щ, Ъ, Ы and Ь, which are not requested.
tolgoi = ["А", "Б", "В", "Г", "Д", "Е", "Ё", "Ж",
          "З", "И", "К", "Л", "М", "Н", "О",
          "Ө", "П", "Р", "С", "Т", "У", "Ү", "Ф", 
          "Х", "Ц", "Ч", "Ш", "Э", "Ю", "Я",]

# One dataframe per letter; they are combined once after the loop.
# (DataFrame.append was removed in pandas 2.0 and re-copied the whole
# frame on every iteration.)
frames = []

for letter in tolgoi:
    
    # Display letter
    print('Reading:', letter)
    
    # URL to load
    url = 'https://mongoltoli.mn/dictionary/lists/' + letter
    
    # Record data for all words starting with letter
    df_new = mongol_tol_scrape_page_list(br, url, detail=False)
    frames.append(df_new)

# Dataframe containing the words of every letter, with a fresh 0..n-1 index
df = pd.concat(frames, ignore_index=True)

Reading: А
2.335726022720337
8.376389503479004
4.541833162307739
3.659167766571045
4.4037275314331055
8.824300527572632
5.396712303161621
10.390805721282959
8.473442554473877
2.49143648147583
1.4246912002563477
1.6433491706848145
1.6313958168029785
1.5975332260131836
2.1347599029541016
2.1405322551727295
2.0531811714172363
1.864008903503418
1.3954205513000488
1.558556079864502
1.6889338493347168
1.5275075435638428
1.6678459644317627
3.078867197036743
1.804239273071289
4.05567193031311
3.105006217956543
2.216625213623047
1.9574363231658936
1.6091783046722412
1.6815447807312012
4.337965965270996
3.0715370178222656
2.284738779067993
1.8635337352752686
1.6226742267608643
1.2634694576263428
1.4897024631500244
1.7642717361450195
3.5386061668395996
2.377815008163452
2.3073956966400146
3.4183249473571777
2.889566659927368
2.7907354831695557
2.6311912536621094
2.520963668823242
1.7540650367736816
5.562427997589111
3.0613832473754883
2.046447277069092
3.1782314777374268
5.742380380630493
3.16412

4.529829740524292
1.800748586654663
2.4937796592712402
1.7260308265686035
3.1651065349578857
4.075904607772827
3.1565134525299072
Reading: Е
2.341736316680908
2.6970648765563965
2.3521549701690674
2.4845497608184814
4.225327730178833
2.11651611328125
3.954261302947998
3.1242594718933105
Reading: Ё
3.740194320678711
4.940136194229126
6.963459491729736
19.08236575126648
10.737847328186035
9.116548776626587
6.68557596206665
6.87392258644104
4.9278600215911865
5.755953311920166
3.8829123973846436
3.0413131713867188
2.3275604248046875
Reading: Ж
3.5835788249969482
1.640928030014038
2.354825258255005
3.2666401863098145
3.0217490196228027
1.7475106716156006
1.3863410949707031
1.9774539470672607
3.6080636978149414
3.6959598064422607
2.806382179260254
2.3954594135284424
2.985938549041748
3.2798852920532227
1.899979829788208
1.367908239364624
1.9742035865783691
2.022235155105591
3.2670114040374756
2.2368485927581787
3.063356399536133
3.204286813735962
2.6415317058563232
2.54388427734375
2.042391

11.095998764038086
16.49231219291687
16.250487565994263
16.366642713546753
16.150954484939575
18.019032955169678
18.827231407165527
16.263646602630615
15.593992948532104
23.094853401184082
21.30053687095642
23.781001091003418
23.870046854019165
24.23265838623047
18.410929203033447
21.392053365707397
21.798019647598267
21.056617736816406
21.898043155670166
26.14407968521118
17.9747953414917
9.986042499542236
7.125764846801758
3.439283609390259
4.351670503616333
3.1944594383239746
3.714495897293091
3.044516086578369
2.869428873062134
4.030513048171997
2.833763360977173
9.649156093597412
4.406612873077393
2.3750319480895996
1.6101069450378418
3.082526206970215
1.7962892055511475
1.7536258697509766
1.5467498302459717
1.8163647651672363
1.7212390899658203
1.631772518157959
4.180779457092285
10.859440088272095
18.786104917526245
12.813517808914185
9.901060104370117
5.125119924545288
2.638803005218506
4.755233526229858
2.947361946105957
3.398104190826416
3.166219711303711
3.9321866035461426
R

2.3642404079437256
2.4258995056152344
2.1525371074676514
1.6166222095489502
2.1440212726593018
3.2143566608428955
4.64192533493042
4.719958066940308
2.43394136428833
2.5514259338378906
3.5548510551452637
4.545988321304321
2.932840347290039
6.222484827041626
3.5710198879241943
2.896510362625122
2.238452911376953
2.535613775253296
1.9138891696929932
3.9490761756896973
2.8833625316619873
1.4001269340515137
1.179744005203247
1.3674888610839844
1.371140956878662
1.4872205257415771
3.5871944427490234
5.861255884170532
4.4970924854278564
1.7909069061279297
1.538761854171753
1.270838975906372
2.57328724861145
3.3110294342041016
2.1288199424743652
2.4552924633026123
2.451669454574585
3.7473974227905273
5.220552444458008
5.721199989318848
3.169614315032959
5.23654317855835
5.275392532348633
1.9555387496948242
Reading: Ч
3.781100034713745
5.974781036376953
8.381313800811768
6.416957378387451
3.811826229095459
3.769521951675415
4.74495005607605
4.655252933502197
2.8663933277130127
2.94625949859619

In [19]:
# Remove the 'URL' column when it exists. The frame built by the scraping
# cell only has 'Mongolian' and 'ID', so an unconditional drop raises
# KeyError on a fresh run; rebinding instead of inplace=True also keeps
# this cell safe to re-run.
df = df.drop(columns='URL', errors='ignore')

In [20]:
# Write the word list to an Excel workbook, leaving out the row index
df.to_excel(
    'mongol_tol_bichig.xlsx',
    index=False,
)

# Mongol Tol Bichig B (read from individual pages)
Number of words: 141513

URL: https://mongoltoli.mn/dictionary/detail/141513

In [150]:
# Detail page of a single word, used to try out the parsing
url = 'https://mongoltoli.mn/dictionary/detail/29680'

# Fetch the page and hand it to Beautiful Soup
html = br.open(url).read()
soup = BeautifulSoup(html, 'html.parser')

# Collect one example per <em>: the element's own text when it holds no
# link, otherwise its leading text plus the text of the next '_more' element
examples = []
for element in soup.find_all('em'):
    examples.append(
        element.get_text()
        if element.a is None
        else element.contents[0] + element.find_next(id=re.compile('_more')).get_text()
    )

# Name the audio file after the first run of digits in its source URL,
# then download it (the result of retrieve is the cell's output)
audio = re.findall('[0-9]+', soup.find('audio').source['src'])[0]
br.retrieve(
    soup.find('audio').source['src'],
    'audio/' + audio + '.mp3',
)

('audio/29680.mp3', <http.client.HTTPMessage at 0x7f9a385c8d90>)

# Sandbox

In [13]:
# Try the list scraper on a single category (words starting with Ф)
br = load_browser()
df = mongol_tol_scrape_page_list(
    br,
    url='https://mongoltoli.mn/dictionary/lists/Ф',
    detail=False,
)

Starting...
First page loaded
Ф
ФАБРИК
ФАЙЛ
ФАКС
ФАКСДАХ
ФАКСИМИЛЕ
ФАКТ
ФАКТУР
ФАКУЛЬТЕТ
ФАНАТ
ФАНАТИК
ФАНЕР
ФАШИЗМ
ФАШИСТ
Феддийн аргамжин цэцэг
ФЕЛЬДМАРШИЛ
ФЕН
ФЕНОЛОГИ
ФЕНОМЕН
ФЕОДАЛ
ФЕОДАЛИЗМ
ФЕРМ
ФЕРМЕР
ФЕСТИВАЛЬ
ФИЗИК
ФИЗИКЧ
ФИЗИОЛОГИ
ФИЗИОЛОГИЧ
ФИЛАРМОН
ФИНАНС
ФИНИК
ФИРМ
ФЛАШ
ФЛОТ
ФЛЮГЕР
ФОКУС
ФОНД
ФОРМ
ФОРМАЛДАХ
ФОРМАЛИЗМ
https://mongoltoli.mn/dictionary/lists/Ф/40
8.653899669647217
ФОРМАЛИН
ФОРМАЛИСТ
ФОРУМ
ФОСФАТ
ФОСФОР
ФОСФОРИТ
ФОСФОРЛОГ
ФОСФОРТ
ФОТО
ФРАКЦ
ФРАКЦЛАХ
ФРОНТ
ФТОР
ФУНКЦ
ФУНТ
ФУТУРИЗМ
ФУТУРИСТ
ФҮЗ
ФҮНЛҮҮ
ФҮНТҮҮЗ
ФҮҮЧҮҮ
https://mongoltoli.mn/dictionary/lists/Ф/80
4.220087051391602


In [14]:
# Display the current contents of df
df

Unnamed: 0,Mongolian,ID
0,ф,101979
1,фабрик,101980
2,файл,101997
3,факс,101989
4,факсдах,101991
...,...,...
56,футурист,102070
57,фүз,102071
58,фүнлүү,102072
59,фүнтүүз,102073
