In [1]:
import urllib.request 
from urllib.parse import quote 
from bs4 import BeautifulSoup
import json

In [2]:
#Source: https://codezup.com/web-scraping-word-meaning-dictionary-python-beautifulsoup/

In [3]:
with open('./wordList', 'r') as infile:
    wordList = infile.read()

In [4]:
wordList = wordList.split()

In [5]:
wordList

['một',
 'và',
 'của',
 'là',
 'có',
 'không',
 'tôi',
 'người',
 'những',
 'đã',
 'ông',
 'cho',
 'như',
 'ta',
 'trong']

In [6]:
#The link format for vdict Viet-Eng is: https://vdict.com/m%E1%BB%99t,2,0,0.html

In [7]:
#Converting IRI to ASCII:
#https://stackoverflow.com/questions/4389572/how-to-fetch-a-non-ascii-url-with-python-urlopen

In [8]:
def findTranslation(word):
    url = "https://vdict.com/" + quote(word) + ",2,0,0.html" #resolving the IRI issue

    source = urllib.request.urlopen(url)
    soup = BeautifulSoup(source, 'lxml')
    
    translations = {}
    
    #Getting all the POS of a word
    pos = [item.string.split(' \xa0')[0] for item in soup.findAll("div", class_ = 'phanloai')]
    
    #the number of translations should be equal to the number of TOS 
    translations['nums'] = len(pos)
    
    #find all translations on the site
    containers = soup.findAll("ul", class_= "list1")[0:len(pos)]
    
    for i in range(0, len(pos)):
        temp= {} #temp dict to store each translation of a word
        
        #getting the raw translation
        if (containers[i].find('b').find(',') != -1): #split by ','
            temp['translation'] = containers[i].find('b').string.split(',') 
        else:
            temp['translation'] = containers[i].find('b').string.split(';') 
        
        temp['POS'] = pos[i]
        
        #retrieving the examples of each of the translations
        try:
            examples = containers[i].findAll("ul", class_= "list2")
            temp2 = {} #another dict to store each examples of a translation
            
            numExamples = len(list(examples))
            temp2['nums'] = numExamples
            
            for i in range(0, numExamples):
                temp2['context'] = examples[i].find(class_ = "example-original").string
            
                #retrieving the contexts of each examples
                try: 
                    tempString = str(examples[i].find("li")) #extracting the context
                    temp2['usage ' + str(i+1)] = tempString[tempString.find("<br/>")+5:-5]
                except:
                    temp2['usage ' + str(i+1)] = "N/A"
                
            temp['examples'] = temp2
        except:
            temp['examples'] = "N/A"
        translations[str(i+1)] = temp
        
    return translations

In [9]:
url = "https://vdict.com/" + quote("một") + ",2,0,0.html" #resolving the IRI issue

source = urllib.request.urlopen(url)
soup = BeautifulSoup(source, 'lxml')

In [10]:
pos = [item.string.split(' \xa0')[0] for item in soup.findAll("div", class_ = 'phanloai')]

In [11]:
containers = soup.findAll("ul", class_= "list1")[0:len(pos)]
containers

[<ul class="list1"><li><b>one</b><ul class="list2"><li><span class="example-original">từng cái (người) một</span><br/>one by one. a; an</li></ul><ul class="list2"><li><span class="example-original">một ngày</span><br/>a day</li></ul><ul class="list2"><li><span class="example-original">một bên</span><br/>an one hand</li></ul></li></ul>]

In [12]:
meanings = [item.find('b').string for item in containers[0:len(pos)]] #word translation
meanings

['one']

In [13]:
examples = containers[0].findAll("ul", class_= "list2")
temp = str(examples[1].find("li"))
temp[temp.find("<br/>")+5:-5]

'a day'

In [14]:
dictionary = {word: findTranslation(word) for word in wordList}

In [15]:
dictionary

{'một': {'nums': 1,
  '3': {'translation': ['one'],
   'POS': '',
   'examples': {'nums': 3,
    'context': 'một bên',
    'usage 1': 'one by one. a; an',
    'usage 2': 'a day',
    'usage 3': 'an one hand'}}},
 'và': {'nums': 1,
  '1': {'translation': ['and'], 'POS': 'conj', 'examples': {'nums': 0}}},
 'của': {'nums': 2,
  '1': {'translation': ['of; belong to; from'],
   'POS': 'conj',
   'examples': {'nums': 1,
    'context': 'cô ta là bạn của tôi',
    'usage 1': 'A girl friend of mine'}}},
 'là': {'nums': 4,
  '1': {'translation': ['to press iron'],
   'POS': 'verb',
   'examples': {'nums': 1, 'context': 'bàn là', 'usage 1': 'an iron'}},
  '2': {'translation': ['then'],
   'POS': 'conj',
   'examples': {'nums': 2,
    'context': 'đẹp đẹp là!',
    'usage 1': 'how',
    'usage 2': 'how beautiful!'}}},
 'có': {'nums': 1,
  '5': {'translation': ['To be'],
   'POS': 'verb',
   'examples': {'nums': 5,
    'context': 'có tuổi',
    'usage 1': 'if there is anyone asking for me, tell him 

In [17]:
with open('dictionary.txt', 'w') as outfile:
    json.dump(dictionary, outfile)