In [1]:
import json
import re
import time
import urllib.error
import urllib.request
from urllib.parse import quote

from bs4 import BeautifulSoup

In [2]:
#Source: https://codezup.com/web-scraping-word-meaning-dictionary-python-beautifulsoup/

In [3]:
#The word list holds Vietnamese text, so decode it explicitly instead of relying
#on the platform default encoding (assumes the file is UTF-8 encoded -- TODO confirm)
with open('./wordList', 'r', encoding='utf-8') as infile:
    wordList = infile.read()

In [4]:
#Split the raw text on whitespace into individual words. The isinstance guard
#keeps the cell safe to re-run: the second time wordList is already a list.
if isinstance(wordList, str):
    wordList = wordList.split()

In [5]:
#display the parsed word list to check that the file was read and split as expected
wordList

['một',
 'và',
 'của',
 'là',
 'có',
 'không',
 'tôi',
 'người',
 'những',
 'đã',
 'ông',
 'cho',
 'như',
 'ta',
 'trong']

In [6]:
#The link format for vdict Viet-Eng is: https://vdict.com/m%E1%BB%99t,2,0,0.html

In [7]:
#Converting IRI to ASCII:
#https://stackoverflow.com/questions/4389572/how-to-fetch-a-non-ascii-url-with-python-urlopen

In [8]:
def findTranslation(word):
    """Scrape the vdict.com Vietnamese-English entry for ``word``.

    Parameters
    ----------
    word : str
        Headword to look up; it is percent-encoded before being placed in the URL.

    Returns
    -------
    dict or str
        ``"N/A"`` when the request fails with an HTTP error or the page holds no
        part-of-speech ("phanloai") div. Otherwise a dict with:

        * integer keys ``1..n``, one per translation, each mapping to a dict with
          ``'translation'`` (the bold text split on ``,`` / ``;``; the pieces are
          not stripped), ``'POS'`` (the most recent POS label seen) and
          ``'examples'`` (``{'nums': k, 'context i': ..., 'usage i': ...}``, or
          ``"N/A"`` when the examples could not be parsed);
        * ``'nums'``: the number of translations ``n``.
    """
    url = "https://vdict.com/" + quote(word) + ",2,0,0.html" #resolving the IRI issue

    try: #make sure the word is in the dictionary
        #the context manager closes the HTTP response once it has been parsed
        with urllib.request.urlopen(url) as source:
            soup = BeautifulSoup(source, 'lxml')
    except urllib.error.HTTPError: #HTTPError lives in urllib.error, not in urllib itself
        print("Word not found " + word)
        return "N/A"

    def posLabel(tag):
        #the POS label is the text before the ' \xa0' separator;
        #.string can be None, which is treated as an empty label
        return (tag.string or '').split(' \xa0')[0]

    #Getting the first POS tag
    firstPosTag = soup.find("div", class_ = 'phanloai')
    if firstPosTag is None: #the page loaded but contains no dictionary entry
        print("Word not found " + word)
        return "N/A"
    curPos = posLabel(firstPosTag)

    #first translation is a tag away from the first POS tag
    curTag = firstPosTag.next_sibling
    if curTag is not None:
        curTag = curTag.next_sibling

    translations = {}

    #storing the number of translations a word has
    numTrans = 0

    #continue while we are either in a POS tag or a translation tag;
    #None means we ran out of siblings, '\n' is the separator that ends the entry
    while curTag is not None and curTag != '\n':
        attrs = getattr(curTag, "attrs", None)
        if attrs is None: #a stray text node is neither a POS tag nor a translation: stop
            break

        if attrs.get("class") == ["list1"]: #a translation
            temp = {} #temp dict to store each translation of a word

            #getting the raw translation
            temp['translation'] = re.split('[,;]', curTag.find('b').string)
            temp['POS'] = curPos

            #retrieving the examples of each of the translations
            try:
                examples = curTag.find_all("ul", class_= "list2")
                temp2 = {'nums': len(examples)} #another dict to store each examples of a translation

                for j, example in enumerate(examples, start=1):
                    temp2['context ' + str(j)] = example.find(class_ = "example-original").string

                    #the usage is the text between "<br/>" and the closing "</li>"
                    tempString = str(example.find("li"))
                    brPos = tempString.find("<br/>")
                    if brPos == -1: #no usage text: do not slice from a bogus offset
                        temp2['usage ' + str(j)] = "N/A"
                    else:
                        temp2['usage ' + str(j)] = tempString[brPos + len("<br/>"):-len("</li>")]

                temp['examples'] = temp2
            except Exception: #best effort, but no longer swallows KeyboardInterrupt
                temp['examples'] = "N/A"

            numTrans += 1
            translations[numTrans] = temp #adding the translation to the dictionary
        else: #a tag
            curPos = posLabel(curTag)

        #finding next tag
        curTag = curTag.next_sibling

    #storing the number of translations
    translations['nums'] = numTrans
    return translations

In [9]:
#Scratch copy of the fetch step, used to explore the page structure for "có"
url = "https://vdict.com/" + quote("có") + ",2,0,0.html" #resolving the IRI issue

try:
    #the context manager closes the HTTP response once it has been parsed
    with urllib.request.urlopen(url) as source:
        soup = BeautifulSoup(source, 'lxml')
except urllib.error.URLError: #also covers HTTPError, which is a subclass
    print( "Link broken")
    raise #the following cells need soup, so stop here rather than fail later on a missing or stale value

In [10]:
#POS label of every "phanloai" div: the text in front of the ' \xa0' separator
posDivs = soup.findAll("div", class_ = 'phanloai')
pos = [div.string.partition(' \xa0')[0] for div in posDivs]

In [11]:
#probe what sits four siblings after the first POS div; a '\n' text node here is
#the value the while loop in findTranslation stops on
soup.find("div", class_ = 'phanloai').next_sibling.next_sibling.next_sibling.next_sibling

'\n'

In [12]:
#keep only as many top-level translation lists ("list1") as there are POS labels
list1Tags = soup.findAll("ul", class_= "list1")
containers = list1Tags[:len(pos)]
containers

[<ul class="list1"><li><b>To be</b><ul class="list2"><li><span class="example-original">có ai hỏi, bảo tôi đi vắng</span><br/>if there is anyone asking for me, tell him I am not at home</li></ul><ul class="list2"><li><span class="example-original">cơ hội nghìn năm có một</span><br/>there is such an opportunity once in a thousand years; once in a lifetime</li></ul><ul class="list2"><li><span class="example-original">chúng tôi chỉ có ba người tất cả</span><br/>there are only three of us in all</li></ul><ul class="list2"><li><span class="example-original">lá có màu xanh</span><br/>leaves are green</li></ul><ul class="list2"><li><span class="example-original">có tuổi</span><br/>to be advanced in years</li></ul></li></ul>]

In [13]:
#word translation: the bold text of each container
meanings = []
for container in containers[:len(pos)]:
    meanings.append(container.find('b').string)
meanings

['To be']

In [14]:
#pull the usage text of the second example of the first translation:
#it is what lies between the "<br/>" and the closing "</li>" of the rendered <li>
examples = containers[0].findAll("ul", class_= "list2")
temp = str(examples[1].find("li"))
temp[temp.find("<br/>") + len("<br/>"):-len("</li>")]

'there is such an opportunity once in a thousand years; once in a lifetime'

In [15]:
#maps each word to its scraped entry; filled in by the lookup loop
dictionary = dict()

In [16]:
#look up every word, pausing between requests, and report the total wall time
start_time = time.time()
for word in wordList:
    entry = findTranslation(word)
    dictionary[word] = entry
    time.sleep(1) #wait for 1 sec between each query
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

--- 35.79178166389465 seconds ---


In [17]:
#inspect the scraped entries, keyed by word
dictionary

{'một': {1: {'translation': ['one'],
   'POS': '',
   'examples': {'nums': 3,
    'context 1': 'từng cái (người) một',
    'usage 1': 'one by one. a; an',
    'context 2': 'một ngày',
    'usage 2': 'a day',
    'context 3': 'một bên',
    'usage 3': 'an one hand'}},
  'nums': 1},
 'và': {1: {'translation': ['and'], 'POS': 'conj', 'examples': {'nums': 0}},
  'nums': 1},
 'của': {1: {'translation': ['property', ' belongings', ' given kind of food'],
   'POS': 'noun',
   'examples': {'nums': 1,
    'context 1': 'bảo vệ của công',
    'usage 1': 'to protect public property'}},
  2: {'translation': ['of', ' belong to', ' from'],
   'POS': 'conj',
   'examples': {'nums': 1,
    'context 1': 'cô ta là bạn của tôi',
    'usage 1': 'A girl friend of mine'}},
  'nums': 2},
 'là': {1: {'translation': ['fine silk'],
   'POS': 'noun',
   'examples': {'nums': 0}},
  2: {'translation': ['to bẹ'],
   'POS': 'verb',
   'examples': {'nums': 1,
    'context 1': 'thì giờ là tiền bạc',
    'usage 1': 'Tim

In [18]:
#persist the scraped entries as JSON
#NOTE(review): JSON object keys are strings, so the integer translation keys
#will come back as '1', '2', ... when the file is loaded again
with open('vdict VI-EN.json', 'w') as jsonFile:
    json.dump(obj=dictionary, fp=jsonFile)

In [19]:
#spot-check the scraper on a single word whose translations carry several examples
findTranslation("có")

{1: {'translation': ['To be'],
  'POS': 'verb',
  'examples': {'nums': 5,
   'context 1': 'có ai hỏi, bảo tôi đi vắng',
   'usage 1': 'if there is anyone asking for me, tell him I am not at home',
   'context 2': 'cơ hội nghìn năm có một',
   'usage 2': 'there is such an opportunity once in a thousand years; once in a lifetime',
   'context 3': 'chúng tôi chỉ có ba người tất cả',
   'usage 3': 'there are only three of us in all',
   'context 4': 'lá có màu xanh',
   'usage 4': 'leaves are green',
   'context 5': 'có tuổi',
   'usage 5': 'to be advanced in years'}},
 2: {'translation': ['To have', ' to own'],
  'POS': 'verb',
  'examples': {'nums': 2,
   'context 1': 'người cày có ruộng',
   'usage 1': 'the tillet owns his own land',
   'context 2': 'công dân có quyền bầu cử ứng cử',
   'usage 2': ''}},
 'nums': 2}

In [20]:
#spot-check a word whose entry spans several parts of speech (noun, verb, conj)
findTranslation("là")

{1: {'translation': ['fine silk'], 'POS': 'noun', 'examples': {'nums': 0}},
 2: {'translation': ['to bẹ'],
  'POS': 'verb',
  'examples': {'nums': 1,
   'context 1': 'thì giờ là tiền bạc',
   'usage 1': 'Time is money'}},
 3: {'translation': ['then'],
  'POS': 'conj',
  'examples': {'nums': 2,
   'context 1': 'cảm thán.',
   'usage 1': 'how',
   'context 2': 'đẹp đẹp là!',
   'usage 2': 'how beautiful!'}},
 4: {'translation': ['to press iron'],
  'POS': 'verb',
  'examples': {'nums': 1, 'context 1': 'bàn là', 'usage 1': 'an iron'}},
 'nums': 4}