In [1]:
import urllib.request 
from urllib.parse import quote 
from bs4 import BeautifulSoup
import json
import re
import time

In [2]:
with open('./wordList', 'r') as infile:
    wordList = infile.read()

In [3]:
wordList = wordList.split()

In [4]:
wordList

['một',
 'và',
 'của',
 'là',
 'có',
 'không',
 'tôi',
 'người',
 'những',
 'đã',
 'ông',
 'cho',
 'như',
 'ta',
 'trong']

In [5]:
posDict = {} #a dictionary to translate POS words in Vietnamese to English
posDict["Động từ"] = "verb"
posDict["Danh từ"] = "noun"
posDict["Nghĩa chuyên ngành"] = "N/A"
posDict["*"] = "N/A"
posDict["Trạng ngữ"] = "adverb"
posDict["Cảm thán"] = "interjection"
posDict["Phó từ"] = "adverb"
posDict["Từ nối"] = "conjunction"
posDict["Từ đệm"] = "N/A"
posDict["Đại từ"] = "pronoun"

In [6]:
#The link format for soha Viet-Eng is: http://tratu.soha.vn/dict/vn_en/C%C3%B3

In [7]:
url = "http://tratu.soha.vn/dict/vn_en/" + quote("có") #resolving the IRI issue

try:
    source = urllib.request.urlopen(url)
except:
    print( "Link broken")
soup = BeautifulSoup(source, 'lxml')

In [8]:
groups = soup.findAll("div", class_ = "section-h3") #soha groups their translations by POS

In [9]:
groups[0].find("h3").find("span").string #getting the POS from each group

'Động từ'

In [10]:
groups[0].findAll("div", class_ = "section-h5")[0] #translations in each group

<div class="section-h5" id="content-5"><h5> <span class="mw-headline">To be</span></h5>
<dl><dd><dl><dd><strong class="selflink">có</strong> <a href="/dict/vn_en/Ai" title="Ai">ai</a> <a href="/dict/vn_en/H%E1%BB%8Fi" title="Hỏi">hỏi</a> <a href="/dict/vn_en/B%E1%BA%A3o" title="Bảo">bảo</a> <a href="/dict/vn_en/T%C3%B4i" title="Tôi">tôi</a> <a href="/dict/vn_en/%C4%90i" title="Đi">đi</a> <a href="/dict/vn_en/V%E1%BA%AFng" title="Vắng">vắng</a>
</dd><dd>if there is anyone asking for me, tell him I am not at home
</dd><dd><a href="/dict/vn_en/C%C6%A1" title="Cơ">cơ</a> <a href="/dict/vn_en/H%E1%BB%99i" title="Hội">hội</a> <a class="new" href="/index.php?title=Ngh%C3%ACn&amp;dict=vn_en&amp;action=edit" title="Nghìn">nghìn</a> <a href="/dict/vn_en/N%C4%83m" title="Năm">năm</a> <strong class="selflink">có</strong> <a href="/dict/vn_en/M%E1%BB%99t" title="Một">một</a>
</dd><dd>there is such an opportunity once in a thousand years; once in a lifetime
</dd><dd><a href="/dict/vn_en/Ch%C3%BAng" 

In [11]:
groups[0].findAll("div", class_ = "section-h5")[0].find("h5").find("span").string #getting one translation

'To be'

In [12]:
groups[0].findAll("div", class_ = "section-h5")[0].find("dl").find("dd").find("dl").findAll("dd") #getting all examples

[<dd><strong class="selflink">có</strong> <a href="/dict/vn_en/Ai" title="Ai">ai</a> <a href="/dict/vn_en/H%E1%BB%8Fi" title="Hỏi">hỏi</a> <a href="/dict/vn_en/B%E1%BA%A3o" title="Bảo">bảo</a> <a href="/dict/vn_en/T%C3%B4i" title="Tôi">tôi</a> <a href="/dict/vn_en/%C4%90i" title="Đi">đi</a> <a href="/dict/vn_en/V%E1%BA%AFng" title="Vắng">vắng</a>
 </dd>,
 <dd>if there is anyone asking for me, tell him I am not at home
 </dd>,
 <dd><a href="/dict/vn_en/C%C6%A1" title="Cơ">cơ</a> <a href="/dict/vn_en/H%E1%BB%99i" title="Hội">hội</a> <a class="new" href="/index.php?title=Ngh%C3%ACn&amp;dict=vn_en&amp;action=edit" title="Nghìn">nghìn</a> <a href="/dict/vn_en/N%C4%83m" title="Năm">năm</a> <strong class="selflink">có</strong> <a href="/dict/vn_en/M%E1%BB%99t" title="Một">một</a>
 </dd>,
 <dd>there is such an opportunity once in a thousand years; once in a lifetime
 </dd>,
 <dd><a href="/dict/vn_en/Ch%C3%BAng" title="Chúng">chúng</a> <a href="/dict/vn_en/T%C3%B4i" title="Tôi">tôi</a> <a href=

In [13]:
groups[0].findAll("div", class_ = "section-h5")[0].find("dl").find("dd").find("dl").findAll("dd")[0] #a context

<dd><strong class="selflink">có</strong> <a href="/dict/vn_en/Ai" title="Ai">ai</a> <a href="/dict/vn_en/H%E1%BB%8Fi" title="Hỏi">hỏi</a> <a href="/dict/vn_en/B%E1%BA%A3o" title="Bảo">bảo</a> <a href="/dict/vn_en/T%C3%B4i" title="Tôi">tôi</a> <a href="/dict/vn_en/%C4%90i" title="Đi">đi</a> <a href="/dict/vn_en/V%E1%BA%AFng" title="Vắng">vắng</a>
</dd>

In [14]:
def extractContext(tag):
    parts = tag.findAll("a")
    parts = [part.contents[0] for part in parts]
    context = " ".join(parts)
    return context

In [15]:
extractContext(groups[0].findAll("div", class_ = "section-h5")[0].find("dl").find("dd").find("dl").findAll("dd")[0])

'ai hỏi bảo tôi đi vắng'

In [16]:
groups[0].findAll("div", class_ = "section-h5")[0].find("dl").find("dd").find("dl").findAll("dd")[1].string.strip() #an usage

'if there is anyone asking for me, tell him I am not at home'

In [17]:
def getPos(vietPos): #translating the POS in Viet to English
    vietPos = vietPos.strip(".")
    if (vietPos.find("(") != -1): #the vietPos contains the english equivalent
        return vietPos[vietPos.find("(")+1:vietPos.find(")")].strip()
    elif (vietPos in posDict):
        return posDict[vietPos]
    else:
        print("Can not determine POS: " + vietPos)
        return "N/A"

In [18]:
getPos("Động từ")

'verb'

In [19]:
getPos("Phó từ ( affirmative particle)")

'affirmative particle'

In [20]:
def findTranslation(word):
    url = "http://tratu.soha.vn/dict/vn_en/" + quote(word) #resolving the IRI issue

    try:
        source = urllib.request.urlopen(url)
    except:
        print( "Link broken")
    soup = BeautifulSoup(source, 'lxml')
    
    groups = soup.findAll("div", class_ = "section-h3") #soha groups their translations by POS
    
    translations = {}
    numTrans = 0
    
    for group in groups:
        groupPos = getPos(group.find("h3").find("span").string) #getting the POS for this group
        
        containers = group.findAll("div", class_ = "section-h5") #getting all the translations for this group
        
        for trans in containers: #for each translation in this group
            temp = {} #temp dict to store each translation of a word
            
            #getting the raw translation
            temp['translation'] = [item.strip() for item in re.split('[,;]', trans.find("h5").find("span").string)]
            temp['POS'] = groupPos
            
            try:
                #retrieving the examples and contexts of this translation
                clarifications = trans.find("dl").find("dd").find("dl").findAll("dd")
                temp2 = {} #another dict to store each examples of a translation
                numExamples = int(len(clarifications)/2) #each clarification has a context followed by its usage
                temp2["nums"] = numExamples
                
                for i in range(0, numExamples):
                    #if the index is even, then item is context. Otherwise, it is usage
                    temp2['context ' + str(i+1)] = extractContext(clarifications[2*i])
                    temp2['usage ' + str(i+1)] = clarifications[2*i+1].string.strip()
                
                temp['examples'] = temp2
            except:
                temp['examples'] = "N/A"
        
            translations[numTrans+1] = temp #adding the translation to the dictionary
            numTrans += 1
            
    #storing the number of translations
    translations['nums'] = numTrans
    return translations        

In [21]:
findTranslation("có")

{1: {'translation': ['To be'],
  'POS': 'verb',
  'examples': {'nums': 5,
   'context 1': 'ai hỏi bảo tôi đi vắng',
   'usage 1': 'if there is anyone asking for me, tell him I am not at home',
   'context 2': 'cơ hội nghìn năm một',
   'usage 2': 'there is such an opportunity once in a thousand years; once in a lifetime',
   'context 3': 'chúng tôi chỉ ba người tất cả',
   'usage 3': 'there are only three of us in all',
   'context 4': 'lá màu xanh',
   'usage 4': 'leaves are green',
   'context 5': 'tuổi',
   'usage 5': 'to be advanced in years'}},
 2: {'translation': ['To have', 'to own'],
  'POS': 'verb',
  'examples': {'nums': 11,
   'context 1': 'người cày ruộng',
   'usage 1': 'the tillet owns his own land',
   'context 2': 'công dân quyền bầu cử ứng cử',
   'usage 2': 'citizens have the right to vote and to stand for election',
   'context 3': 'nhà',
   'usage 3': 'a "have" family',
   'context 4': 'sách ba chương',
   'usage 4': 'the book has three chapters',
   'context 5': 'g

In [22]:
findTranslation("tôi")

{1: {'translation': ['Cur', 'ass', 'donkey'],
  'POS': 'noun',
  'examples': {'nums': 2,
   'context 1': 'đồ',
   'usage 1': 'what a cur you are!',
   'context 2': 'ngu như',
   'usage 2': 'what an ass!'}},
 2: {'translation': ['Dog'],
  'POS': 'noun',
  'examples': {'nums': 6,
   'context 1': 'già giữ xương',
   'usage 1': 'to be a dog in the manger',
   'context 2': 'treo đầu dê bán thịt',
   'usage 2': 'he cries wine and sells vinegar',
   'context 3': 'cắn áo rách',
   'usage 3': 'hardships never come alone',
   'context 4': 'cùng rứt giậu',
   'usage 4': 'beast at bay bites hard',
   'context 5': 'ngáp phải ruồi',
   'usage 5': 'to get a godsend',
   'context 6': 'như với mèo',
   'usage 6': 'to agree like cats and dogs'}},
 'nums': 2}

In [23]:
dictionary = {}
start_time = time.time()
for word in wordList:
    dictionary[word] = findTranslation(word)
    time.sleep(1) #wait for 1 sec between each query
print("--- %s seconds ---" % (time.time() - start_time))

--- 23.94894790649414 seconds ---


In [24]:
dictionary

{'một': {'nums': 0},
 'và': {'nums': 0},
 'của': {1: {'translation': ['Property', 'belongings', 'given kind of food'],
   'POS': 'noun',
   'examples': {'nums': 1,
    'context 1': 'bảo vệ công',
    'usage 1': 'to protect public property'}},
  2: {'translation': ['Of', 'belong to', 'from'],
   'POS': 'adverb',
   'examples': {'nums': 1,
    'context 1': 'cô ta là bạn tôi',
    'usage 1': 'A girl friend of mine'}},
  'nums': 2},
 'là': {1: {'translation': ['To be.'],
   'POS': 'verb',
   'examples': {'nums': 1,
    'context 1': 'thì giờ tiền bạc',
    'usage 1': 'Time is money.'}},
  2: {'translation': ['Trạng ngữ.'], 'POS': 'verb', 'examples': {'nums': 0}},
  3: {'translation': ['How.'],
   'POS': 'interjection',
   'examples': {'nums': 1,
    'context 1': 'đẹp đẹp',
    'usage 1': 'how beautiful!'}},
  4: {'translation': ['To press iron.'],
   'POS': 'verb',
   'examples': {'nums': 1, 'context 1': 'bàn', 'usage 1': 'an iron.'}},
  'nums': 4},
 'có': {1: {'translation': ['To be'],
   

In [25]:
with open('soha VI-EN.json', 'w') as outfile:
    json.dump(dictionary, outfile)