In [1]:
import urllib.request 
from urllib.parse import quote 
from bs4 import BeautifulSoup
import json
import re
import time

In [2]:
with open('./wordList', 'r') as infile:
    wordList = infile.read()

In [3]:
wordList = wordList.split()

In [4]:
wordList

['một',
 'và',
 'của',
 'là',
 'có',
 'không',
 'tôi',
 'người',
 'những',
 'đã',
 'ông',
 'cho',
 'như',
 'ta',
 'trong']

In [5]:
posDict = {} #a dictionary to translate POS words in Vietnamese to English
posDict["Động từ"] = "verb"
posDict["Danh từ"] = "noun"
posDict["Nghĩa chuyên ngành"] = "jargon"
posDict["*"] = "N/A"
posDict["Trạng ngữ"] = "adverb"
posDict["Cảm thán"] = "interjection"
posDict["Phó từ"] = "adverb"
posDict["Từ nối"] = "conjunction"
posDict["Từ đệm"] = "N/A"
posDict["Đại từ"] = "pronoun"

In [6]:
#The link format for soha Viet-Eng is: http://tratu.soha.vn/dict/vn_en/C%C3%B3

In [95]:
url = "http://tratu.soha.vn/dict/vn_en/" + quote("như") #resolving the IRI issue

try:
    source = urllib.request.urlopen(url)
except:
    print( "Link broken")
soup = BeautifulSoup(source, 'lxml')

In [96]:
groups = soup.findAll("div", class_ = "section-h3") #soha groups their translations by POS

In [97]:
containers = soup.findAll("div", class_ = "section-h5")

In [98]:
containers[0].parent

<div class="section-h2" id="show-alter"><h2> <span class="mw-headline">Thông dụng</span></h2>
<a name="As."></a><div class="section-h5" id="content-5"><h5> <span class="mw-headline">As.</span></h5>
<dl><dd><dl><dd><a href="/dict/vn_en/L%C3%A0m" title="Làm">làm</a> <strong class="selflink">như</strong> <a href="/dict/vn_en/T%C3%B4i" title="Tôi">tôi</a>
</dd><dd>Do as I do.
</dd></dl>
</dd></dl>
<a name="Like."></a></div><div class="section-h5" id="content-5"><h5> <span class="mw-headline">Like.</span></h5>
<dl><dd><dl><dd><a href="/dict/vn_en/T%C3%B4i" title="Tôi">tôi</a> <a href="/dict/vn_en/C%C5%A9ng" title="Cũng">cũng</a> <a href="/dict/vn_en/Ngh%C4%A9" title="Nghĩ">nghĩ</a> <strong class="selflink">như</strong> <a href="/dict/vn_en/Anh" title="Anh">anh</a>
</dd><dd>I think like you.
</dd><dd>_alike; similar to.
</dd><dd><a href="/dict/vn_en/Tr%C6%B0%E1%BB%9Dng" title="Trường">trường</a> <a href="/dict/vn_en/H%E1%BB%A3p" title="Hợp">hợp</a> <a href="/dict/vn_en/Anh" title="Anh">anh</

In [99]:
containers[0].find("dl").find("dd").find("dl").findAll("dd")

[<dd><a href="/dict/vn_en/L%C3%A0m" title="Làm">làm</a> <strong class="selflink">như</strong> <a href="/dict/vn_en/T%C3%B4i" title="Tôi">tôi</a>
 </dd>,
 <dd>Do as I do.
 </dd>]

In [100]:
#detect whether a tag contains a link, if a link is present, the tag contains a context
def hasLink(tag):
    try:
        subTags = tag.findAll('a')
    except:
        return False
    for subTag in subTags:
        if (subTag.get('href')): #found a link
            return True
    return False

In [101]:
hasLink(containers[0].find("dl").find("dd").find("dl").findAll("dd"))

/dict/vn_en/L%C3%A0m


True

In [128]:
def extractContext(tag):
    parts = list(tag.children)
    parts = filter(lambda x: False if x == ' ' or x == '\n' else True, parts) #filter all the non-words out
    parts = [part.contents[0] for part in parts]
    context = " ".join(parts)
    return context

In [129]:
extractContext(containers[0].find("dl").find("dd").find("dl").findAll("dd")[0])

'làm như tôi'

In [130]:
def getPos(vietPos): #translating the POS in Viet to English
    vietPos = vietPos.strip(".")
    if (vietPos.find("(") != -1): #the vietPos contains the english equivalent
        return vietPos[vietPos.find("(")+1:vietPos.find(")")].strip()
    elif (vietPos in posDict):
        return posDict[vietPos]
    else:
        print("Can not determine POS: " + vietPos)
        return "N/A"

In [131]:
getPos("Động từ")

'verb'

In [132]:
getPos("Phó từ ( affirmative particle)")

'affirmative particle'

In [137]:
def findTranslation(word):
    url = "http://tratu.soha.vn/dict/vn_en/" + quote(word) #resolving the IRI issue

    try:
        source = urllib.request.urlopen(url)
    except urllib.HTTPError:
        print( "Link broken")
    soup = BeautifulSoup(source, 'lxml')
    
    translations = {}
    numTrans = 0
 
    containers = soup.findAll("div", class_ = "section-h5") #getting all the translations for a word

    for trans in containers: #for each translation in this group
        temp = {} #temp dict to store each translation of a word

        #getting the raw translation
        temp['translation'] = [item.strip() for item in re.split('[.,;]', trans.find("h5").find("span").string)]
        
        parentTag = trans.parent
        if (parentTag.attrs["class"] == "section-h3"): #has POS
            temp['POS'] = getPos(group.find("h3").find("span").string)
        else: #no POS
            temp['POS'] = "N/A"

        try:
            #retrieving the examples and contexts of this translation
            clarifications = trans.find("dl").find("dd").find("dl").findAll("dd")
            temp2 = {} #another dict to store each examples of a translation
            print(clarifications)
            
            numContext = 0;
            numUsage = 0;
            
            for item in clarifications:
                if (hasLink(item)): #this is a context (context contains hyperlink)
                    print("Has link " + item)
                    temp2['context ' + str(numContext+1)] = extractContext(item)
                    numContext += 1
                else: #this is an usage
                    temp2['usage ' + str(numUsage+1)] = item.string.strip()
                    numUsage += 1
            
            temp2['nums context'] = numContext
            temp2['nums usage'] = numUsage

            temp['examples'] = temp2
        except:
            temp['examples'] = "N/A"

        translations[numTrans+1] = temp #adding the translation to the dictionary
        numTrans += 1
            
    #storing the number of translations
    translations['nums'] = numTrans
    return translations        

In [138]:
findTranslation("như")

[<dd><a href="/dict/vn_en/L%C3%A0m" title="Làm">làm</a> <strong class="selflink">như</strong> <a href="/dict/vn_en/T%C3%B4i" title="Tôi">tôi</a>
</dd>, <dd>Do as I do.
</dd>]
[<dd><a href="/dict/vn_en/T%C3%B4i" title="Tôi">tôi</a> <a href="/dict/vn_en/C%C5%A9ng" title="Cũng">cũng</a> <a href="/dict/vn_en/Ngh%C4%A9" title="Nghĩ">nghĩ</a> <strong class="selflink">như</strong> <a href="/dict/vn_en/Anh" title="Anh">anh</a>
</dd>, <dd>I think like you.
</dd>, <dd>_alike; similar to.
</dd>, <dd><a href="/dict/vn_en/Tr%C6%B0%E1%BB%9Dng" title="Trường">trường</a> <a href="/dict/vn_en/H%E1%BB%A3p" title="Hợp">hợp</a> <a href="/dict/vn_en/Anh" title="Anh">anh</a> <a href="/dict/vn_en/Gi%E1%BB%91ng" title="Giống">giống</a> <strong class="selflink">như</strong> <a href="/dict/vn_en/Tr%C6%B0%E1%BB%9Dng" title="Trường">trường</a> <a href="/dict/vn_en/H%E1%BB%A3p" title="Hợp">hợp</a> <a href="/dict/vn_en/T%C3%B4i" title="Tôi">tôi</a>
</dd>, <dd>Your case is similar to mine.
</dd>]


{1: {'translation': ['As', ''], 'POS': 'N/A', 'examples': 'N/A'},
 2: {'translation': ['Like', ''], 'POS': 'N/A', 'examples': 'N/A'},
 'nums': 2}

In [88]:
findTranslation("tôi")

[<dd>subject
</dd>, <dd>servant
</dd>, <dd>self
</dd>]
Link does not exist
[<dd>I, me
</dd>, <dd>to temper, to slake
</dd>]
Link does not exist


{1: {'translation': ['Danh từ'],
  'POS': 'N/A',
  'examples': {'nums': 3,
   'usage 1': 'subject',
   'usage 2': 'servant',
   'usage 3': 'self'}},
 2: {'translation': ['Động từ'],
  'POS': 'N/A',
  'examples': {'nums': 2,
   'usage 1': 'I, me',
   'usage 2': 'to temper, to slake'}},
 3: {'translation': ['bad'], 'POS': 'N/A', 'examples': 'N/A'},
 4: {'translation': ['friable'], 'POS': 'N/A', 'examples': 'N/A'},
 5: {'translation': ['chilling'], 'POS': 'N/A', 'examples': 'N/A'},
 6: {'translation': ['cure'], 'POS': 'N/A', 'examples': 'N/A'},
 7: {'translation': ['dip'], 'POS': 'N/A', 'examples': 'N/A'},
 8: {'translation': ['thoriveitite'], 'POS': 'N/A', 'examples': 'N/A'},
 9: {'translation': ['advent'], 'POS': 'N/A', 'examples': 'N/A'},
 10: {'translation': ['arrival'], 'POS': 'N/A', 'examples': 'N/A'},
 11: {'translation': ['arrive'], 'POS': 'N/A', 'examples': 'N/A'},
 12: {'translation': ['ingoing'], 'POS': 'N/A', 'examples': 'N/A'},
 13: {'translation': ['capstan'], 'POS': 'N/A', 

In [25]:
dictionary = {}
start_time = time.time()
for word in wordList:
    dictionary[word] = findTranslation(word)
    time.sleep(1) #wait for 1 sec between each query
print("--- %s seconds ---" % (time.time() - start_time))

--- 23.005154371261597 seconds ---


In [26]:
dictionary

{'một': {1: {'translation': ['One.'],
   'POS': 'N/A',
   'examples': {'nums': 1,
    'context 1': 'từng cái người',
    'usage 1': 'one by one.'}},
  2: {'translation': ['A', 'an.'],
   'POS': 'N/A',
   'examples': {'nums': 2,
    'context 1': 'ngày',
    'usage 1': 'a day.',
    'context 2': 'bên',
    'usage 2': 'an one hand.'}},
  'nums': 2},
 'và': {1: {'translation': ['Trạng ngữ'], 'POS': 'N/A', 'examples': 'N/A'},
  'nums': 1},
 'của': {1: {'translation': ['Property', 'belongings', 'given kind of food'],
   'POS': 'N/A',
   'examples': {'nums': 1,
    'context 1': 'bảo vệ công',
    'usage 1': 'to protect public property'}},
  2: {'translation': ['Of', 'belong to', 'from'],
   'POS': 'N/A',
   'examples': {'nums': 1,
    'context 1': 'cô ta là bạn tôi',
    'usage 1': 'A girl friend of mine'}},
  'nums': 2},
 'là': {1: {'translation': ['Danh từ.'], 'POS': 'N/A', 'examples': 'N/A'},
  2: {'translation': ['To be.'],
   'POS': 'N/A',
   'examples': {'nums': 1,
    'context 1': 'thì

In [27]:
with open('soha VI-EN.json', 'w') as outfile:
    json.dump(dictionary, outfile)