In [1]:
import urllib.request 
from urllib.parse import quote 
from bs4 import BeautifulSoup
import json
import re
import time

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
#getting the already scraped dict
import json
with open('/content/gdrive/My Drive/Colab Notebooks/Dictionary Project/soha VI-EN ver 3.json') as json_file:
     currDict = json.load(json_file)

In [4]:
#reading the word list produced by an crubadan:
with open('/content/gdrive/My Drive/Colab Notebooks/Dictionary Project/anCruVietList') as infile:
  wordsData = infile.readlines()

In [5]:
#generate a list of words from the word list produced by an crubadan
def generateWordList(): 
  temp = []
  for word in wordsData:
    parts = word.strip().split(" ")
    #ignore the frequency
    temp.append(" ".join(parts[1:]))
  return temp

In [6]:
wordList = generateWordList()
len(wordList)

34275

In [7]:
#break the word list into batches of 
batches = [wordList[i:i + 5000] for i in range(0, len(wordList), 5000)] 

In [8]:
for batch in batches:
  print(len(batch))

5000
5000
5000
5000
5000
5000
4275


In [9]:
posDict = {} #a dictionary to translate POS words in Vietnamese to English
posDict["Động từ"] = "verb"
posDict["Danh từ"] = "noun"
posDict["Nghĩa chuyên ngành"] = "jargon"
posDict["*"] = "N/A"
posDict["Trạng ngữ"] = "adverb"
posDict["Cảm thán"] = "interjection"
posDict["Phó từ"] = "adverb"
posDict["Từ nối"] = "conjunction"
posDict["Từ đệm"] = "N/A"
posDict["Đại từ"] = "pronoun"

In [10]:
#The link format for soha Viet-Eng is: http://tratu.soha.vn/dict/vn_en/C%C3%B3

In [11]:
url = "http://tratu.soha.vn/dict/vn_en/" + quote("như") #resolving the IRI issue

try:
    source = urllib.request.urlopen(url)
except:
    print( "Link broken")
soup = BeautifulSoup(source, 'lxml')

In [12]:
groups = soup.findAll("div", class_ = "section-h3") #soha groups their translations by POS

In [13]:
containers = soup.findAll("div", class_ = "section-h5")

In [14]:
containers[0].parent

<div class="section-h2" id="show-alter"><h2> <span class="mw-headline">Thông dụng</span></h2>
<a name="As."></a><div class="section-h5" id="content-5"><h5> <span class="mw-headline">As.</span></h5>
<dl><dd><dl><dd><a href="/dict/vn_en/L%C3%A0m" title="Làm">làm</a> <strong class="selflink">như</strong> <a href="/dict/vn_en/T%C3%B4i" title="Tôi">tôi</a>
</dd><dd>Do as I do.
</dd></dl>
</dd></dl>
<a name="Like."></a></div><div class="section-h5" id="content-5"><h5> <span class="mw-headline">Like.</span></h5>
<dl><dd><dl><dd><a href="/dict/vn_en/T%C3%B4i" title="Tôi">tôi</a> <a href="/dict/vn_en/C%C5%A9ng" title="Cũng">cũng</a> <a href="/dict/vn_en/Ngh%C4%A9" title="Nghĩ">nghĩ</a> <strong class="selflink">như</strong> <a href="/dict/vn_en/Anh" title="Anh">anh</a>
</dd><dd>I think like you.
</dd><dd>_alike; similar to.
</dd><dd><a href="/dict/vn_en/Tr%C6%B0%E1%BB%9Dng" title="Trường">trường</a> <a href="/dict/vn_en/H%E1%BB%A3p" title="Hợp">hợp</a> <a href="/dict/vn_en/Anh" title="Anh">anh</

In [15]:
containers[0].find("dl").find("dd").find("dl").findAll("dd")

[<dd><a href="/dict/vn_en/L%C3%A0m" title="Làm">làm</a> <strong class="selflink">như</strong> <a href="/dict/vn_en/T%C3%B4i" title="Tôi">tôi</a>
 </dd>, <dd>Do as I do.
 </dd>]

In [16]:
#detect whether a tag contains a link, if a link is present, the tag contains a context
def hasLink(tag):
    try:
        subTags = tag.findAll('a')
    except:
        return False
    for subTag in subTags:
        if (subTag.get('href')): #found a link
            return True
    return False

In [17]:
hasLink(containers[0].find("dl").find("dd").find("dl").findAll("dd")[1])

False

In [18]:
def extractContext(tag):
    parts = list(tag.children)
    parts = filter(lambda x: False if x == ' ' or x == '\n' else True, parts) #filter all the non-words out
    parts = [part.contents[0] for part in parts]
    context = " ".join(parts)
    return context

In [19]:
extractContext(containers[0].find("dl").find("dd").find("dl").findAll("dd")[0])

'làm như tôi'

In [20]:
unknownPos = []
def getPos(vietPos): #translating the POS in Viet to English
    vietPos = vietPos.strip(".")
    if (vietPos.find("(") != -1): #the vietPos contains the english equivalent
        return vietPos[vietPos.find("(")+1:vietPos.find(")")].strip()
    elif (vietPos in posDict):
        return posDict[vietPos]
    else:
        unknownPos.append(str(vietPos))
        return "N/A"

In [21]:
getPos("Động từ")

'verb'

In [22]:
getPos("Phó từ ( affirmative particle)")

'affirmative particle'

In [23]:
def resolveItalic(parts): #soha use italic to mark an use of "'"
    parts = [str(part) for part in parts]
    if (len(parts) == 1): #nothing special here, just return the string
        return [item.strip() for item in re.split('[.,;]', str(parts[0]))]
    parts = [part.replace("<i>", "'").replace("</i>", "") for part in parts]
    if (parts[-1] == '\n'): #this is an usage, ignore the last part
        return [item.strip() for item in re.split('[.,;]', "".join(parts[:-1]))]
    #this is a translation
    return [item.strip() for item in re.split('[.,;]', "".join(parts))]

In [24]:
def findTranslation(word):
    print("Finding translation for " + word)
    url = "http://tratu.soha.vn/dict/vn_en/" + quote(word) #resolving the IRI issue

    try:
        source = urllib.request.urlopen(url)
    except:
        print("Link broken for " + word)
        return "N/A"

    soup = BeautifulSoup(source, 'lxml')
    
    translations = {}
    numTrans = 0
 
    containers = soup.findAll("div", class_ = "section-h5") #getting all the translations for a word
    if (containers is None or containers == []):
      print("No translations found for " + word)
      return "N/A"
 
    for trans in containers: #for each translation in this group
        temp = {} #temp dict to store each translation of a word

        #getting the raw translation
        temp['translation'] = resolveItalic(trans.find("h5").find("span").contents)
        
        parentTag = trans.parent
        try:
          if (parentTag.attrs["class"] == "section-h3"): #has POS
            temp['POS'] = getPos(group.find("h3").find("span").string)
          else:
            temp['POS'] = "N/A"
        except: #no POS
            temp['POS'] = "N/A"

        try:
            #retrieving the examples and contexts of this translation
            clarifications = trans.find("dl").find("dd").find("dl").findAll("dd")
            temp2 = {} #another dict to store each examples of a translation
            
            numContext = 0;
            numUsage = 0;
            
            for item in clarifications:
                if (hasLink(item)): #this is a context (context contains hyperlink)
                    temp2['context ' + str(numContext+1)] = extractContext(item)
                    numContext += 1
                else: #this is an usage
                    temp2['usage ' + str(numUsage+1)] = resolveItalic(item.contents)
                    numUsage += 1
            
            temp2['nums context'] = numContext
            temp2['nums usage'] = numUsage

            temp['examples'] = temp2
        except:
            temp['examples'] = "N/A"

        translations[numTrans+1] = temp #adding the translation to the dictionary
        numTrans += 1
            
    #storing the number of translations
    translations['nums'] = numTrans
    return translations        

In [25]:
findTranslation("như")

Finding translation for như


{1: {'POS': 'N/A',
  'examples': {'context 1': 'làm như tôi',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ['Do as I do', '']},
  'translation': ['As', '']},
 2: {'POS': 'N/A',
  'examples': {'context 1': 'tôi cũng nghĩ như anh',
   'context 2': 'trường hợp anh giống như trường hợp tôi',
   'nums context': 2,
   'nums usage': 3,
   'usage 1': ['I think like you', ''],
   'usage 2': ['_alike', 'similar to', ''],
   'usage 3': ['Your case is similar to mine', '']},
  'translation': ['Like', '']},
 'nums': 2}

In [26]:
findTranslation("hờn dỗi")

Finding translation for hờn dỗi


{1: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['To sulk']}, 'nums': 1}

In [27]:
findTranslation("thư ký")

Finding translation for thư ký


{1: {'POS': 'N/A',
  'examples': {'nums context': 0,
   'nums usage': 1,
   'usage 1': ['secretary', 'clerk']},
  'translation': ['Danh từ']},
 2: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['secretary']},
 3: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['clerk']},
 4: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['secretary']},
 'nums': 4}

In [28]:
findTranslation("tôi")

Finding translation for tôi


{1: {'POS': 'N/A',
  'examples': {'nums context': 0,
   'nums usage': 3,
   'usage 1': ['subject'],
   'usage 2': ['servant'],
   'usage 3': ['self']},
  'translation': ['Danh từ']},
 10: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['arrival']},
 11: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['arrive']},
 12: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['ingoing']},
 13: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['capstan']},
 14: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['drawwork']},
 15: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['gin']},
 16: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['hauler']},
 17: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['jack up']},
 18: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['purchase']},
 19: {'POS': 'N/A', 'examples': 'N/A', 'translation': ['sheave']},
 2: {'POS': 'N/A',
  'examples': {'nums context': 0,
   'nums usage': 2,
   'usage 1': ['I', 'me'],
   'usage 2': ['to temper', 'to slake']},


In [29]:
from google.colab import files
start_time = time.time()

index = 6
batch = batches[index]
#for (index, batch) in enumerate(batches):
dictionary = {}

#making the dictionary for each batch
for word in batch:

    #if the word is not in already in the dict, find the translation
    if word not in currDict:
      dictionary[word] = findTranslation(word)
      time.sleep(1) #wait for 1 sec between each query

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
No translations found for tốc hà
Finding translation for tố quyết
No translations found for tố quyết
Finding translation for tố có
No translations found for tố có
Finding translation for tỉ phú
No translations found for tỉ phú
Finding translation for tế vẫn
No translations found for tế vẫn
Finding translation for tế thần
No translations found for tế thần
Finding translation for tế nào
No translations found for tế nào
Finding translation for tắm chung
No translations found for tắm chung
Finding translation for tắm cho
No translations found for tắm cho
Finding translation for tắc và
No translations found for tắc và
Finding translation for tật hay
No translations found for tật hay
Finding translation for tập tốt
No translations found for tập tốt
Finding translation for tập sách
No translations found for tập sách
Finding translation for tập những
No translations found for tập những
Finding translation for tập làm
No translati

In [30]:
#saving the batch
with open('/content/gdrive/My Drive/Colab Notebooks/Dictionary Project/soha VI-EN ver 4 batch {}.json'.format(index), 'w') as outfile:
  json.dump(dictionary, outfile)
print("--- %s seconds ---" % (time.time() - start_time))

--- 8525.342663288116 seconds ---


In [31]:
len(dictionary)

4156

In [32]:
findTranslation("lm")

Finding translation for lm
No translations found for lm


'N/A'