In [1]:
import urllib.request 
from urllib.parse import quote 
from bs4 import BeautifulSoup
import json
import re
import time

In [2]:
#mount Google Drive at /content/gdrive; the data files used below are read from and written to that mount
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
#getting the already scraped dict (words found in it are skipped by the scraping loop)
import json
#decode explicitly as UTF-8 instead of relying on the platform default encoding
with open('/content/gdrive/My Drive/Colab Notebooks/Dictionary Project/soha VI-FR ver 2.json', encoding='utf-8') as json_file:
    currDict = json.load(json_file)

In [4]:
#reading the word list produced by wikt2dict (one entry per line):
#the entries are Vietnamese text, so decode explicitly as UTF-8 instead of
#relying on the platform default encoding
with open('/content/gdrive/My Drive/Colab Notebooks/Dictionary Project/wiktVietList', encoding='utf-8') as infile:
  wordsData = infile.readlines()

In [5]:
#generate a list of words from the word list produced by wikt2dict
def generateWordList(lines=None):
  """Return the entries of a word list with surrounding whitespace stripped.

  lines: iterable of raw lines (e.g. the result of readlines()). When omitted,
         the global wordsData is used, which keeps the original zero-argument
         call working.
  Returns a new list holding one stripped string per input line, in order.
  """
  if lines is None:
    lines = wordsData #default source: the lines read from the wikt2dict file
  return [line.strip() for line in lines]

In [6]:
wordList = generateWordList()
len(wordList)

17357

In [7]:
#break the word list into batches of BATCH_SIZE words (the last batch may be shorter)
BATCH_SIZE = 10000
batches = [wordList[start:start + BATCH_SIZE] for start in range(0, len(wordList), BATCH_SIZE)]

In [8]:
#show how many words ended up in each batch
for size in map(len, batches):
  print(size)

10000
7357


In [9]:
#The link format for the soha Vietnamese-French dictionary is: http://tratu.soha.vn/dict/vn_fr/C%C3%B3

In [10]:
url = "http://tratu.soha.vn/dict/vn_fr/" + quote("như") #resolving the IRI issue

#fetch and parse in one step: when the request fails there is nothing to parse,
#so report it and stop here instead of running into a NameError (or silently
#reusing a stale `source` from an earlier run) on the following line
try:
    with urllib.request.urlopen(url) as source: #the connection is closed once the page is parsed
        soup = BeautifulSoup(source, 'lxml')
except OSError: #urllib's URLError/HTTPError and socket timeouts all derive from OSError
    print("Link broken")
    raise

In [11]:
containers = soup.findAll("div", class_ = "section-h5")
containers

[<div class="section-h5" id="content-5"><h5> <span class="mw-headline">Comme</span></h5>
 <dl><dd><dl><dd><a href="/dict/vn_fr/H%E1%BB%8D" title="Họ">Họ</a> <a href="/dict/vn_fr/Gi%E1%BB%91ng" title="Giống">giống</a> <a href="/dict/vn_fr/Nhau" title="Nhau">nhau</a> <strong class="selflink">như</strong> <a href="/dict/vn_fr/%C4%90%C3%BAc" title="Đúc">đúc</a>
 </dd><dd>ils se ressemblent comme deux gouttes d<i>eau</i>
 </dd><dd><strong class="selflink">Như</strong> <a href="/dict/vn_fr/T%C3%B4i" title="Tôi">tôi</a> <a href="/dict/vn_fr/%C4%90%C3%A3" title="Đã">đã</a> <a href="/dict/vn_fr/N%C3%B3i" title="Nói">nói</a> <a href="/dict/vn_fr/V%E1%BB%9Bi" title="Với">với</a> <a href="/dict/vn_fr/Anh" title="Anh">anh</a>
 </dd><dd>comme je vous l<i>ai dit</i>
 </dd><dd><a href="/dict/vn_fr/Gi%C3%A0u" title="Giàu">Giàu</a> <strong class="selflink">như</strong> <a href="/dict/vn_fr/Anh" title="Anh">anh</a> <a href="/dict/vn_fr/Ta" title="Ta">ta</a>
 </dd><dd>riche comme il est
 </dd><dd><strong 

In [12]:
containers[0].find("h5").find("span").contents[0]

'Comme'

In [13]:
containers[0].find("h5").find("span").contents

['Comme']

In [14]:
#detect whether a tag contains a link, if a link is present, the tag contains a context
def hasLink(tag):
    """Return True when findAll('a') on `tag` yields an element with a non-empty href.

    tag: a parsed node exposing findAll(). Objects without that method
         (plain strings, None) are treated as containing no link.
    Returns a bool.
    """
    try:
        anchors = tag.findAll('a')
    except AttributeError: #not a tag-like object, so it cannot contain links
        return False
    #an <a> without href (or with an empty one) does not count as a link
    return any(anchor.get('href') for anchor in anchors)

In [15]:
hasLink(containers[0].find("dl").find("dd").find("dl").findAll("dd")[0]) 

True

In [16]:
def resolveItalic(parts): #french words use italic to mark an use of "'"
    """Turn the children of a node into a list of cleaned text fragments.

    parts: sequence of node children (strings and/or tags); each one is
           converted with str().
    When there is more than one part, every "<i>" is replaced by an
    apostrophe and every "</i>" is dropped before the parts are glued
    together. The resulting text is split on '.', ',' and ';' and each
    fragment is stripped.
    Returns a list of strings; an empty `parts` gives an empty list.
    """
    parts = [str(part) for part in parts]
    if not parts: #empty node: nothing to extract (used to fail with IndexError)
        return []
    if len(parts) == 1: #nothing special here, the single string is used as is
        text = parts[0]
    else:
        #a trailing '\n' part needs no special case: it carries no separator
        #and is removed when the last fragment is stripped
        text = "".join(part.replace("<i>", "'").replace("</i>", "") for part in parts)
    return [item.strip() for item in re.split('[.,;]', text)]

In [17]:
temp = containers[0].find("h5").find("span").contents
temp

['Comme']

In [18]:
resolveItalic(temp)

['Comme']

In [19]:
temp = containers[0].find("dl").find("dd").find("dl").findAll("dd")[1].contents
temp

['ils se ressemblent comme deux gouttes d', <i>eau</i>, '\n']

In [20]:
resolveItalic(temp)

["ils se ressemblent comme deux gouttes d'eau"]

In [21]:
def extractContext(tag):
    """Join the words of an example sentence into one space-separated string.

    tag: the node holding the sentence; its children are elements wrapping
         the words (links, emphasis) with text nodes in between.
    Text found directly between the elements is kept as well, so a stray
    piece of text no longer raises and makes the caller drop every example
    of the translation.
    Returns the non-empty pieces in document order, joined by single spaces.
    """
    words = []
    for child in tag.children:
        #text nodes are str subclasses; elements expose get_text()
        text = child if isinstance(child, str) else child.get_text()
        text = text.strip()
        if text: #skip the whitespace between word elements and empty elements
            words.append(text)
    return " ".join(words)

In [22]:
extractContext(containers[0].find("dl").find("dd").find("dl").findAll("dd")[0])

'Họ giống nhau như đúc'

In [23]:
def findTranslation(word, timeout=30):
    """Look up `word` in the soha Vietnamese-French dictionary and collect its translations.

    word:    the entry to look up; it is percent-encoded into the request URL.
    timeout: seconds to wait for the server before giving up on this word.

    Returns the string "N/A" when the page cannot be fetched or holds no
    usable translation section. Otherwise returns a dict with
      1..n   -> {'translation': [str, ...],
                 'examples': {'context k': str, 'usage k': [str, ...],
                              'nums context': int, 'nums usage': int} or "N/A"}
      'nums' -> n
    """
    print("Finding translation for " + word)
    url = "http://tratu.soha.vn/dict/vn_fr/" + quote(word) #resolving the IRI issue

    try:
        #read the whole page inside the with-block so the connection is always closed
        with urllib.request.urlopen(url, timeout=timeout) as source:
            html = source.read()
    except Exception: #best effort: any fetch failure skips the word, an interrupt still stops the run
        print("Link broken for " + word)
        return "N/A"

    soup = BeautifulSoup(html, 'lxml')

    containers = soup.findAll("div", class_ = "section-h5") #getting all the translations for a word
    if not containers:
        print("No translations found for " + word)
        return "N/A"

    translations = {}
    numTrans = 0

    for trans in containers: #for each translation in this group
        #a section without a filled-in heading carries no translation text;
        #skip it instead of letting one odd page abort the whole scrape
        heading = trans.find("h5")
        headline = heading.find("span") if heading is not None else None
        if headline is None or not headline.contents:
            continue

        temp = {} #temp dict to store each translation of a word

        #getting the raw translation
        temp['translation'] = resolveItalic(headline.contents)

        try:
            #retrieving the examples and contexts of this translation
            clarifications = trans.find("dl").find("dd").find("dl").findAll("dd")
            temp2 = {} #another dict to store each examples of a translation

            numContext = 0
            numUsage = 0

            for item in clarifications:
                if hasLink(item): #this is a context (context contains hyperlink)
                    numContext += 1
                    temp2['context ' + str(numContext)] = extractContext(item)
                else: #this is an usage
                    numUsage += 1
                    temp2['usage ' + str(numUsage)] = resolveItalic(item.contents)

            temp2['nums context'] = numContext
            temp2['nums usage'] = numUsage

            temp['examples'] = temp2
        except Exception: #no examples list in this section, or one that cannot be parsed
            temp['examples'] = "N/A"

        numTrans += 1
        translations[numTrans] = temp #adding the translation to the dictionary

    if numTrans == 0: #every section was skipped above
        print("No translations found for " + word)
        return "N/A"

    #storing the number of translations
    translations['nums'] = numTrans
    return translations

In [24]:
findTranslation("dos")

Finding translation for dos
No translations found for dos


'N/A'

In [25]:
from google.colab import files
start_time = time.time()

#only one batch is scraped per run; change index to process another batch
index = 1
batch = batches[index]

dictionary = {}

#making the dictionary for the selected batch
for word in batch:

    #skip blank entries, words already present in the previously scraped dict
    #and repeats inside this batch, so no request (and no 1 sec pause) is
    #spent on them
    if not word or word in currDict or word in dictionary:
        continue

    dictionary[word] = findTranslation(word)
    time.sleep(1) #wait for 1 sec between each query

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
No translations found for nhà ga hành khách
Finding translation for đáng xấu hổ
No translations found for đáng xấu hổ
Finding translation for xanh lu-xi-a
No translations found for xanh lu-xi-a
Finding translation for lạnh lẽo
Finding translation for bệnh viêm gan
No translations found for bệnh viêm gan
Finding translation for hòm thư đi
No translations found for hòm thư đi
Finding translation for tàu thuỷ
Finding translation for iot
No translations found for iot
Finding translation for bầu vú
No translations found for bầu vú
Finding translation for tín ngưỡng
Finding translation for cơ quan lập pháp
No translations found for cơ quan lập pháp
Finding translation for hiđrô
No translations found for hiđrô
Finding translation for sang năm
Finding translation for ả rập hóa
No translations found for ả rập hóa
Finding translation for chủ nghĩa phát xít
No translations found for chủ nghĩa phát xít
Finding translation for 白血
No t

In [26]:
#write the scraped batch to Drive as JSON, then report the time elapsed since start_time
out_path = '/content/gdrive/My Drive/Colab Notebooks/Dictionary Project/soha VI-FR ver 3 batch {}.json'.format(index)
with open(out_path, 'w') as outfile:
  json.dump(dictionary, outfile)

elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

--- 11472.81164431572 seconds ---


In [27]:
len(dictionary)

6471

In [28]:
findTranslation("insulin")

Finding translation for insulin
No translations found for insulin


'N/A'