In [0]:
import urllib.request 
from urllib.parse import quote 
from bs4 import BeautifulSoup
import json
import re
import time

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import json
with open('/content/gdrive/My Drive/Colab Notebooks/Dictionary Project/freqlist.json') as json_file:
     wordsData = json.load(json_file)
wordList = list(wordsData.keys())
len(wordList)

14143

In [0]:
#The link format for soha Viet-Fran is: http://tratu.soha.vn/dict/vi-fr/C%C3%B3

In [0]:
url = "http://tratu.soha.vn/dict/vn_fr/" + quote("như") #resolving the IRI issue

try:
    source = urllib.request.urlopen(url)
except:
    print( "Link broken")
soup = BeautifulSoup(source, 'lxml')

In [0]:
groups = soup.findAll("div", class_ = "section-h3") #soha groups their translations by POS

In [0]:
containers = soup.findAll("div", class_ = "section-h5")

In [8]:
containers[0].find("h5").find("span").contents[0]

'Comme'

In [9]:
containers[0].find("h5").find("span").contents

['Comme']

In [0]:
#detect whether a tag contains a link, if a link is present, the tag contains a context
def hasLink(tag):
    try:
        subTags = tag.findAll('a')
    except:
        return False
    for subTag in subTags:
        if (subTag.get('href')): #found a link
            return True
    return False

In [11]:
hasLink(containers[0].find("dl").find("dd").find("dl").findAll("dd")[0]) 

True

In [0]:
def resolveItalic(parts): #french words use italic to mark an use of "'"
    parts = [str(part) for part in parts]
    if (len(parts) == 1): #nothing special here, just return the string
        return [item.strip() for item in re.split('[.,;]', str(parts[0]))]
    parts = [part.replace("<i>", "'").replace("</i>", "") for part in parts]
    if (parts[-1] == '\n'): #this is an usage, ignore the last part
        return [item.strip() for item in re.split('[.,;]', "".join(parts[:-1]))]
    #this is a translation
    return [item.strip() for item in re.split('[.,;]', "".join(parts))]

In [13]:
temp = containers[0].find("h5").find("span").contents
temp

['Comme']

In [14]:
resolveItalic(temp)

['Comme']

In [15]:
temp = containers[0].find("dl").find("dd").find("dl").findAll("dd")[1].contents
temp

['ils se ressemblent comme deux gouttes d', <i>eau</i>, '\n']

In [16]:
resolveItalic(temp)

["ils se ressemblent comme deux gouttes d'eau"]

In [0]:
def extractContext(tag):
    parts = list(tag.children)
    parts = filter(lambda x: False if x == ' ' or x == '\n' else True, parts) #filter all the non-words out
    parts = [part.contents[0] for part in parts]
    context = " ".join(parts)
    return context

In [18]:
extractContext(containers[0].find("dl").find("dd").find("dl").findAll("dd")[0])

'Họ giống nhau như đúc'

In [0]:
def findTranslation(word):
    print("Finding translation for " + word)
    url = "http://tratu.soha.vn/dict/vn_fr/" + quote(word) #resolving the IRI issue

    try:
        source = urllib.request.urlopen(url)
    except:
        print("Link broken for " + word)
        return "N/A"

    soup = BeautifulSoup(source, 'lxml')
    
    translations = {}
    numTrans = 0
 
    containers = soup.findAll("div", class_ = "section-h5") #getting all the translations for a word
    if (containers is None):
      print("No translations found for " + word)
      return "N/A"

    for trans in containers: #for each translation in this group
        temp = {} #temp dict to store each translation of a word

        #getting the raw translation
        temp['translation'] = resolveItalic(trans.find("h5").find("span").contents)

        try:
            #retrieving the examples and contexts of this translation
            clarifications = trans.find("dl").find("dd").find("dl").findAll("dd")
            temp2 = {} #another dict to store each examples of a translation
            
            numContext = 0;
            numUsage = 0;
            
            for item in clarifications:
                if (hasLink(item)): #this is a context (context contains hyperlink)
                    temp2['context ' + str(numContext+1)] = extractContext(item)
                    numContext += 1
                else: #this is an usage
                    temp2['usage ' + str(numUsage+1)] = resolveItalic(item.contents)
                    numUsage += 1
            
            temp2['nums context'] = numContext
            temp2['nums usage'] = numUsage

            temp['examples'] = temp2
        except:
            temp['examples'] = "N/A"

        translations[numTrans+1] = temp #adding the translation to the dictionary
        numTrans += 1
            
    #storing the number of translations
    translations['nums'] = numTrans
    return translations        

In [20]:
findTranslation("như")

Finding translation for như


{1: {'examples': {'context 1': 'Họ giống nhau như đúc',
   'context 2': 'Như tôi đã nói với anh',
   'context 3': 'Giàu như anh ta',
   'context 4': 'Như thường lệ',
   'nums context': 4,
   'nums usage': 4,
   'usage 1': ["ils se ressemblent comme deux gouttes d'eau"],
   'usage 2': ["comme je vous l'ai dit"],
   'usage 3': ['riche comme il est'],
   'usage 4': ['comme de coutume', "comme d'habitude"]},
  'translation': ['Comme']},
 2: {'examples': {'context 1': 'Tôi cũng vui mừng về điều đó như anh',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ["je m'en réjouis autant que vous"]},
  'translation': ['Autant que']},
 3: {'examples': {'context 1': 'Gia súc như chó mèo',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ['les animaux domestiques tels que le chien', 'le chat']},
  'translation': ['Tel que']},
 4: {'examples': {'context 1': 'Như ông đồng ý tôi sẽ nhường cho ông',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ["si vous êtes d'accord", 'je vous le

In [21]:
findTranslation("tôi")

Finding translation for tôi


{1: {'examples': {'context 1': 'Làm tôi người khác',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ["se faire domestique chez quelqu'un",
    "être au service de quelqu'un",
    '']},
  'translation': ['Serviteur', 'domestique', '']},
 2: {'examples': {'context 1': 'Vua và tôi',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ['le roi et ses sujets', '']},
  'translation': ['Sujet', '']},
 3: {'examples': {'context 1': 'Em tôi tuổi tôi',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ['', '']},
  'translation': ['Je', 'moi', 'memon', 'ma', 'mes', '']},
 4: {'examples': {'context 1': 'Tôi lưỡi gươm',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ["��tremper une lame d'épée", '']},
  'translation': ['Tremper']},
 5: {'examples': {'context 1': 'Tôi vôi',
   'context 2': 'Cái tôi',
   'nums context': 2,
   'nums usage': 2,
   'usage 1': ['��éteindre de la chaux', ''],
   'usage 2': ['(triết học) ego']},
  'translation': ['Éteindre']},
 'nums': 5}

In [22]:
findTranslation("đã")

Finding translation for đã


{1: {'examples': 'N/A', 'translation': ['Déjà']},
 2: {'examples': 'N/A', 'translation': ['Non seulement']},
 3: {'examples': 'N/A', 'translation': ['Voilà']},
 4: {'examples': 'N/A', 'translation': ['Si']},
 5: {'examples': 'N/A', 'translation': ['À satiété', "jusqu'à satiété"]},
 6: {'examples': 'N/A', 'translation': ['(ít dùng) guérir']},
 'nums': 6}

In [23]:
dictionary = {}
start_time = time.time()
for word in wordList:
    dictionary[word] = findTranslation(word)
    time.sleep(1) #wait for 1 sec between each query
print("--- %s seconds ---" % (time.time() - start_time))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Finding translation for ựa
Finding translation for linux
Finding translation for xubur
Finding translation for trùy
Finding translation for mởi
Finding translation for lilith
Finding translation for poppca
Finding translation for thượng…
Finding translation for ataxynux
Finding translation for hađex
Finding translation for apolonius
Finding translation for karyny
Finding translation for xextecxi
Finding translation for crixtux
Finding translation for gulo
Finding translation for aaaa
Finding translation for aa
Finding translation for exquilin
Finding translation for pompania
Finding translation for nhừng
Finding translation for popes
Finding translation for ereb
Finding translation for chuỵên
Finding translation for auguxis
Finding translation for irax
Finding translation for xkopax
Finding translation for teirezias
Finding translation for chủ…
Finding translation for khiloniđex
Finding translation for khòm
Finding transl

In [24]:
dictionary

{'thông': {1: {'examples': {'context 1': 'Thông hai lá',
    'context 2': 'Thông ba lá',
    'context 3': 'Thông nhựa',
    'nums context': 3,
    'nums usage': 3,
    'usage 1': ['pin de Merkus'],
    'usage 2': ['pin khasya', ''],
    'usage 3': ['pin résineux']},
   'translation': ['(thực vật học) pin', '']},
  2: {'examples': {'context 1': 'Thông ngôn',
    'context 2': 'Thông ngôn lại',
    'nums context': 2,
    'nums usage': 2,
    'usage 1': ['interprète'],
    'usage 2': ['secrétaire des bureaux de district']},
   'translation': ['(từ cũ', 'nghĩa cũ) secrétaire des bureaux de province']},
  3: {'examples': {'context 1': 'Buồng này thông sang buồng bên cạnh',
    'nums context': 1,
    'nums usage': 1,
    'usage 1': ['��cette chambre communique avec la chambre voisine', '']},
   'translation': ['Communiquer']},
  4: {'examples': {'context 1': 'Thông ống dẫn nước',
    'context 2': 'thông cống',
    'nums context': 2,
    'nums usage': 2,
    'usage 1': ["��désengorger une cond

In [0]:
from google.colab import files
with open('soha VI-FR.json', 'w') as outfile:
    json.dump(dictionary, outfile)
files.download('soha VI-FR.json')