In [1]:
import urllib.request 
from urllib.parse import quote 
from bs4 import BeautifulSoup
import json
import re
import time

In [2]:
with open('./wordList', 'r') as infile:
    wordList = infile.read()

In [3]:
wordList = wordList.split()

In [4]:
wordList

['một',
 'và',
 'của',
 'là',
 'có',
 'không',
 'tôi',
 'người',
 'những',
 'đã',
 'ông',
 'cho',
 'như',
 'ta',
 'trong']

In [5]:
#The link format for soha Viet-Fran is: http://tratu.soha.vn/dict/vi-fr/C%C3%B3

In [6]:
url = "http://tratu.soha.vn/dict/vn_fr/" + quote("như") #resolving the IRI issue

try:
    source = urllib.request.urlopen(url)
except:
    print( "Link broken")
soup = BeautifulSoup(source, 'lxml')

In [7]:
groups = soup.findAll("div", class_ = "section-h3") #soha groups their translations by POS

In [8]:
containers = soup.findAll("div", class_ = "section-h5")

In [9]:
containers[0].find("h5").find("span").contents[0]

'Comme'

In [10]:
containers[0].find("h5").find("span").contents

['Comme']

In [11]:
#detect whether a tag contains a link, if a link is present, the tag contains a context
def hasLink(tag):
    try:
        subTags = tag.findAll('a')
    except:
        return False
    for subTag in subTags:
        if (subTag.get('href')): #found a link
            return True
    return False

In [12]:
hasLink(containers[0].find("dl").find("dd").find("dl").findAll("dd")[0]) 

True

In [13]:
def resolveItalic(parts): #french words use italic to mark an use of "'"
    parts = [str(part) for part in parts]
    if (len(parts) == 1): #nothing special here, just return the string
        return [item.strip() for item in re.split('[.,;]', str(parts[0]))]
    parts = [part.replace("<i>", "'").replace("</i>", "'") for part in parts]
    if (parts[-1] == '\n'): #this is an usage, ignore the last part
        return [item.strip() for item in re.split('[.,;]', "".join(parts[:-1]))]
    #this is a translation
    return [item.strip() for item in re.split('[.,;]', "".join(parts))]

In [14]:
temp = containers[0].find("h5").find("span").contents
temp

['Comme']

In [15]:
resolveItalic(temp)

['Comme']

In [16]:
temp = containers[0].find("dl").find("dd").find("dl").findAll("dd")[1].contents
temp

['ils se ressemblent comme deux gouttes d', <i>eau</i>, '\n']

In [17]:
resolveItalic(temp)

["ils se ressemblent comme deux gouttes d'eau'"]

In [18]:
def extractContext(tag):
    parts = list(tag.children)
    parts = filter(lambda x: False if x == ' ' or x == '\n' else True, parts) #filter all the non-words out
    parts = [part.contents[0] for part in parts]
    context = " ".join(parts)
    return context

In [19]:
extractContext(containers[0].find("dl").find("dd").find("dl").findAll("dd")[0])

'Họ giống nhau như đúc'

In [20]:
def findTranslation(word):
    url = "http://tratu.soha.vn/dict/vn_fr/" + quote(word) #resolving the IRI issue

    try:
        source = urllib.request.urlopen(url)
    except urllib.HTTPError:
        print( "Link broken")
    soup = BeautifulSoup(source, 'lxml')
    
    translations = {}
    numTrans = 0
 
    containers = soup.findAll("div", class_ = "section-h5") #getting all the translations for a word

    for trans in containers: #for each translation in this group
        temp = {} #temp dict to store each translation of a word

        #getting the raw translation
        try:
            temp['translation'] = resolveItalic(trans.find("h5").find("span").contents)
        except:
            print(trans.find("h5"))
            print("Can not get translation for " + word)

        try:
            #retrieving the examples and contexts of this translation
            clarifications = trans.find("dl").find("dd").find("dl").findAll("dd")
            temp2 = {} #another dict to store each examples of a translation
            
            numContext = 0;
            numUsage = 0;
            
            for item in clarifications:
                if (hasLink(item)): #this is a context (context contains hyperlink)
                    temp2['context ' + str(numContext+1)] = extractContext(item)
                    numContext += 1
                else: #this is an usage
                    temp2['usage ' + str(numUsage+1)] = resolveItalic(item.contents)
                    numUsage += 1
            
            temp2['nums context'] = numContext
            temp2['nums usage'] = numUsage

            temp['examples'] = temp2
        except:
            temp['examples'] = "N/A"

        translations[numTrans+1] = temp #adding the translation to the dictionary
        numTrans += 1
            
    #storing the number of translations
    translations['nums'] = numTrans
    return translations        

In [21]:
findTranslation("như")

{1: {'translation': ['Comme'],
  'examples': {'context 1': 'Họ giống nhau như đúc',
   'usage 1': ["ils se ressemblent comme deux gouttes d'eau'"],
   'context 2': 'Như tôi đã nói với anh',
   'usage 2': ["comme je vous l'ai dit'"],
   'context 3': 'Giàu như anh ta',
   'usage 3': ['riche comme il est'],
   'context 4': 'Như thường lệ',
   'usage 4': ['comme de coutume', "comme d'habitude'"],
   'nums context': 4,
   'nums usage': 4}},
 2: {'translation': ['Autant que'],
  'examples': {'context 1': 'Tôi cũng vui mừng về điều đó như anh',
   'usage 1': ["je m'en réjouis autant que vous'"],
   'nums context': 1,
   'nums usage': 1}},
 3: {'translation': ['Tel que'],
  'examples': {'context 1': 'Gia súc như chó mèo',
   'usage 1': ['les animaux domestiques tels que le chien', 'le chat'],
   'nums context': 1,
   'nums usage': 1}},
 4: {'translation': ['Si au cas où'],
  'examples': {'context 1': 'Như ông đồng ý tôi sẽ nhường cho ông',
   'usage 1': ["si vous êtes d'accord", "je vous le cé

In [22]:
findTranslation("tôi")

{1: {'translation': ['Serviteur', 'domestique', ''],
  'examples': {'context 1': 'Làm tôi người khác',
   'usage 1': ["se faire domestique chez quelqu'un",
    "être au service de quelqu'un",
    ''],
   'nums context': 1,
   'nums usage': 1}},
 2: {'translation': ['Sujet', ''],
  'examples': {'context 1': 'Vua và tôi',
   'usage 1': ['le roi et ses sujets', ''],
   'nums context': 1,
   'nums usage': 1}},
 3: {'translation': ['Je', 'moi', 'memon', 'ma', 'mes', ''],
  'examples': {'context 1': 'Em tôi tuổi tôi',
   'usage 1': ['', ''],
   'nums context': 1,
   'nums usage': 1}},
 4: {'translation': ['Tremper'],
  'examples': {'context 1': 'Tôi lưỡi gươm',
   'usage 1': ["��tremper une lame d'épée", ''],
   'nums context': 1,
   'nums usage': 1}},
 5: {'translation': ['Éteindre'],
  'examples': {'context 1': 'Tôi vôi',
   'usage 1': ['��éteindre de la chaux', ''],
   'context 2': 'Cái tôi',
   'usage 2': ['(triết học) ego'],
   'nums context': 2,
   'nums usage': 2}},
 'nums': 5}

In [26]:
findTranslation("đã")

{1: {'translation': ['Déjà'], 'examples': 'N/A'},
 2: {'translation': ['Non seulement'], 'examples': 'N/A'},
 3: {'translation': ['Voilà'], 'examples': 'N/A'},
 4: {'translation': ['Si'], 'examples': 'N/A'},
 5: {'translation': ['À satiété', "jusqu'à satiété'"], 'examples': 'N/A'},
 6: {'translation': ['(ít dùng) guérir'], 'examples': 'N/A'},
 'nums': 6}

In [23]:
dictionary = {}
start_time = time.time()
for word in wordList:
    dictionary[word] = findTranslation(word)
    time.sleep(1) #wait for 1 sec between each query
print("--- %s seconds ---" % (time.time() - start_time))

--- 22.962656259536743 seconds ---


In [24]:
dictionary

{'một': {1: {'translation': ['Un'],
   'examples': {'context 1': 'Một người',
    'usage 1': ['un homme', ''],
    'context 2': 'Một nhà',
    'usage 2': ['une maison', ''],
    'context 3': 'Nước Việt Nam là một',
    'usage 3': ['le Viêtnam est un'],
    'nums context': 3,
    'nums usage': 3}},
  2: {'translation': ['Unique'],
   'examples': {'context 1': 'Con một',
    'usage 1': ['fils unique', ''],
    'context 2': 'Một màu',
    'usage 2': ["d'une seule couleur", 'unicolore'],
    'nums context': 2,
    'nums usage': 2}},
  3: {'translation': ['Pour une seule personne', ''],
   'examples': {'context 1': 'Màn dành cho một người ngủ',
    'usage 1': ['moustiquaire pour une seule personne'],
    'context 2': 'chỉ là một',
    'usage 2': ["c'est tout un", ''],
    'context 3': 'một chín một mười',
    'usage 3': ['équivalent'],
    'context 4': 'một chốn đôi nơi',
    'usage 4': ['ménage à deux foyers', 'famille à deux foyers', ''],
    'context 5': 'một chữ cắn đôi cũng không biết'

In [25]:
with open('soha VI-FR.json', 'w') as outfile:
    json.dump(dictionary, outfile)