In [0]:
import urllib.request 
from urllib.parse import quote 
from bs4 import BeautifulSoup
import json
import re
import time

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import json
with open('/content/gdrive/My Drive/Colab Notebooks/Dictionary Project/bigFreqList.json') as json_file:
     wordsData = json.load(json_file)

In [0]:
#generate a list of words with frequency higher than the threshold
def generateWordList(threshold): 
  temp = []
  for (word, freq) in wordsData.items():
    if (freq >= threshold):
      temp.append(word)
  return temp

In [5]:
wordList = generateWordList(40)
print(wordList[-10:])
len(wordList)

['latikivn', 'imusicvn', 'ycđt', 'greatland', 'lemmevn', 'hoclamgiau', 'hpschoolvn', 'khochiasevn', 'hakiba', 'heisvn']


141836

In [0]:
#break the word list into batches of 
batches = [wordList[i:i + 10000] for i in range(0, len(wordList), 10000)] 

In [7]:
for batch in batches:
  print(len(batch))

10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
1836


In [0]:
#The link format for soha Viet-Fran is: http://tratu.soha.vn/dict/vi-fr/C%C3%B3

In [0]:
url = "http://tratu.soha.vn/dict/vn_fr/" + quote("như") #resolving the IRI issue

try:
    source = urllib.request.urlopen(url)
except:
    print( "Link broken")
soup = BeautifulSoup(source, 'lxml')

In [0]:
groups = soup.findAll("div", class_ = "section-h3") #soha groups their translations by POS

In [0]:
containers = soup.findAll("div", class_ = "section-h5")

In [12]:
containers[0].find("h5").find("span").contents[0]

'Comme'

In [13]:
containers[0].find("h5").find("span").contents

['Comme']

In [0]:
#detect whether a tag contains a link, if a link is present, the tag contains a context
def hasLink(tag):
    try:
        subTags = tag.findAll('a')
    except:
        return False
    for subTag in subTags:
        if (subTag.get('href')): #found a link
            return True
    return False

In [15]:
hasLink(containers[0].find("dl").find("dd").find("dl").findAll("dd")[0]) 

True

In [0]:
def resolveItalic(parts): #french words use italic to mark an use of "'"
    parts = [str(part) for part in parts]
    if (len(parts) == 1): #nothing special here, just return the string
        return [item.strip() for item in re.split('[.,;]', str(parts[0]))]
    parts = [part.replace("<i>", "'").replace("</i>", "") for part in parts]
    if (parts[-1] == '\n'): #this is an usage, ignore the last part
        return [item.strip() for item in re.split('[.,;]', "".join(parts[:-1]))]
    #this is a translation
    return [item.strip() for item in re.split('[.,;]', "".join(parts))]

In [17]:
temp = containers[0].find("h5").find("span").contents
temp

['Comme']

In [18]:
resolveItalic(temp)

['Comme']

In [19]:
temp = containers[0].find("dl").find("dd").find("dl").findAll("dd")[1].contents
temp

['ils se ressemblent comme deux gouttes d', <i>eau</i>, '\n']

In [20]:
resolveItalic(temp)

["ils se ressemblent comme deux gouttes d'eau"]

In [0]:
def extractContext(tag):
    parts = list(tag.children)
    parts = filter(lambda x: False if x == ' ' or x == '\n' else True, parts) #filter all the non-words out
    parts = [part.contents[0] for part in parts]
    context = " ".join(parts)
    return context

In [22]:
extractContext(containers[0].find("dl").find("dd").find("dl").findAll("dd")[0])

'Họ giống nhau như đúc'

In [0]:
def findTranslation(word):
    print("Finding translation for " + word)
    url = "http://tratu.soha.vn/dict/vn_fr/" + quote(word) #resolving the IRI issue

    try:
        source = urllib.request.urlopen(url)
    except:
        print("Link broken for " + word)
        return "N/A"

    soup = BeautifulSoup(source, 'lxml')
    
    translations = {}
    numTrans = 0
 
    containers = soup.findAll("div", class_ = "section-h5") #getting all the translations for a word
    if (containers is None):
      print("No translations found for " + word)
      return "N/A"

    for trans in containers: #for each translation in this group
        temp = {} #temp dict to store each translation of a word

        #getting the raw translation
        temp['translation'] = resolveItalic(trans.find("h5").find("span").contents)

        try:
            #retrieving the examples and contexts of this translation
            clarifications = trans.find("dl").find("dd").find("dl").findAll("dd")
            temp2 = {} #another dict to store each examples of a translation
            
            numContext = 0;
            numUsage = 0;
            
            for item in clarifications:
                if (hasLink(item)): #this is a context (context contains hyperlink)
                    temp2['context ' + str(numContext+1)] = extractContext(item)
                    numContext += 1
                else: #this is an usage
                    temp2['usage ' + str(numUsage+1)] = resolveItalic(item.contents)
                    numUsage += 1
            
            temp2['nums context'] = numContext
            temp2['nums usage'] = numUsage

            temp['examples'] = temp2
        except:
            temp['examples'] = "N/A"

        translations[numTrans+1] = temp #adding the translation to the dictionary
        numTrans += 1
            
    #storing the number of translations
    translations['nums'] = numTrans
    return translations        

In [24]:
findTranslation("như")

Finding translation for như


{1: {'examples': {'context 1': 'Họ giống nhau như đúc',
   'context 2': 'Như tôi đã nói với anh',
   'context 3': 'Giàu như anh ta',
   'context 4': 'Như thường lệ',
   'nums context': 4,
   'nums usage': 4,
   'usage 1': ["ils se ressemblent comme deux gouttes d'eau"],
   'usage 2': ["comme je vous l'ai dit"],
   'usage 3': ['riche comme il est'],
   'usage 4': ['comme de coutume', "comme d'habitude"]},
  'translation': ['Comme']},
 2: {'examples': {'context 1': 'Tôi cũng vui mừng về điều đó như anh',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ["je m'en réjouis autant que vous"]},
  'translation': ['Autant que']},
 3: {'examples': {'context 1': 'Gia súc như chó mèo',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ['les animaux domestiques tels que le chien', 'le chat']},
  'translation': ['Tel que']},
 4: {'examples': {'context 1': 'Như ông đồng ý tôi sẽ nhường cho ông',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ["si vous êtes d'accord", 'je vous le

In [25]:
findTranslation("tôi")

Finding translation for tôi


{1: {'examples': {'context 1': 'Làm tôi người khác',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ["se faire domestique chez quelqu'un",
    "être au service de quelqu'un",
    '']},
  'translation': ['Serviteur', 'domestique', '']},
 2: {'examples': {'context 1': 'Vua và tôi',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ['le roi et ses sujets', '']},
  'translation': ['Sujet', '']},
 3: {'examples': {'context 1': 'Em tôi tuổi tôi',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ['', '']},
  'translation': ['Je', 'moi', 'memon', 'ma', 'mes', '']},
 4: {'examples': {'context 1': 'Tôi lưỡi gươm',
   'nums context': 1,
   'nums usage': 1,
   'usage 1': ["��tremper une lame d'épée", '']},
  'translation': ['Tremper']},
 5: {'examples': {'context 1': 'Tôi vôi',
   'context 2': 'Cái tôi',
   'nums context': 2,
   'nums usage': 2,
   'usage 1': ['��éteindre de la chaux', ''],
   'usage 2': ['(triết học) ego']},
  'translation': ['Éteindre']},
 'nums': 5}

In [26]:
findTranslation("đã")

Finding translation for đã


{1: {'examples': 'N/A', 'translation': ['Déjà']},
 2: {'examples': 'N/A', 'translation': ['Non seulement']},
 3: {'examples': 'N/A', 'translation': ['Voilà']},
 4: {'examples': 'N/A', 'translation': ['Si']},
 5: {'examples': 'N/A', 'translation': ['À satiété', "jusqu'à satiété"]},
 6: {'examples': 'N/A', 'translation': ['(ít dùng) guérir']},
 'nums': 6}

In [0]:
from google.colab import files
start_time = time.time()

index = 2
batch = batches[index]

dictionary = {}

#making the dictionary for each batch
for word in batch:
    dictionary[word] = findTranslation(word)
    time.sleep(1) #wait for 1 sec between each query

Finding translation for đtht
Finding translation for writers
Finding translation for waxing
Finding translation for tếthương
Finding translation for thángbếp
Finding translation for through
Finding translation for soóc
Finding translation for rùm
Finding translation for rupiah
Finding translation for polysaccharide
Finding translation for nấuxuất
Finding translation for nghệm
Finding translation for nahnh
Finding translation for lỗng
Finding translation for login
Finding translation for kqxsmn
Finding translation for know
Finding translation for khôngdu
Finding translation for kháchphòng
Finding translation for khácmức
Finding translation for kheotaycomvn
Finding translation for imagerunner
Finding translation for httpwwwketsatcaocapvn
Finding translation for hiết
Finding translation for hihi
Finding translation for f15
Finding translation for dẽo
Finding translation for dơn
Finding translation for deg
Finding translation for côt
Finding translation for côngđây
Finding translation for 

In [0]:
#saving the batch
with open('/content/gdrive/My Drive/Colab Notebooks/Dictionary Project/soha VI-FR batch {}.json'.format(index), 'w') as outfile:
  json.dump(dictionary, outfile)
print("--- %s seconds ---" % (time.time() - start_time))

In [0]:
dictionary

In [0]:
findTranslation("insulin")