In [0]:
import urllib.request 
from urllib.parse import quote 
from bs4 import BeautifulSoup
import json
import re
import time

In [2]:
# Mount Google Drive at /content/gdrive so the notebook can read files stored there.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
#Source: https://codezup.com/web-scraping-word-meaning-dictionary-python-beautifulsoup/

In [4]:
import json

# Load the word list from Drive. The keys of the JSON object are the words to
# look up; the values are not used in this notebook (presumably frequencies,
# judging by the file name -- TODO confirm).
# The encoding is given explicitly because the keys contain Vietnamese
# characters and open()'s default encoding depends on the runtime's locale.
with open('/content/gdrive/My Drive/Colab Notebooks/Dictionary Project/freqlist.json', encoding='utf-8') as json_file:
    wordsData = json.load(json_file)
wordList = list(wordsData.keys())
len(wordList)

14143

In [0]:
#The link format for vdict Viet-Eng is: https://vdict.com/m%E1%BB%99t,2,0,0.html

In [0]:
#Converting IRI to ASCII:
#https://stackoverflow.com/questions/4389572/how-to-fetch-a-non-ascii-url-with-python-urlopen

In [0]:
def _posLabel(tag):
    """Return the part-of-speech label held by a POS header tag.

    The label is the text that precedes the first "space + non-breaking space"
    separator. When the tag has several children, ``tag.string`` is None, so
    the concatenated text of the tag is used instead of crashing.
    """
    text = tag.string
    if text is None:
        text = tag.get_text()
    return text.split(' \xa0')[0]


def _collectExamples(translationTag):
    """Collect the usage examples nested under one translation tag.

    Returns a dict with 'nums' (the number of examples) plus one
    'context k' / 'usage k' pair per example (k starts at 1), or the string
    "N/A" when an example has no 'example-original' element.
    """
    exampleTags = translationTag.find_all("ul", class_="list2")
    examples = {'nums': len(exampleTags)}

    for k, exampleTag in enumerate(exampleTags, start=1):
        original = exampleTag.find(class_="example-original")
        if original is None:
            # a malformed example voids the whole set, as it did before
            return "N/A"
        examples['context ' + str(k)] = original.string

        item = exampleTag.find("li")
        if item is None:
            examples['usage ' + str(k)] = "N/A"
        else:
            markup = str(item)
            # the usage sits between "<br/>" and the closing "</li>" (5 chars each)
            examples['usage ' + str(k)] = markup[markup.find("<br/>") + 5:-5]

    return examples


def findTranslation(word):
    """Scrape the vdict.com Vietnamese-English entry for ``word``.

    Parameters
    ----------
    word : str
        The word to look up; it is percent-encoded before being put in the URL.

    Returns
    -------
    dict or str
        "N/A" when the page cannot be fetched, the word has no result block,
        or the result block has no part-of-speech header. Otherwise a dict
        that maps 1..n to one translation each, where a translation is
        ``{'translation': [pieces], 'POS': label, 'examples': dict or "N/A"}``,
        plus a 'nums' key holding n. The translation pieces are the raw result
        of splitting on ',' and ';' (surrounding whitespace is kept).

    Only ``Exception`` subclasses raised while fetching are treated as a broken
    link; KeyboardInterrupt and SystemExit propagate so a long scraping loop
    can still be stopped.
    """
    print("Finding translation for " + word)
    url = "https://vdict.com/" + quote(word) + ",2,0,0.html" #resolving the IRI issue

    # make sure the link is working; the response is closed as soon as it is read
    try:
        with urllib.request.urlopen(url) as response:
            html = response.read()
    except Exception:
        print("Link broken for " + word)
        return "N/A"

    soup = BeautifulSoup(html, 'lxml')

    # make sure the word is in the dictionary
    if soup.find("div", id='result-contents') is None:
        print("Not in the dictionary " + word)
        return "N/A"

    # the first POS header; without one the page holds no translations
    firstPosTag = soup.find("div", class_='phanloai')
    if firstPosTag is None:
        print("Empty translation page " + word)
        return "N/A"

    translations = {}
    curPos = _posLabel(firstPosTag)

    # the first translation is one node past the first POS header
    separator = firstPosTag.next_sibling
    curTag = separator.next_sibling if separator is not None else None

    numTrans = 0

    # walk the siblings until the '\n' text node that ends the run of
    # POS headers / translations, or until the siblings run out
    while curTag is not None and curTag != '\n':
        if isinstance(curTag, str):
            # any other text or comment node has no attributes: skip it
            pass
        elif curTag.get("class") == ["list1"]: #a translation
            bold = curTag.find('b')
            # cells without a plain-text <b> carry no translation
            if bold is not None and bold.string is not None:
                numTrans += 1
                translations[numTrans] = {
                    'translation': re.split('[,;]', bold.string),
                    'POS': curPos,
                    'examples': _collectExamples(curTag),
                }
        else: #every other tag is taken to be a POS header
            curPos = _posLabel(curTag)
        curTag = curTag.next_sibling

    #storing the number of translations
    translations['nums'] = numTrans
    return translations

In [0]:
# Fetch and parse the page for a single word ("có") so its structure can be
# inspected by hand in the next cell.
url = "https://vdict.com/" + quote("có") + ",2,0,0.html" #resolving the IRI issue

try:
    # the with-block closes the HTTP response once it has been parsed
    with urllib.request.urlopen(url) as source:
        soup = BeautifulSoup(source, 'lxml')
except OSError:
    # URLError/HTTPError are OSError subclasses. Re-raise after the message so
    # the real cause is shown instead of a NameError on an undefined `source`.
    print( "Link broken")
    raise

In [9]:
# Step four siblings past the first POS header of the "có" page to see which node follows it.
soup.find("div", class_ = 'phanloai').next_sibling.next_sibling.next_sibling.next_sibling

'\n'

In [0]:
# Holds the scraped result for each word, keyed by the word; filled in by the scraping loop.
dictionary = {}

In [11]:
# Look up every word in wordList and store the result in `dictionary`.
# Words that already have an entry are skipped, so re-running this cell after
# an interruption resumes the scrape instead of starting over. Reset
# `dictionary` to {} first to force a full re-scrape.
start_time = time.time()
for word in wordList:
    if word in dictionary:
        continue
    dictionary[word] = findTranslation(word)
    time.sleep(1) #wait for 1 sec between each query
print("--- %s seconds ---" % (time.time() - start_time))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Not in the dictionary alexandros
Finding translation for euneos
Not in the dictionary euneos
Finding translation for sarpedon
Not in the dictionary sarpedon
Finding translation for aineias
Not in the dictionary aineias
Finding translation for pandaros
Not in the dictionary pandaros
Finding translation for meleagros
Not in the dictionary meleagros
Finding translation for bellerophontes
Not in the dictionary bellerophontes
Finding translation for laomedon
Not in the dictionary laomedon
Finding translation for ida
Not in the dictionary ida
Finding translation for aulis
Not in the dictionary aulis
Finding translation for ổ.
Not in the dictionary ổ.
Finding translation for tóp
Not in the dictionary tóp
Finding translation for tép
Finding translation for ngặm
Not in the dictionary ngặm
Finding translation for hekabe
Not in the dictionary hekabe
Finding translation for thersites
Not in the dictionary thersites
Finding translatio

In [12]:
# Display the scraped results: each word maps to its translations dict, or to 'N/A' when nothing was found.
dictionary

{'thông': {1: {'POS': 'noun',
   'examples': {'nums': 0},
   'translation': ['pine']},
  2: {'POS': 'adj',
   'examples': {'nums': 0},
   'translation': ['through', ' clear', ' unchocked']},
  3: {'POS': 'verb',
   'examples': {'nums': 0},
   'translation': ['to clear off', ' to unchock', ' to unclog to open']},
  4: {'POS': 'verb',
   'examples': {'nums': 0},
   'translation': ['to allow traffic through']},
  'nums': 4},
 'tin': {1: {'POS': 'noun',
   'examples': {'nums': 0},
   'translation': ['news', ' information']},
  2: {'POS': 'verb', 'examples': {'nums': 0}, 'translation': ['to believe']},
  'nums': 2},
 'ebook': 'N/A',
 'tên': {1: {'POS': 'noun',
   'examples': {'nums': 0},
   'translation': ['arrow name chap', 'bloke']},
  'nums': 1},
 'truyện': {1: {'POS': 'noun',
   'examples': {'nums': 0},
   'translation': ['story']},
  'nums': 1},
 'nguyên': 'N/A',
 'tác': 'N/A',
 'tiếng': {1: {'POS': 'noun',
   'examples': {'nums': 0},
   'translation': ['sound', ' voice language name',

In [0]:
from google.colab import files

# Serialise the scraped dictionary to a JSON file in the working directory,
# then offer that file to the browser as a download.
outputName = 'vdict VI-EN.json'
with open(outputName, 'w') as outfile:
    outfile.write(json.dumps(dictionary))
files.download(outputName)

In [0]:
# Spot-check the scraper on a single word.
findTranslation("cái")