In [1]:
import urllib.request 
from urllib.parse import quote 
from bs4 import BeautifulSoup
import json
import re
import time

In [2]:
# Mount Google Drive at /content/gdrive (Colab-only); the data files opened in later cells live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
#Source: https://codezup.com/web-scraping-word-meaning-dictionary-python-beautifulsoup/

In [4]:
# Load the dictionary produced by earlier scraping runs.
# json is already imported in the first cell, so it is not re-imported here.
# The encoding is given explicitly so the read does not depend on the platform's default locale.
with open('/content/gdrive/My Drive/Colab Notebooks/Dictionary Project/vdict VI-EN ver 3.json', encoding='utf-8') as json_file:
    currDict = json.load(json_file)

In [5]:
# Read the word-frequency list produced by An Crúbadán, one "<frequency> <word>" entry per line.
# The file contains Vietnamese text, so decode it explicitly as UTF-8 instead of relying on the locale default.
with open('/content/gdrive/My Drive/Colab Notebooks/Dictionary Project/anCruVietList', encoding='utf-8') as infile:
    wordsData = infile.readlines()

In [6]:
# Peek at the first five raw lines to confirm the "<frequency> <word>" layout.
wordsData[:5]

['11362 việt\n', '6895 việt nam\n', '5210 có thể\n', '5153 lý\n', '4515 i\n']

In [7]:
#generate a list of words from the word list produced by an crubadan
def generateWordList(): 
  temp = []
  for word in wordsData:
    parts = word.strip().split(" ")
    #ignore the frequency
    temp.append(" ".join(parts[1:]))
  return temp

In [8]:
# Strip the frequencies from the raw lines, then show how many entries there are.
wordList = generateWordList()
len(wordList)

34275

In [9]:
# Break the word list into batches of BATCH_SIZE words; the last batch holds whatever remains.
BATCH_SIZE = 10000
batches = [wordList[start:start + BATCH_SIZE] for start in range(0, len(wordList), BATCH_SIZE)]

In [10]:
# Show how many words ended up in each batch.
for batchSize in map(len, batches):
    print(batchSize)

10000
10000
10000
4275


In [11]:
#The link format for vdict Viet-Eng is: https://vdict.com/m%E1%BB%99t,2,0,0.html

In [12]:
#Converting IRI to ASCII:
#https://stackoverflow.com/questions/4389572/how-to-fetch-a-non-ascii-url-with-python-urlopen

In [13]:
def _posLabel(tag):
    """Return the part-of-speech label stored in a POS header tag."""
    text = tag.string
    if text is None:
        # .string is None when the tag does not wrap a single string;
        # fall back to all of the tag's text instead of failing.
        text = tag.get_text()
    # Keep only what precedes the ' \xa0' separator.
    return text.split(' \xa0')[0]


def _collectExamples(translationTag):
    """Collect the example sentences listed under one translation tag.

    Returns a dict with 'nums' (number of examples) plus 'context i' /
    'usage i' entries for each example, or the string "N/A" when an example
    has no element with class "example-original".
    """
    exampleLists = translationTag.find_all("ul", class_="list2")
    collected = {'nums': len(exampleLists)}

    for position, exampleList in enumerate(exampleLists, start=1):
        original = exampleList.find(class_="example-original")
        if original is None:
            return "N/A"
        collected['context ' + str(position)] = original.string

        # The usage is the last child of the example's first <li>.
        firstItem = exampleList.find("li")
        if firstItem is not None and firstItem.contents:
            collected['usage ' + str(position)] = str(firstItem.contents[-1])
        else:
            collected['usage ' + str(position)] = "N/A"

    return collected


def findTranslation(word, timeout=30):
    """Look up `word` on vdict.com (Vietnamese-English) and parse its entry.

    Parameters
    ----------
    word : str
        The word or phrase to look up; it is percent-encoded into the URL.
    timeout : float, optional
        Seconds to wait on the HTTP request before giving up, so that one
        unresponsive request cannot stall a long scraping loop.

    Returns
    -------
    dict or str
        The string "N/A" when the page cannot be fetched, has no
        'result-contents' div, or has no 'phanloai' (part-of-speech) div.
        Otherwise a dict mapping 1..n to
        {'translation': [...], 'POS': str, 'examples': dict or "N/A"},
        plus a 'nums' key holding n.
    """
    print("Finding translation for " + word)
    url = "https://vdict.com/" + quote(word) + ",2,0,0.html" #resolving the IRI issue

    # Fetch the page. `except Exception` (rather than a bare except) still
    # treats any fetch failure as a broken link, but lets KeyboardInterrupt
    # through so a long run can actually be stopped. The `with` block closes
    # the HTTP response.
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            html = response.read()
    except Exception:
        print("Link broken for " + word)
        return "N/A"

    soup = BeautifulSoup(html, 'lxml')

    #make sure the word is in the dictionary
    if soup.find("div", id='result-contents') is None:
        print("Not in the dictionary " + word)
        return "N/A"

    #Getting the first POS tag
    firstPosTag = soup.find("div", class_='phanloai')
    if firstPosTag is None: #empty translation page
        print("Empty translation page " + word)
        return "N/A"

    translations = {}
    curPos = _posLabel(firstPosTag)

    #first translation is a tag away from the first POS tag
    skipped = firstPosTag.next_sibling
    curTag = skipped.next_sibling if skipped is not None else None

    #storing the number of translations a word has
    numTrans = 1

    # Walk the run of sibling tags (POS headers and translations). The run
    # ends at the first text node (normally '\n') or when siblings run out;
    # both cases stop the loop instead of raising on a non-tag node.
    while curTag is not None and not isinstance(curTag, str):
        # .get() so that a tag without a class attribute follows the
        # non-translation path instead of raising KeyError.
        if curTag.get("class") == ["list1"]: #a translation
            boldTag = curTag.find('b')
            rawTranslation = boldTag.string if boldTag is not None else None

            # Cells without a usable translation string are skipped.
            if rawTranslation is not None:
                translations[numTrans] = {
                    'translation': re.split('[,;]', rawTranslation),
                    'POS': curPos,
                    'examples': _collectExamples(curTag),
                }
                numTrans += 1
        else: #a POS tag
            curPos = _posLabel(curTag)
        curTag = curTag.next_sibling

    #storing the number of translations
    translations['nums'] = numTrans - 1
    return translations

In [14]:
# Spot-check the scraper on a single phrase.
findTranslation("đòn bẩy")

Finding translation for đòn bẩy


{1: {'POS': '',
  'examples': {'context 1': 'Dùng đòn bẩy để bẩy hòn đá',
   'nums': 1,
   'usage 1': 'To lift a stone with a lever'},
  'translation': ['Lever']},
 'nums': 1}

In [15]:
# Fetch one page by hand to inspect its structure.
url = "https://vdict.com/" + quote("có") + ",2,0,0.html" #resolving the IRI issue

try:
    # The with-block closes the HTTP response once its body has been read.
    with urllib.request.urlopen(url) as source:
        html = source.read()
except Exception:
    print( "Link broken")
    # Stop here: continuing would parse an undefined (or stale) response.
    raise

soup = BeautifulSoup(html, 'lxml')

In [16]:
# Inspect the node four siblings after the first POS div of the page fetched above.
soup.find("div", class_ = 'phanloai').next_sibling.next_sibling.next_sibling.next_sibling

'\n'

In [17]:
from google.colab import files
start_time = time.time()

# Only one batch is scraped per run of this cell; choose which one here.
index = 3
batch = batches[index]

# Translations gathered for the selected batch, keyed by word.
dictionary = {}

for word in batch:
    # Words already present in the previously scraped dictionary are not queried again.
    if word in currDict:
        continue
    dictionary[word] = findTranslation(word)
    time.sleep(1)  # wait for 1 sec between each query

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Not in the dictionary tình bằng
Finding translation for tìm lời
Not in the dictionary tìm lời
Finding translation for tìm câu
Not in the dictionary tìm câu
Finding translation for tên tội
Not in the dictionary tên tội
Finding translation for tên gì
Not in the dictionary tên gì
Finding translation for tên cán
Not in the dictionary tên cán
Finding translation for tên bộ
Not in the dictionary tên bộ
Finding translation for tâm với
Not in the dictionary tâm với
Finding translation for tâm sửa
Not in the dictionary tâm sửa
Finding translation for tâm khác
Not in the dictionary tâm khác
Finding translation for tâm bảo
Not in the dictionary tâm bảo
Finding translation for tái phát
Finding translation for tác đầu
Not in the dictionary tác đầu
Finding translation for tác quản
Not in the dictionary tác quản
Finding translation for tác phát
Not in the dictionary tác phát
Finding translation for tác nhạc
Not in the dictionary tác nhạ

In [18]:
# Write this batch's translations to Drive, then report the time elapsed since start_time was set.
batchPath = '/content/gdrive/My Drive/Colab Notebooks/Dictionary Project/vdict VI-EN ver 4 batch {}.json'.format(index)
with open(batchPath, 'w') as outfile:
    json.dump(dictionary, outfile)

elapsedSeconds = time.time() - start_time
print("--- %s seconds ---" % elapsedSeconds)

--- 5374.28156375885 seconds ---


In [19]:
# Number of words looked up in this batch.
len(dictionary)

4201

In [20]:
# Spot-check the scraper on a word whose entry has several translations and examples.
findTranslation("cái")

Finding translation for cái


{1: {'POS': 'noun',
  'examples': {'context 1': 'con dại cái mang',
   'nums': 1,
   'usage 1': 'the mother is responsible for the foolishness of her child'},
  'translation': ['Mother']},
 2: {'POS': 'noun',
  'examples': {'context 1': 'cháu đến rủ cái Hoa đi học',
   'nums': 1,
   'usage 1': "I'm calling for Hoa to go to school with her"},
  'translation': ["khẩu ngữ) (denoting a young girl of one's rank or below)"]},
 3: {'POS': 'noun',
  'examples': {'context 1': 'cái giấm',
   'context 2': 'cái mẻ',
   'context 3': 'Banker (trong đám bạc)',
   'context 4': 'ăn cả nước lẫn cái',
   'nums': 4,
   'usage 1': 'mother of vinegar',
   'usage 2': 'mother of fermented cold rice',
   'usage 3': 'Solid part (of liquid food)',
   'usage 4': 'to eat both the liquid and the solid part (of a food)'},
  'translation': ['Mother', ' mother of vinegar']},
 'nums': 3}