In [1]:
import urllib.request 
from urllib.parse import quote 
from bs4 import BeautifulSoup
import json
import re
import time

In [2]:
#Source: https://codezup.com/web-scraping-word-meaning-dictionary-python-beautifulsoup/

In [3]:
# Load the word-frequency list; each useful line looks like "<word> <count>".
# The encoding is passed explicitly so the Vietnamese text is decoded the same
# way on every platform instead of depending on the locale default.
with open('./vi-words.txt', 'r', encoding='utf-8') as infile:
    lines = [line.split() for line in infile]

wordList = []
for item in lines:
    if len(item) < 2:  # blank or malformed line: there is no count column to test
        continue
    if int(item[1]) >= 2:  # keep words that appeared at least twice
        wordList.append(item[0])
wordList[0:5]

['của', 'và', 'là', 'có', 'không']

In [4]:
#The link format for vdict Viet-Eng is: https://vdict.com/m%E1%BB%99t,2,0,0.html

In [5]:
#Converting IRI to ASCII:
#https://stackoverflow.com/questions/4389572/how-to-fetch-a-non-ascii-url-with-python-urlopen

In [6]:
def _extractPos(posTag):
    """Return the part-of-speech label held by a POS heading tag.

    The label is the tag's text up to the first "space + non-breaking space"
    separator. When the tag has several children (so ``.string`` is None),
    the tag's combined text is used instead of failing.
    """
    text = posTag.string
    if text is None:
        text = posTag.get_text()
    return text.split(' \xa0')[0]


def _extractExamples(entryTag):
    """Collect the usage examples nested under one translation entry.

    Returns a dict with 'nums' (number of examples) plus, for the j-th example
    (1-based), 'context j' (text of the ``example-original`` element) and
    'usage j' (markup of the first ``<li>`` after its first ``<br/>``, with the
    closing ``</li>`` stripped). Returns "N/A" if an example lacks the expected
    ``example-original`` element.
    """
    try:
        exampleTags = entryTag.find_all("ul", class_="list2")
        examples = {'nums': len(exampleTags)}

        for idx, exampleTag in enumerate(exampleTags, start=1):
            context = exampleTag.find(class_="example-original").string
            # Convert to a plain str: a NavigableString keeps a reference to
            # the whole parse tree, which would keep every scraped page alive.
            examples['context ' + str(idx)] = str(context) if context is not None else None

            try:
                itemHtml = str(exampleTag.find("li"))
                # keep what follows "<br/>" (5 chars) and drop the trailing "</li>" (5 chars)
                examples['usage ' + str(idx)] = itemHtml[itemHtml.find("<br/>") + 5:-5]
            except Exception:
                examples['usage ' + str(idx)] = "N/A"

        return examples
    except Exception:
        return "N/A"


def findTranslation(word):
    """Look up ``word`` in the vdict Vietnamese-English dictionary.

    Parameters:
        word: the Vietnamese word to look up; it is percent-encoded before
            being placed in the URL.

    Returns:
        "N/A" when the page cannot be fetched, the word is not in the
        dictionary, or the result page has no part-of-speech heading.
        Otherwise a dict mapping 1..n to one entry per translation, each
        ``{'translation': [str, ...], 'POS': str, 'examples': dict or "N/A"}``,
        plus the key 'nums' holding n.

    Progress and failure messages are printed to stdout.
    """
    print("Finding translation for " + word)
    url = "https://vdict.com/" + quote(word) + ",2,0,0.html" #resolving the IRI issue

    # Fetch the page. Only ordinary errors are treated as a broken link, so
    # KeyboardInterrupt still stops a long scraping run, and the connection is
    # always closed by the context manager.
    try:
        with urllib.request.urlopen(url) as response:
            html = response.read()
    except Exception:
        print("Link broken for " + word)
        return "N/A"

    soup = BeautifulSoup(html, 'lxml')

    #make sure the word is in the dictionary
    if soup.find("div", id='result-contents') is None:
        print("Not in the dictionary " + word)
        return "N/A"

    #Getting the first POS tag
    firstPosTag = soup.find("div", class_='phanloai')
    if firstPosTag is None: #empty translation page
        print("Empty translation page " + word)
        return "N/A"

    translations = {}
    curPos = _extractPos(firstPosTag)

    #first translation is a tag away from the first POS tag
    separator = firstPosTag.next_sibling
    curTag = separator.next_sibling if separator is not None else None

    numTrans = 0

    # Walk the following siblings while they are tags. The walk ends on the
    # first text node (the page uses '\n' after the last entry) or when the
    # siblings run out, instead of raising on None / text nodes.
    while curTag is not None and not isinstance(curTag, str):
        if curTag.get("class") == ["list1"]: #a translation
            headword = curTag.find('b')
            # entries without a plain-text <b> headword carry no translation
            if headword is not None and headword.string is not None:
                numTrans += 1
                translations[numTrans] = {
                    'translation': re.split('[,;]', headword.string),
                    'POS': curPos,
                    'examples': _extractExamples(curTag),
                }
        else: #any other tag is treated as a new POS heading
            curPos = _extractPos(curTag)
        curTag = curTag.next_sibling

    #storing the number of translations
    translations['nums'] = numTrans
    return translations

In [7]:
# Fetch and parse the page for "có" so its structure can be inspected by hand.
url = "https://vdict.com/" + quote("có") + ",2,0,0.html" #resolving the IRI issue

try:
    # the context manager closes the connection once the body has been read
    with urllib.request.urlopen(url) as source:
        html = source.read()
except Exception:
    print( "Link broken")
    raise  # stop here: there is no page to parse, and continuing would use an unbound/stale value
soup = BeautifulSoup(html, 'lxml')

In [8]:
# Probe: step four siblings past the first POS heading of the parsed page.
# The value shown is the '\n' text node on which findTranslation's sibling
# walk stops.
soup.find("div", class_ = 'phanloai').next_sibling.next_sibling.next_sibling.next_sibling

'\n'

In [9]:
# Maps each word to the result of findTranslation(word); filled by the scraping loop.
dictionary = {}

In [None]:
# Translate every word in wordList, waiting between requests so the server is
# not hammered. Words already present in `dictionary` are skipped, so this cell
# can be re-run after an interruption and resume where it stopped; reset
# `dictionary` to {} first to force a full re-scrape.
start_time = time.time()
for word in wordList:
    if word in dictionary:
        continue
    dictionary[word] = findTranslation(word)
    time.sleep(1) #wait for 1 sec between each query
print("--- %s seconds ---" % (time.time() - start_time))

Finding translation for của
Finding translation for và
Finding translation for là
Finding translation for có
Finding translation for không
Finding translation for một
Finding translation for người
Finding translation for cho
Finding translation for trong
Finding translation for được
Finding translation for những
Finding translation for các
Finding translation for đã
Finding translation for với
Finding translation for tôi
Finding translation for để
Finding translation for như
Finding translation for về
Finding translation for này
Finding translation for ra
Finding translation for làm
Finding translation for Việt
Finding translation for thì
Finding translation for thể
Finding translation for ở
Finding translation for cũng
Finding translation for phải
Finding translation for khi
Finding translation for đến
Finding translation for lại
Finding translation for sự
Finding translation for đó
Finding translation for vào
Finding translation for mà
Finding translation for Nam
Finding translation 

Finding translation for tổ
Finding translation for xe
Not in the dictionary xe
Finding translation for đổi
Finding translation for Hoa
Finding translation for cứu
Finding translation for tên
Finding translation for hóa
Finding translation for thêm
Finding translation for Nội
Finding translation for đẹp
Finding translation for thương
Finding translation for thân
Finding translation for phẩm
Finding translation for thay
Finding translation for mẹ
Finding translation for câu
Finding translation for máy
Finding translation for xã
Finding translation for xuất
Finding translation for cứ
Finding translation for phương
Finding translation for nghĩ
Finding translation for tưởng
Finding translation for bình
Finding translation for ban
Finding translation for lượng
Finding translation for Có
Finding translation for bỏ
Finding translation for tất
Finding translation for Nếu
Finding translation for gọi
Finding translation for xin
Finding translation for mục
Finding translation for chung
Finding tra

Finding translation for bàn
Finding translation for đầy
Finding translation for chuyển
Finding translation for Nhà
Finding translation for sở
Finding translation for sư
Finding translation for gửi
Finding translation for nhu
Finding translation for chương
Finding translation for Bộ
Finding translation for chồng
Finding translation for khai
Finding translation for hữu
Finding translation for kẻ
Finding translation for Tuy
Finding translation for du
Finding translation for CSVN
Not in the dictionary CSVN
Finding translation for trời
Finding translation for chịu
Finding translation for Hải
Finding translation for quy
Finding translation for đội
Finding translation for khỏi
Finding translation for Sản
Finding translation for Đông
Not in the dictionary Đông
Finding translation for chia
Finding translation for khu
Finding translation for dễ
Finding translation for Điều
Not in the dictionary Điều
Finding translation for lẽ
Finding translation for vợ
Finding translation for đứng
Finding transl

Not in the dictionary nguyện
Finding translation for lương
Finding translation for hạnh
Finding translation for ác
Finding translation for nguy
Not in the dictionary nguy
Finding translation for Sài
Finding translation for quần
Finding translation for lao
Finding translation for kia
Finding translation for trận
Finding translation for chú
Finding translation for dầu
Finding translation for sẻ
Finding translation for khẩu
Finding translation for can
Finding translation for lạ
Finding translation for tướng
Finding translation for đạt
Finding translation for D
Not in the dictionary D
Finding translation for tỏ
Finding translation for phim
Finding translation for tuyển
Finding translation for thất
Not in the dictionary thất
Finding translation for cư
Finding translation for Gia
Finding translation for suốt
Finding translation for cuốn
Finding translation for mùa
Finding translation for bi
Finding translation for cu
Finding translation for đào
Finding translation for Thứ
Finding translation

Finding translation for rẻ
Finding translation for lửa
Finding translation for kêu
Finding translation for chắn
Finding translation for Cái
Finding translation for trào
Finding translation for dưỡng
Finding translation for Tâm
Finding translation for Mặt
Finding translation for mại
Not in the dictionary mại
Finding translation for cong
Finding translation for Quang
Finding translation for khắp
Finding translation for Thầy
Finding translation for đấy
Finding translation for nhũng
Finding translation for giam
Finding translation for Giang
Finding translation for huy
Not in the dictionary huy
Finding translation for cam
Finding translation for Vậy
Finding translation for giết
Finding translation for Yến
Finding translation for Vào
Finding translation for tấn
Finding translation for tuyến
Finding translation for tim
Finding translation for USD
Not in the dictionary USD
Finding translation for nghề
Finding translation for dịp
Finding translation for thuyền
Finding translation for buộc
Findi

Finding translation for tạm
Finding translation for tỉ
Not in the dictionary tỉ
Finding translation for tung
Finding translation for Thư
Finding translation for rượu
Finding translation for địch
Finding translation for đế
Finding translation for Hương
Finding translation for Đăng
Not in the dictionary Đăng
Finding translation for hào
Finding translation for Quyền
Finding translation for hấp
Finding translation for chục
Finding translation for sàng
Finding translation for rừng
Finding translation for chánh
Finding translation for Tìm
Finding translation for Bí
Finding translation for lành
Finding translation for khảo
Finding translation for dứt
Finding translation for nhẹ
Finding translation for dau
Not in the dictionary dau
Finding translation for van
Finding translation for Tiếng
Finding translation for Câu
Finding translation for Chuyện
Finding translation for dàng
Not in the dictionary dàng
Finding translation for Viet
Not in the dictionary Viet
Finding translation for VNCH
Not in t

Finding translation for quận
Finding translation for VT
Not in the dictionary VT
Finding translation for Tăng
Finding translation for ấm
Finding translation for mái
Finding translation for lặng
Finding translation for bữa
Finding translation for bắn
Finding translation for bảng
Finding translation for sa
Finding translation for khen
Finding translation for hot
Not in the dictionary hot
Finding translation for ch
Not in the dictionary ch
Finding translation for Tam
Finding translation for đập
Finding translation for vinh
Finding translation for tân
Not in the dictionary tân
Finding translation for tái
Finding translation for thả
Finding translation for Bây
Finding translation for Đào
Not in the dictionary Đào
Finding translation for trứng
Finding translation for tao
Finding translation for phiếu
Finding translation for nấu
Finding translation for lão
Finding translation for hủy
Finding translation for hoà
Finding translation for Tòa
Not in the dictionary Tòa
Finding translation for trôn

Not in the dictionary viễn
Finding translation for ngũ
Finding translation for mộng
Finding translation for mươi
Finding translation for hận
Finding translation for Tuấn
Not in the dictionary Tuấn
Finding translation for Thiết
Finding translation for kiệm
Finding translation for cãi
Finding translation for Thuật
Finding translation for ruột
Finding translation for nô
Finding translation for lang
Finding translation for Đặc
Not in the dictionary Đặc
Finding translation for ôm
Finding translation for nỗ
Not in the dictionary nỗ
Finding translation for nhắn
Finding translation for mẽ
Not in the dictionary mẽ
Finding translation for lí
Not in the dictionary lí
Finding translation for he
Finding translation for giấc
Finding translation for ĐCSVN
Not in the dictionary ĐCSVN
Finding translation for vườn
Finding translation for tật
Finding translation for khán
Finding translation for chậm
Finding translation for sáu
Finding translation for im
Finding translation for huấn
Not in the dictionary 

Finding translation for Tu
Finding translation for Lần
Finding translation for Biển
Finding translation for Bao
Finding translation for mũi
Finding translation for cõi
Finding translation for Tấn
Finding translation for Hưng
Not in the dictionary Hưng
Finding translation for CÔNG
Not in the dictionary CÔNG
Finding translation for Ăn
Not in the dictionary Ăn
Finding translation for vỡ
Finding translation for trữ
Finding translation for tre
Finding translation for nắng
Finding translation for khái
Finding translation for hỏa
Finding translation for bụng
Finding translation for bấy
Finding translation for Thăng
Finding translation for thuẫn
Not in the dictionary thuẫn
Finding translation for hảo
Finding translation for cau
Finding translation for bảy
Finding translation for Chân
Finding translation for ám
Finding translation for sàn
Finding translation for ro
Not in the dictionary ro
Finding translation for ngọn
Finding translation for ke
Finding translation for cach
Not in the dictionary

Finding translation for Online
Not in the dictionary Online
Finding translation for Ngôi
Finding translation for Hạ
Finding translation for CÓ
Not in the dictionary CÓ
Finding translation for đai
Finding translation for tuệ
Not in the dictionary tuệ
Finding translation for nồng
Finding translation for não
Finding translation for bịp
Finding translation for Vị
Finding translation for TIN
Finding translation for Quyết
Finding translation for Mạnh
Finding translation for K
Not in the dictionary K
Finding translation for Cá
Finding translation for Bị
Finding translation for tẩy
Finding translation for mờ
Finding translation for má
Finding translation for hi
Not in the dictionary hi
Finding translation for ghế
Finding translation for bão
Finding translation for Huyền
Finding translation for ức
Finding translation for đeo
Finding translation for ngừa
Finding translation for nghịch
Finding translation for khổng
Not in the dictionary khổng
Finding translation for khac
Not in the dictionary kha

Finding translation for cuả
Not in the dictionary cuả
Finding translation for cs
Not in the dictionary cs
Finding translation for bầy
Finding translation for Thí
Finding translation for Song
Finding translation for Phân
Finding translation for Mao
Not in the dictionary Mao
Finding translation for trục
Finding translation for rác
Finding translation for facebook
Not in the dictionary facebook
Finding translation for diệu
Finding translation for bông
Finding translation for bóp
Finding translation for Xây
Finding translation for Quả
Finding translation for Lenin
Not in the dictionary Lenin
Finding translation for Hợp
Finding translation for Hầu
Finding translation for Cửa
Finding translation for CO
Finding translation for rắn
Finding translation for phuc
Not in the dictionary phuc
Finding translation for mạn
Finding translation for móc
Finding translation for liêng
Not in the dictionary liêng
Finding translation for khuyết
Finding translation for khinh
Finding translation for Sàigòn
Not 

Finding translation for nhàng
Not in the dictionary nhàng
Finding translation for mũ
Finding translation for mi
Not in the dictionary mi
Finding translation for lien
Not in the dictionary lien
Finding translation for khôi
Not in the dictionary khôi
Finding translation for hỏng
Finding translation for gác
Finding translation for dc
Not in the dictionary dc
Finding translation for châm
Finding translation for Ngôn
Not in the dictionary Ngôn
Finding translation for NV
Not in the dictionary NV
Finding translation for Cố
Finding translation for xá
Finding translation for kha
Not in the dictionary kha
Finding translation for giường
Finding translation for dở
Finding translation for dập
Finding translation for cắp
Finding translation for chấn
Finding translation for Vang
Finding translation for Tô
Finding translation for Thì
Finding translation for Rating
Not in the dictionary Rating
Finding translation for Ki-tô
Not in the dictionary Ki-tô
Finding translation for Khổng
Not in the dictionary 

Finding translation for nhuận
Not in the dictionary nhuận
Finding translation for ngõ
Finding translation for lung
Finding translation for gạch
Finding translation for dắt
Finding translation for Sứ
Finding translation for Nạn
Finding translation for Bảy
Finding translation for Botswana
Not in the dictionary Botswana
Finding translation for đòn
Finding translation for rong
Finding translation for phiền
Finding translation for loan
Not in the dictionary loan
Finding translation for hôi
Finding translation for dàn
Finding translation for chiêm
Finding translation for Vẫn
Finding translation for Lục
Finding translation for Luận
Finding translation for CHỦ
Not in the dictionary CHỦ
Finding translation for Bảng
Finding translation for đằng
Finding translation for đinh
Finding translation for tạ
Finding translation for lắp
Finding translation for giày
Finding translation for chang
Not in the dictionary chang
Finding translation for VĂN
Not in the dictionary VĂN
Finding translation for Tám
Fi

Finding translation for cuoi
Not in the dictionary cuoi
Finding translation for chợt
Finding translation for TV
Not in the dictionary TV
Finding translation for Phi-e-rơ
Not in the dictionary Phi-e-rơ
Finding translation for Nỗi
Finding translation for Nhan
Not in the dictionary Nhan
Finding translation for CD
Not in the dictionary CD
Finding translation for xít
Not in the dictionary xít
Finding translation for xào
Finding translation for tánh
Finding translation for tie
Not in the dictionary tie
Finding translation for thoai
Not in the dictionary thoai
Finding translation for quyen
Not in the dictionary quyen
Finding translation for phat
Not in the dictionary phat
Finding translation for nạt
Finding translation for nhái
Finding translation for nghiệt
Not in the dictionary nghiệt
Finding translation for múa
Finding translation for mét
Finding translation for méo
Finding translation for lề
Finding translation for gõ
Finding translation for fan
Not in the dictionary fan
Finding translati

Finding translation for Bách
Finding translation for ư
Not in the dictionary ư
Finding translation for đỗ
Finding translation for vỗ
Finding translation for thối
Finding translation for rủ
Finding translation for phiêu
Not in the dictionary phiêu
Finding translation for nại
Finding translation for nhì
Finding translation for lỏng
Finding translation for khoang
Finding translation for gót
Finding translation for giup
Not in the dictionary giup
Finding translation for cóc
Finding translation for Xác
Finding translation for Tú
Not in the dictionary Tú
Finding translation for Truyện
Finding translation for Thức
Finding translation for Thịt
Finding translation for THÔNG
Not in the dictionary THÔNG
Finding translation for Sony
Not in the dictionary Sony
Finding translation for Nhanh
Finding translation for NG
Not in the dictionary NG
Finding translation for Guinée
Not in the dictionary Guinée
Finding translation for Giảm
Finding translation for Cục
Finding translation for Cải
Finding transla

Finding translation for TẾ
Not in the dictionary TẾ
Finding translation for Stalin
Not in the dictionary Stalin
Finding translation for Quần
Finding translation for Philippines
Not in the dictionary Philippines
Finding translation for Mạc
Finding translation for Myanmar
Not in the dictionary Myanmar
Finding translation for MP
Not in the dictionary MP
Finding translation for Hiển
Not in the dictionary Hiển
Finding translation for HQ
Not in the dictionary HQ
Finding translation for Go
Finding translation for Estonia
Not in the dictionary Estonia
Finding translation for Chau
Finding translation for BẠN
Not in the dictionary BẠN
Finding translation for Bac
Not in the dictionary Bac
Finding translation for Đâu
Not in the dictionary Đâu
Finding translation for xỉ
Finding translation for xi
Finding translation for tuỳ
Not in the dictionary tuỳ
Finding translation for trợn
Finding translation for rộn
Finding translation for rễ
Finding translation for nản
Finding translation for nguyệt
Not in t

Finding translation for Bluetooth
Not in the dictionary Bluetooth
Finding translation for BAO
Finding translation for ách
Finding translation for vú
Finding translation for up
Not in the dictionary up
Finding translation for sét
Finding translation for rợ
Finding translation for rớt
Finding translation for oi
Finding translation for nhôm
Finding translation for nguội
Finding translation for ngoi
Finding translation for mốc
Finding translation for lầu
Finding translation for kênh
Finding translation for dốc
Finding translation for dem
Not in the dictionary dem
Finding translation for chao
Finding translation for bỏng
Finding translation for benh
Not in the dictionary benh
Finding translation for Thừa
Finding translation for Thụ
Not in the dictionary Thụ
Finding translation for THỂ
Not in the dictionary THỂ
Finding translation for Sáng-thế
Not in the dictionary Sáng-thế
Finding translation for OP
Not in the dictionary OP
Finding translation for NLĐO
Not in the dictionary NLĐO
Finding tra

Not in the dictionary VI
Finding translation for USA
Not in the dictionary USA
Finding translation for TÔI
Not in the dictionary TÔI
Finding translation for Tuệ
Not in the dictionary Tuệ
Finding translation for Tran
Not in the dictionary Tran
Finding translation for Tim
Finding translation for Thuỷ
Not in the dictionary Thuỷ
Finding translation for Sô
Finding translation for Suốt
Finding translation for Nền
Finding translation for Nghĩ
Finding translation for NHIỀU
Not in the dictionary NHIỀU
Finding translation for Mối
Finding translation for Máu
Finding translation for Miên
Not in the dictionary Miên
Finding translation for Ky
Finding translation for Federer
Not in the dictionary Federer
Finding translation for Dien
Not in the dictionary Dien
Finding translation for Cẩm
Finding translation for Cầm
Finding translation for Chuẩn
Finding translation for Cai
Finding translation for CNTT
Not in the dictionary CNTT
Finding translation for CHÚNG
Not in the dictionary CHÚNG
Finding translati

Finding translation for liệm
Finding translation for khan
Finding translation for giòng
Not in the dictionary giòng
Finding translation for gif
Not in the dictionary gif
Finding translation for comment
Not in the dictionary comment
Finding translation for chót
Finding translation for chênh
Finding translation for chien
Not in the dictionary chien
Finding translation for chiec
Not in the dictionary chiec
Finding translation for chie
Not in the dictionary chie
Finding translation for búa
Finding translation for bieu
Not in the dictionary bieu
Finding translation for You
Not in the dictionary You
Finding translation for Vượt
Finding translation for Vòng
Finding translation for Tấm
Finding translation for TẠI
Not in the dictionary TẠI
Finding translation for TÂM
Not in the dictionary TÂM
Finding translation for THÁP
Not in the dictionary THÁP
Finding translation for Rô-ma
Not in the dictionary Rô-ma
Finding translation for Phường
Finding translation for Phía
Finding translation for PV
Not 

Finding translation for Vn
Not in the dictionary Vn
Finding translation for Tuân
Not in the dictionary Tuân
Finding translation for Tuyết
Finding translation for Thỉnh
Finding translation for Thăm
Finding translation for Teen
Not in the dictionary Teen
Finding translation for THỬ
Not in the dictionary THỬ
Finding translation for THƯƠNG
Not in the dictionary THƯƠNG
Finding translation for Phối
Not in the dictionary Phối
Finding translation for Olympic
Not in the dictionary Olympic
Finding translation for Ngu
Finding translation for Mừng
Finding translation for Mức
Finding translation for Lượt
Finding translation for Ki
Finding translation for James
Not in the dictionary James
Finding translation for HÀNG
Not in the dictionary HÀNG
Finding translation for Huấn
Not in the dictionary Huấn
Finding translation for Honda
Not in the dictionary Honda
Finding translation for HY
Not in the dictionary HY
Finding translation for George
Not in the dictionary George
Finding translation for GHz
Not in

Not in the dictionary etc
Finding translation for dể
Not in the dictionary dể
Finding translation for dê
Finding translation for cụt
Finding translation for chồn
Finding translation for chướng
Finding translation for catwalk
Not in the dictionary catwalk
Finding translation for bừa
Finding translation for bẻ
Finding translation for bái
Finding translation for Vịnh
Finding translation for VÀO
Not in the dictionary VÀO
Finding translation for VietNam
Not in the dictionary VietNam
Finding translation for VietGian
Not in the dictionary VietGian
Finding translation for Tần
Not in the dictionary Tần
Finding translation for Rao
Finding translation for Quỷ
Finding translation for Phỏng
Finding translation for NhanDan
Not in the dictionary NhanDan
Finding translation for Miễn
Finding translation for Me
Finding translation for London
Not in the dictionary London
Finding translation for Kêu
Finding translation for Kitô
Not in the dictionary Kitô
Finding translation for Khí
Finding translation for

In [None]:
# Display the collected translations.
# NOTE(review): this renders the entire mapping; with a long word list the
# output can be very large — consider inspecting a small sample instead.
dictionary

In [None]:
# Persist the scraped translations as JSON (default settings, so non-ASCII
# characters are written as \u escapes).
with open('vdict VI-EN.json', 'w') as outfile:
    outfile.write(json.dumps(dictionary))

In [None]:
# Spot-check the scraper on a single word.
findTranslation("cái")