In [1]:
import glob
import re
import json

In [2]:
wordsFiles =glob.glob('./Viet Vocab/*')

In [3]:
wordsFiles

['./Viet Vocab/freqList100_199',
 './Viet Vocab/freqList300_399',
 './Viet Vocab/freqList200_299',
 './Viet Vocab/freqList700_799',
 './Viet Vocab/freqList00_99',
 './Viet Vocab/freqList600_699',
 './Viet Vocab/freqList500_599',
 './Viet Vocab/freqList400_499']

In [4]:
def extractWordFreq(line):
    temp = line.strip().split(' ')
    if (len(temp) == 2):
        return temp[0], temp[1] #freq, word
    else:
        return temp[0], "@" #freq, random character to make sure this word is not chosen

In [5]:
def containsAnyChar(word):
    return (re.search('[a-zA-Z]', word) is not None)

In [6]:
containsAnyChar("ấn")

True

In [7]:
containsAnyChar("12")

False

In [8]:
def containsForbiddenChar(word):
    return (re.search('[&@$#%^<>=\{\}]', word) is not None)

In [9]:
containsForbiddenChar("^ak")

True

In [10]:
def cleanWord(word):
    return re.sub(r'[^\w]', '', word).lower()

In [11]:
cleanWord("hấn%^^&")

'hấn'

In [12]:
freqList = {}

In [13]:
for wordsFile in wordsFiles:
    with open(wordsFile, 'r') as infile:
        lines = infile.readlines()
        for line in lines:
            freq, word = extractWordFreq(line) #extracting the word and frequency from each line
            
            #if the word contains at least one character and it does not contain any forbidden characters
            if (containsAnyChar(word) and not containsForbiddenChar(word)): 
                word = cleanWord(word)
                if word not in freqList: #initialize the word if it has not been found
                    freqList[word] = int(freq)
                else: #add to the frequency if the word is already in there
                    freqList[word] = int(freqList[word]) + int(freq)

In [14]:
freqList

{'và': 41964967,
 'của': 34197786,
 'có': 34399345,
 'các': 31469021,
 'là': 26945267,
 'được': 24715398,
 'cho': 21633241,
 'với': 22682183,
 'trong': 23021067,
 'không': 20632578,
 'một': 18579659,
 'những': 17009113,
 'công': 18419273,
 'người': 16242779,
 'thể': 13666407,
 'đến': 11677369,
 'hàng': 11692886,
 'khi': 11855860,
 'bạn': 13427216,
 'sẽ': 10370491,
 'tại': 11155957,
 'từ': 11043322,
 'về': 11356742,
 'nhiều': 10090332,
 'dụng': 10327830,
 'làm': 9859596,
 'sản': 9348696,
 'trên': 9649007,
 'như': 10177145,
 'sự': 9056980,
 'hiện': 10377876,
 'động': 9657072,
 'việc': 9869211,
 'vào': 8202802,
 'khách': 7097533,
 'cũng': 8449072,
 'ra': 9074533,
 'thành': 9682576,
 'nhà': 9604625,
 'bị': 7686782,
 'hợp': 8786776,
 'đầu': 8273815,
 'số': 9221759,
 'này': 10229947,
 'năm': 8683062,
 'sử': 6684298,
 'chỉ': 7874021,
 'phải': 6991033,
 'phẩm': 6507941,
 'lại': 7233352,
 'giá': 6930232,
 'ngày': 8940782,
 'quan': 7708516,
 'nước': 7784924,
 'nhất': 7898312,
 'thông': 8656613,


In [15]:
with open('./Dict Scraping/bigFreqList.json', 'w') as outfile:
    json.dump(freqList, outfile)