In [1]:
import glob
import re
import json
import time

In [2]:
# Wall-clock start of the run; the final cell subtracts it to report total runtime.
start_time = time.time()

In [3]:
# Every entry inside the extracted frequency-list directory is an input file.
# glob returns them in arbitrary (file-system dependent) order.
wordsFiles = glob.glob('./Extracted Freq List/*')

In [4]:
# Show which input files were discovered.
wordsFiles

['./Extracted Freq List/freqList300_399',
 './Extracted Freq List/freqList100_199',
 './Extracted Freq List/freqList00_99',
 './Extracted Freq List/freqList600_699',
 './Extracted Freq List/freqList700_799',
 './Extracted Freq List/freqList400_499',
 './Extracted Freq List/freqList500_599',
 './Extracted Freq List/freqList200_299']

In [5]:
def extractWordFreq(line):
    """Parse one "<freq> <item>" line into a (freq, item) pair of strings.

    The line is stripped and split on single spaces. When that does not
    yield exactly two fields, the item is replaced by the placeholder "@"
    so that later filtering rejects the entry; the first field is still
    returned as the frequency.
    """
    fields = line.strip().split(' ')
    if len(fields) != 2:
        # placeholder item, guaranteed to be filtered out downstream
        return fields[0], "@"
    freq, item = fields
    return freq, item

In [6]:
def containsAnyChar(word):
    """Return True when `word` contains at least one Latin-script letter.

    Besides ASCII a-z/A-Z this also recognises accented Latin letters, so
    Vietnamese words written only with such letters (e.g. "ở", "ý") are
    no longer rejected. Digits, punctuation and letters from other scripts
    do not count.
    """
    # ASCII letters, Latin-1 letters (skipping the × and ÷ signs),
    # Latin Extended-A/B, and Latin Extended Additional (Vietnamese tone marks).
    latin_letter = r'[a-zA-Z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u024F\u1E00-\u1EFF]'
    return re.search(latin_letter, word) is not None

In [7]:
# Sanity check: a word mixing an accented letter and a plain letter counts as containing a letter.
containsAnyChar("ấn")

True

In [8]:
# Sanity check: a digits-only string contains no letter.
containsAnyChar("12")

False

In [9]:
def containsForbiddenChar(word):
    """Return True when `word` contains any of the characters @ = { }."""
    # Raw string, and no backslashes: braces are literal inside a character
    # class, and "\{" in a normal string is an invalid escape sequence.
    return re.search(r'[@={}]', word) is not None

In [10]:
# Sanity check: '^' is not one of the forbidden characters.
containsForbiddenChar("^ak")

False

In [11]:
# Sanity check: '{' is a forbidden character.
containsForbiddenChar("{ak")

True

In [12]:
def containsLink(word):
    """Return True when `word` looks like a URL.

    The check is a plain, case-sensitive substring test for "http",
    ".com", ".vn" or ".edu".
    """
    for marker in ("http", ".com", ".vn", ".edu"):
        if marker in word:
            return True
    return False

In [13]:
# Sanity check: anything containing "http" is treated as a link.
containsLink("http:safsa")

True

In [14]:
# Sanity check: a ".edu" fragment is treated as a link.
containsLink("asf.edu")

True

In [15]:
#break an item apart at every non-word character
def splitWord(word):
    """Split `word` at each single non-word character.

    Consecutive separators produce empty strings in the result, which
    callers are expected to filter out.
    """
    separator = re.compile(r'[\W]')
    return separator.split(word)

In [16]:
# Sanity check: two consecutive separators leave an empty string between the words.
splitWord("word..word")

['word', '', 'word']

In [17]:
# Sanity check: a hyphen splits the item into two words.
splitWord("xhdc-sad")

['xhdc', 'sad']

In [18]:
# Sanity check: an item without separators comes back as a single-element list.
splitWord("hello")

['hello']

In [19]:
#normalise a word: drop special characters (spaces are kept), lower-case, trim the ends
def cleanWord(word):
    """Return `word` without special characters, lower-cased and trimmed.

    Every character that is neither a word character nor a space is
    removed; inner spaces survive, leading and trailing ones do not.
    """
    without_specials = re.sub(r'[^\w ]+', '', word)
    lowered = without_specials.lower()
    return lowered.strip()

In [20]:
# Sanity check: punctuation is removed.
cleanWord("hấn%^^&")

'hấn'

In [21]:
# Sanity check: a space inside the text is preserved.
cleanWord("hấn hấn")

'hấn hấn'

In [22]:
# Sanity check: a trailing space is stripped.
cleanWord("hấn ")

'hấn'

In [23]:
# Maps each cleaned word to its frequency summed over all input files.
# The aggregation cell adds into this dict, so re-run this cell first
# whenever the aggregation is re-run; otherwise counts are added twice.
freqList = {}

In [None]:
# Accumulate word frequencies from every input file into freqList.
for wordsFile in wordsFiles:
    print(wordsFile)
    # Explicit encoding so decoding does not depend on the platform's locale
    # default. NOTE(review): assumes the extracted lists are UTF-8 -- confirm.
    with open(wordsFile, 'r', encoding='utf-8') as infile:
        # Iterate the file lazily instead of loading every line into memory.
        for line in infile:
            freq, item = extractWordFreq(line) #extracting the frequency and item from each line

            # Keep only items that contain at least one letter, contain no
            # forbidden character and are not a link.
            if not containsAnyChar(item) or containsForbiddenChar(item) or containsLink(item):
                continue

            count = int(freq)
            #split by special characters; every piece inherits the item's frequency
            for word in splitWord(item):
                if not containsAnyChar(word):
                    # empty pieces and pieces without a letter (e.g. pure digits)
                    continue
                word = cleanWord(word)
                # start at 0 for a new word, otherwise add to the running total
                freqList[word] = freqList.get(word, 0) + count

./Extracted Freq List/freqList300_399


In [None]:
# Number of distinct words collected so far.
len(freqList)

In [None]:
# Build a word -> frequency table from the English reference list.
ENG_WORD_LIMIT = 30000  # only the first 30000 lines of the English list are used
engList = {}
# Explicit encoding so decoding does not depend on the platform's locale
# default. NOTE(review): assumes the list is UTF-8 (or plain ASCII) -- confirm.
with open('./engWordList', 'r', encoding='utf-8') as infile:
    for lineNumber, line in enumerate(infile):
        if lineNumber >= ENG_WORD_LIMIT:
            break
        freq, word = extractWordFreq(line) #extracting the word and frequency from each line

        if word == "@":
            # Placeholder returned for lines that are not exactly
            # "<freq> <word>" (blank lines included); their first field
            # may not be a number, so skip them instead of crashing in int().
            continue

        # start at 0 for a new word, otherwise add to the running total
        engList[word] = engList.get(word, 0) + int(freq)

In [None]:
# Remove every English word (compared in lower case) from the collected
# frequencies; words that were never collected are simply ignored.
for engWord in engList:
    freqList.pop(engWord.lower(), None)

In [None]:
# Number of distinct words left in the frequency table.
len(freqList)

In [None]:
# Most frequent words first. Ties are broken alphabetically so the result
# does not depend on dict insertion order (and therefore not on the
# arbitrary order in which glob listed the input files).
sortedList = sorted(freqList.items(), key=lambda x: (-x[1], x[0]))

In [None]:
# Write the result as one "<freq> <word>" line per word.
# Explicit UTF-8: the words contain non-ASCII letters, and the platform's
# default encoding may be unable to encode them.
with open('./vietWordList', 'w', encoding='utf-8') as outfile:
    for (word, freq) in sortedList:
        outfile.write(str(freq) + " " + word + "\n")

In [None]:
# Report the total wall-clock time since start_time was recorded.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)