In [1]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import glob
from nltk.stem.snowball import SnowballStemmer
from itertools import chain #untuk flaten array 2d

In [2]:
def readFile(directory):
    """Return the paths of all ``.txt`` files directly inside ``directory``.

    The search pattern is built by plain string concatenation, so every
    returned path starts with ``directory`` exactly as it was passed in.
    The order is whatever ``glob.glob`` yields; callers that need a stable
    order must sort the result themselves.
    """
    pattern = directory + "/*.txt"
    return glob.glob(pattern)

## normalisasi

In [3]:
def normalisasi(body):
    """Normalise the raw text of one document into a space-separated string.

    Steps, in order:
      1. Replace the DATE/TITLE/BODY markup tags and newlines with spaces,
         then collapse whitespace runs.
      2. Drop the ``&lt;`` HTML entity.
      3. Pull out dates (e.g. ``26-FEB-1987``), timestamps, e-mail addresses
         and URLs so the punctuation stripping below cannot break them apart.
      4. Remove possessive ``'s`` suffixes, then replace punctuation with
         spaces.
      5. Append the items extracted in step 3 to the end of the text.

    Args:
        body: Raw text of one document.

    Returns:
        The normalised text. Tokens are separated by single spaces; the
        string may start and/or end with a single space.
    """
    spasiLebihDariSatu = r'\s+'

    # Strip markup tags and line breaks, then squeeze whitespace.
    body = re.sub(r'<DATE>|</DATE>|<TITLE>|</TITLE>|<BODY>|</BODY>', ' ', body)
    body = re.sub(r'\n', ' ', body)
    body = re.sub(spasiLebihDariSatu, ' ', body)

    # Remove the "&lt;" HTML entity.
    body = re.sub(r"&lt;", '', body)

    # Patterns whose matches must survive punctuation stripping intact.
    # Each pattern's first group is the whole match.
    tanggal = r"((\d{2}|\d{4})[.-](J(anuary|an|AN|une|un|UN|uly|ul|UL)|F(ebruary|eb|EB)|M(arch|ar|AR|ay|AY)|A(pril|pr|PR|ugust|ug|UG)|S(eptember|ep|EP)|O(ctober|ct|CT)|N(ovember|ov|OV)|D(ecember|ec|EC))[-.](\d{4}|\d{2}))"
    waktu = r"(\d{2}:\d{2}:\d{2}(\.)?\d{2})"
    # Raw strings: the same regexes as before, without the invalid string
    # escapes (backslash-w, backslash-dot, backslash-slash) that newer Python
    # versions warn about.
    email = r"([\w-]+(\.[\w-]+|\.)*@[\w-]+(\.[\w-]+)+)"
    # The path tail is "\S*" rather than ".*": the text has no newlines left
    # at this point, so ".*" used to swallow everything after the first URL
    # that had a path.
    url = r"((http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/\S*)?)"

    hasil = []
    for pola in (tanggal, waktu, email, url):
        hasil.append(re.findall(pola, body))
        body = re.sub(pola, "", body)

    # Drop possessive 's. This has to happen BEFORE punctuation stripping,
    # which turns every apostrophe into a space; the word boundary keeps
    # quoted words such as 'said from losing their leading "s".
    body = re.sub(r"'s\b", '', body)

    tanda_baca = r'["()<>+?.\[\]{}:,\'\'/-]'
    body = re.sub(tanda_baca, ' ', body)
    body = re.sub(spasiLebihDariSatu, ' ', body)

    # Every pattern has several groups, so findall yields tuples; element 0
    # is the full matched text.
    temp_str = "".join(cocok[0] + " " for cocok in chain.from_iterable(hasil))

    body = body + " " + temp_str
    body = re.sub(spasiLebihDariSatu, ' ', body)

    return body

In [4]:
def readText(file):
    """Read a text file and return its entire contents as one string.

    Args:
        file: Path of the file to read. It is opened in text mode with the
            platform default encoding.

    Returns:
        The file contents.
    """
    # ``with`` closes the handle even when read() raises.
    with open(file, 'r') as f:
        return f.read()

In [5]:
def removeEnter(listOfWords):
    """Delete every newline character from each string of the list.

    The list is modified in place and the same list object is returned,
    e.g. ``['<BODY>' + newline]`` becomes ``['<BODY>']``.
    """
    for idx, word in enumerate(listOfWords):
        listOfWords[idx] = word.replace('\n', '')

    return listOfWords

In [6]:
def removeNull(listOfWords):
    """Return a new list holding only the truthy items of ``listOfWords``.

    Typically used to drop empty strings:
    ``['', '', 'eat', 'food', '', '']`` becomes ``['eat', 'food']``.
    The input list itself is left untouched.
    """
    return [word for word in listOfWords if word]

In [7]:
def caseFolding(listOfWords):
    """Case-fold every string of the list with ``str.casefold``.

    The list is modified in place and the same list object is returned.
    """
    for idx, word in enumerate(listOfWords):
        listOfWords[idx] = word.casefold()

    return listOfWords

<h3>Load Stop Words from NLTK</h3>

In [8]:
# English stop words from the NLTK stopwords corpus; removeStopWord() filters tokens against this list.
stop_words = stopwords.words('english')

In [9]:
def removeStopWord(listOfWords, stopWords=None):
    """Remove every stop word from ``listOfWords`` in place and return it.

    Earlier versions called ``list.remove`` while iterating over the same
    list. Each removal shifts the remaining items left, so the iterator
    skipped the item right after every removed one and consecutive stop
    words survived. The list is now rebuilt in a single pass instead.

    Args:
        listOfWords: List of tokens. Matching is exact, so tokens should
            already be case-folded.
        stopWords: Optional iterable of stop words. Defaults to the
            module-level ``stop_words`` list.

    Returns:
        The same list object, with all stop words removed.
    """
    if stopWords is None:
        stopWords = stop_words
    stopSet = set(stopWords)  # O(1) membership tests

    # Slice assignment keeps the caller's list object, as before.
    listOfWords[:] = [word for word in listOfWords if word not in stopSet]

    return listOfWords

<h3>Load Lemmatizer from NLTK</h3>

In [10]:
# Shared WordNetLemmatizer instance used by lemmatization().
lemmatizer = WordNetLemmatizer()

In [11]:
def lemmatization(listOfWords):
    """Replace each token by its lemma from the module-level ``lemmatizer``.

    The list is modified in place and the same list object is returned.
    """
    for idx, word in enumerate(listOfWords):
        listOfWords[idx] = lemmatizer.lemmatize(word)

    return listOfWords

<h3>Load Stemmer from NLTK</h3>

In [12]:
def stemming(listOfWords):
    """Return a new list with every token stemmed by an English Snowball stemmer.

    A fresh ``SnowballStemmer("english")`` is created on each call; the
    input list is not modified.
    """
    snowball = SnowballStemmer("english")
    return list(map(snowball.stem, listOfWords))

# Token

In [13]:
def make_token(txt):
    """Split ``txt`` on single space characters and return the pieces.

    Because the separator is exactly one space, consecutive spaces (and a
    leading or trailing space) produce empty strings in the result.
    """
    return txt.split(" ")

<h1>Main Pre-processing</h1>

In [14]:
def preprocessing(file):
    """Turn one document file into its list of index terms.

    Pipeline: read the file, normalise the text, split it into tokens, drop
    empty tokens, case-fold, remove stop words (one pass) and stem.
    Lemmatization is defined in this notebook but is not applied here.

    Args:
        file: Path of the document to process.

    Returns:
        List of stemmed tokens.
    """
    normalised = normalisasi(readText(file))
    tokens = removeNull(make_token(normalised))
    tokens = caseFolding(tokens)
    tokens = removeStopWord(tokens)

    return stemming(tokens)

In [15]:
# listOfFiles = readFile('DataRouter_2')
# listOfWordsOfFile = []

# for i in listOfFiles: #iterasi tiap file
#     txt = readText(i)
#     txt = normalisasi(txt)
#     listOfWords = make_token(txt)
#     listOfWords = removeNull(listOfWords)
#     listOfWords = caseFolding(listOfWords)
#     listOfWords = removeStopWord(listOfWords)
#     listOfWords = stemming(listOfWords)
#     listOfWords = lemmatization(listOfWords)
    
# return listOfWords

<h1>Compute TF</h1>

In [16]:
def countWord(listOfWords):
    """Count how often each token occurs.

    Args:
        listOfWords: Tokens of one already-preprocessed text.

    Returns:
        Dict mapping each distinct token to its number of occurrences, with
        keys in order of first appearance.
    """
    counts = {}
    for token in listOfWords:
        counts[token] = counts.get(token, 0) + 1

    return counts

In [17]:
import math
# tf = 1 + log2(f(i,j)), where f(i,j) is the number of times term i occurs in document j
def computeTFLog(dictWordCount):
    """Convert raw counts to log-scaled term frequencies, in place.

    Each count ``c`` is replaced by ``1 + log2(c)``. The same dict object is
    returned.
    """
    for term, rawCount in dictWordCount.items():
        # Only values are reassigned, so iterating the dict stays valid.
        dictWordCount[term] = 1 + math.log(rawCount, 2)

    return dictWordCount

In [18]:
def insertDocNum(_dict, _file):
    """Tag every value of ``_dict`` with the document number parsed from ``_file``.

    The number is read from characters 4-6 of the file's base name (the part
    before the first dot), so names are expected to consist of a
    four-character prefix followed by a three-digit number, for example
    ``Data012.txt`` gives 12.

    Args:
        _dict: Dict mapping term to a value; modified in place so that each
            value becomes ``[docNumber, value]``.
        _file: Path of the document. Any directory depth and both ``/`` and
            backslash separators are accepted.

    Returns:
        The same dict object.

    Raises:
        ValueError: If characters 4-6 of the base name are not an integer.
    """
    # Use the LAST path component. The old code took component 1 of a
    # '/'-split, which only worked for exactly "dir/file" paths.
    baseName = _file.replace('\\', '/').split('/')[-1]
    stem = baseName.split('.')[0]
    docNum = int(stem[4:7])

    for term in _dict:
        _dict[term] = [docNum, _dict[term]]

    return _dict

In [19]:
def tf(listOfWords, _file):
    """Build the term-frequency postings of one document.

    Counts the tokens, converts the counts to ``1 + log2(count)`` and tags
    each value with the document number parsed from ``_file``.

    Returns:
        Dict mapping term to ``[docNumber, tf]``.
    """
    return insertDocNum(computeTFLog(countWord(listOfWords)), _file)

<h1>Compute IDF</h1>

In [20]:
import math
# idf = log2(1 + (N/ni))
def countIDF(_N, _ni):
    """Return the inverse document frequency ``log2(1 + _N / _ni)``.

    Args:
        _N: Document count used as the numerator.
        _ni: Number of documents containing the term; must be non-zero.
    """
    ratio = _N / _ni
    return math.log(1 + ratio, 2)
        

<h1>Count Weight</h1>

In [21]:
# weight = (1 + log2(f(i,j))) x log2(1 + N/ni)
def countWeight(_dict, _fileCount):
    """Turn the stored TF values into TF-IDF weights, in place.

    Each value of ``_dict`` is either

    * a list of postings ``[[doc, tf], [doc, tf], ...]`` (the shape produced
      by ``fixFirstList``), or
    * a single flat posting ``[doc, tf]``.

    Every ``tf`` is multiplied by ``countIDF(_fileCount, df)``, where ``df``
    is the length of the posting list (1 for a flat posting).

    Fixes compared with the earlier version:

    * The loop ran over ``range(0, len - 1)`` and so never weighted the last
      posting; terms occurring in a single document were not weighted at all.
    * A flat posting used its own length (2) as document frequency.

    Entries of a posting list that are not lists themselves (such as a
    trailing scalar) are left untouched.

    Args:
        _dict: Dict mapping term to posting(s); modified in place.
        _fileCount: Document count passed on to ``countIDF``.

    Returns:
        The same dict object.
    """
    for term in _dict:
        postings = _dict[term]

        if isinstance(postings[0], list):
            # 2-D shape: 'export': [[1, 2.0], [2, 1.0], ...]
            idf = countIDF(_fileCount, len(postings))
            for posting in postings:
                if isinstance(posting, list):
                    posting[1] = posting[1] * idf
        else:
            # Flat shape: 'export': [1, 2.0] -- one document holds the term.
            idf = countIDF(_fileCount, 1)
            postings[1] = postings[1] * idf

    return _dict

<h1>Main</h1>

In [22]:
def main(fileDir):
    """Build the TF-IDF index for every ``.txt`` file in ``fileDir``.

    The files are processed in sorted path order. The resulting index is
    written through ``saveToTxtFile`` and the file counter is written to
    ``sum_of_file.txt`` in the current working directory.

    Args:
        fileDir: Directory holding the documents.
    """
    fileCounter = 1

    wholeDictWordCount = {}
    listOfFiles = readFile(fileDir)
    listOfFiles.sort()
    for path in listOfFiles:
        listOfWord = preprocessing(path)
        tempDictWordCount = tf(listOfWord, path)
        wholeDictWordCount = addToWholeDictWordCount(tempDictWordCount, wholeDictWordCount, fileCounter)
        fileCounter += 1
        # A term seen in several files now looks like
        # 'export': [1, 2, [2, 1], [3, 5], [4, 3]]

    # Normalise to 'export': [[1, 2], [2, 1], [3, 5], [4, 3]]
    wholeDictWordCount = fixFirstList(wholeDictWordCount)

    # NOTE(review): fileCounter starts at 1 and is bumped after each file, so
    # it equals (number of files + 1) here. That value is used as N for the
    # IDF and written to sum_of_file.txt. Left unchanged because a reader of
    # sum_of_file.txt may depend on it -- confirm before correcting.
    wholeDictWordCount = countWeight(wholeDictWordCount, fileCounter)

    saveToTxtFile(wholeDictWordCount)
    # ``with`` closes the handle even if the write fails.
    with open('sum_of_file.txt', 'w') as counterFile:
        counterFile.write(str(fileCounter))

In [23]:
def addToWholeDictWordCount(_tempDict, _wholeDict, _fileCounter):
    """Merge one document's postings into the corpus-wide dictionary.

    Intermediate shape kept in ``_wholeDict``: the first posting of a term is
    stored flat and every later posting is appended as a nested list, e.g.
    ``'export': [1, 2.0, [2, 1.0], [3, 5.0]]``. ``fixFirstList`` converts
    this to a uniform list of postings afterwards.

    The earlier version tried to detect the stored shape by indexing the KEY
    string (``word[0][0]``) inside a bare ``except``. That check succeeds for
    every non-empty key, so the code always appended; only an empty-string
    key took the other branch and produced a shape ``fixFirstList`` cannot
    handle. The append is now unconditional, which is identical for all
    non-empty keys.

    Args:
        _tempDict: Dict mapping term to ``[doc, tf]`` for the new document.
        _wholeDict: Accumulated dictionary; modified in place.
        _fileCounter: Unused; kept for interface compatibility.

    Returns:
        ``_tempDict`` itself when ``_wholeDict`` is still empty, otherwise
        the updated ``_wholeDict``.
    """
    if not _wholeDict:
        # Nothing merged yet: the first document's dict becomes the accumulator.
        return _tempDict

    for word, posting in _tempDict.items():
        if word in _wholeDict:
            _wholeDict[word].append(posting)
        else:
            _wholeDict[word] = posting

    return _wholeDict

In [24]:
def fixFirstList(_dict):
    """Normalise every value of ``_dict`` to a list of postings, in place.

    * A value with more than two items, ``[doc, tf, [doc, tf], ...]``, has
      its first two items wrapped into a posting of their own:
      ``[[doc, tf], [doc, tf], ...]``.
    * Any other value is wrapped as a whole: ``[doc, tf]`` becomes
      ``[[doc, tf]]``.

    Returns:
        The same dict object.
    """
    for term in _dict:
        entries = _dict[term]
        if len(entries) > 2:
            firstPosting = [entries[0], entries[1]]
            _dict[term] = [firstPosting] + list(entries[2:])
        else:
            _dict[term] = [entries]

    return _dict

In [25]:
def saveToTxtFile(_dict):
    """Write the index to ``tf-idf.txt`` in the current working directory.

    One line per term, terms in sorted order. Each line is the term, a tab,
    then ``str()`` of every item of the term's value, each followed by ``;``.

    Args:
        _dict: Dict mapping term (str) to an iterable of postings.
    """
    # ``with`` closes the file even if a write fails part-way through.
    with open('tf-idf.txt', 'w') as out:
        for term in sorted(_dict.keys()):
            postings = ''.join(str(posting) + ';' for posting in _dict[term])
            out.write(term + '\t' + postings + '\n')

In [28]:
import time

# Run the whole indexing pipeline on the 'DataRouter' directory and report
# the elapsed wall-clock time.
start_time = time.time()
main('DataRouter')
print("--- %s seconds ---" % (time.time() - start_time,))

--- 1.5878386497497559 seconds ---


In [27]:
# a = {
#     'bahia':[[1,4.3]],
#     'cocoa':[[1,1.3],[4,1]],
#     'continu':[[1,3],[2,2],[3,3],[4,4],3]
# }
# saveToTxtFile(a)
# print(a['bahia'][0][1])