In [1]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import glob
from nltk.stem.snowball import SnowballStemmer
from itertools import chain #untuk flaten array 2d

In [2]:
def readFile(directory):
    listFile = glob.glob(directory+"/*.txt")
    return listFile

## normalisasi

In [3]:
def normalisasi(body):
    
    hasil = []
    temp_str = ''

    #hapus tag
    hapusTag = r'<DATE>|</DATE>|<TITLE>|</TITLE>|<BODY>|</BODY>'
    body = re.sub(hapusTag, ' ', body)
    
    #hapus \n
    hapusEnter = r'\n'
    body = re.sub(hapusEnter, ' ', body)
    
    #hapus spasi lebih dari satu
    spasiLebihDariSatu = r'\s+'
    body = re.sub(spasiLebihDariSatu, ' ', body)
    
    # hapus htmlEntities 
    htmlEntities = r"&lt;"
    body = re.sub(htmlEntities, '', body)
    
    #ambil tanggal
    tanggal = r"((\d{2}|\d{4})[.-](J(anuary|an|AN|une|un|UN|uly|ul|UL)|F(ebruary|eb|EB)|M(arch|ar|AR|ay|AY)|A(pril|pr|PR|ugust|ug|UG)|S(eptember|ep|EP)|O(ctober|ct|CT)|N(ovember|ov|OV)|D(ecember|ec|EC))[-.](\d{4}|\d{2}))"
    list_tanggal = re.findall(tanggal,body)
    hasil.append(list_tanggal)
    body = re.sub(tanggal, "", body)
    
    #ambil waktu
    waktu = r"(\d{2}:\d{2}:\d{2}(\.)?\d{2})"
    list_waktu = re.findall(waktu,body)
    hasil.append(list_waktu)
    body = re.sub(waktu, "", body)
    
    #ambil angka desimal, persen, mata uang 
    desimal = "(([+-]?(\$|£)?(\d{1,3}[.,])*(\d+)(%)?))"
    list_desimal = re.findall(desimal, body)
    hasil.append(list_tanggal)
    body = re.sub(desimal, "", body)
    
    #email
    email = "([\w-]+(\.[\w-]+|\.)*@[\w-]+(\.[\w-]+)+)"
    list_email = re.findall(email,body)
    hasil.append(list_email)
    body = re.sub(email, "", body)
    
    #url
    url = "((http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?)"
    list_url = re.findall(url,body)
    hasil.append(list_url)
    body = re.sub(url, "", body)
    
    tanda_baca = r'["()<>+?.\[\]{}:,\'\'/-]'
    body = re.sub(tanda_baca, ' ', body)

    body = re.sub(spasiLebihDariSatu, ' ', body)
    
    kutips = r"'s"
    body = re.sub(kutips, '', body)
    
    list_flat = list(chain.from_iterable(hasil))
    
    for i in list_flat:
        temp_str = temp_str + i[0] + " "
    
    body = body + " " + temp_str
    
    body = re.sub(spasiLebihDariSatu, ' ', body)
    
    return body

In [4]:
def readText(file):
    words = []
    f = open(file, 'r') #open file
    text = f.read()    
    f.close()

    return text

In [5]:
def removeEnter(listOfWords): #['<BODY>\n']
    for i in range(len(listOfWords)):
        listOfWords[i] = listOfWords[i].replace('\n','')
        
    return listOfWords

In [6]:
def removeNull(listOfWords): #['','','eat','food','','']
    listOfWords = list(filter(None, listOfWords))
    
    return listOfWords

In [7]:
def caseFolding(listOfWords):
    for i in range(len(listOfWords)):
        listOfWords[i] = listOfWords[i].casefold()
        
    return listOfWords

<h3>Load Stop Word from NLTK</h3>

In [8]:
stop_words = stopwords.words('english')

In [9]:
def removeStopWord(listOfWords):
    for i in listOfWords:
        if i in stop_words:
            listOfWords.remove(i)
        
    return listOfWords

<h3>Load Lemmatizer from NLTK</h3>

In [10]:
lemmatizer = WordNetLemmatizer()

In [11]:
def lemmatization(listOfWords):
    for i in range(len(listOfWords)):
        listOfWords[i] = lemmatizer.lemmatize(listOfWords[i])
        
    return listOfWords

<h3>Load Stemming from NLTK</h3>

In [12]:
def stemming(listOfWords):
    stemmer = SnowballStemmer("english")
    stemmed = [stemmer.stem(word) for word in listOfWords]
    
    return stemmed

# Token

In [13]:
def make_token(txt):
    list_hasil = txt.split(" ")            
    return list_hasil

<h1>Main Pre-processing</h1>

In [82]:
def preprocessing(file):
    txt = readText(file)
    txt = normalisasi(txt)
    listOfWords = make_token(txt)
    listOfWords = removeNull(listOfWords)
    listOfWords = caseFolding(listOfWords)
    listOfWords = removeStopWord(listOfWords)
    listOfWords = stemming(listOfWords)
    listOfWords = lemmatization(listOfWords)
    listOfWords = removeStopWord(listOfWords)
    
    return listOfWords

In [15]:
# listOfFiles = readFile('DataRouter_2')
# listOfWordsOfFile = []

# for i in listOfFiles: #iterasi tiap file
#     txt = readText(i)
#     txt = normalisasi(txt)
#     listOfWords = make_token(txt)
#     listOfWords = removeNull(listOfWords)
#     listOfWords = caseFolding(listOfWords)
#     listOfWords = removeStopWord(listOfWords)
#     listOfWords = stemming(listOfWords)
#     listOfWords = lemmatization(listOfWords)
    
# return listOfWords

<h1>Compute TF</h1>

In [24]:
#input berupa list dari text yang sudah di preprocessing
def countWord(listOfWords):
    dictWordCount = {}
    for word in listOfWords:
        if word in dictWordCount:
            dictWordCount[word] += 1
        else:
            dictWordCount[word] = 1
            
    return dictWordCount

In [55]:
import math
#tf = 1 + log(i,j)
def computeTFLog(dictWordCount):
    for i in dictWordCount:
        tf = 1 + math.log(dictWordCount[i],2)
        dictWordCount[i] = [tf]
    
    return dictWordCount

In [18]:
def tf(listOfWords):
    dictWordCount = countWord(listOfWords)
    dictWordCount = computeTFLog(dictWordCount)
    
    return dictWordCount

<h1>Compute IDF</h1>

In [None]:
def countIDF(_wholeDict):
    for i in _wholeDict:
        

<h1>Main</h1>

In [71]:
def main(fileDir):
    fileCounter = 1
    
    wholeDictWordCount = {}
    listOfFiles = readFile(fileDir)
    for file in listOfFiles:
        listOfWord = preprocessing(file)
        tempDictWordCount = tf(listOfWord)
        wholeDictWordCount = addToWholeDictWordCount(tempDictWordCount, wholeDictWordCount, fileCounter)
        fileCounter += 1
        
    

In [77]:
def addToWholeDictWordCount(_tempDict, _wholeDict, _fileCounter):
#     lengthCol = len(_wholeDict[next(iter(_wholeDict))])
    
    if not bool(_wholeDict):#first time jalan
        return _tempDict
    else:
        for word in _wholeDict:#masukin word dari temp ke whole
            if word in _tempDict:
                _wholeDict[word].extend(_tempDict[word])
            else:
                _wholeDict[word].extend([0])
                
        for word in _tempDict:
            if word not in _wholeDict:
                _wholeDict[word] = [0]
                for i in range(_fileCounter-2):#kasih panjang sesuai array whole
                    _wholeDict[word].extend([0])
                _wholeDict[word].extend(_tempDict[word])

        return _wholeDict

In [83]:
main('DataRouter_2')

standard [3.321928094887362, 0]
oil [3.0, 0]
srd [1.0, 0]
form [2.0, 0]
financi [2.0, 0]
unit [1.0, 0]
co [2.0, 0]
bp [3.0, 0]
north [2.0, 0]
america [2.0, 0]
inc [1.0, 0]
said [1.0, 3.321928094887362]
plan [1.0, 0]
ventur [2.0, 0]
manag [2.0, 0]
money [1.0, 0]
market [1.0, 0]
borrow [1.0, 0]
invest [1.0, 0]
activ [1.0, 0]
compani [1.0, 0]
subsidiari [1.0, 0]
british [1.0, 0]
petroleum [1.0, 0]
plc [1.0, 0]
also [1.0, 1.0]
pct [1.0, 0]
interest [1.0, 0]
call [1.0, 0]
trade [1.0, 1.0]
be [1.0, 2.0]
oper [1.0, 0]
oversight [1.0, 0]
a [1.0, 0]
joint [1.0, 0]
committe [1.0, 0]
26-feb-1987 [2.0, 2.0]
15:02:20.00 [1.0, 0]
bahia [0, 3.321928094887362]
cocoa [0, 3.807354922057604]
review [0, 2.0]
shower [0, 1.0]
continu [0, 1.0]
throughout [0, 1.0]
week [0, 3.0]
zone [0, 1.0]
allevi [0, 1.0]
drought [0, 1.0]
sinc [0, 1.0]
earli [0, 1.0]
januari [0, 1.0]
improv [0, 1.0]
prospect [0, 1.0]
come [0, 2.0]
temporao [0, 2.0]
although [0, 1.0]
normal [0, 1.0]
humid [0, 1.0]
level [0, 1.0]
restor [0, 1