In [2]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import csv
import re
import sys
import nltk
from nltk.corpus import stopwords

# FUNGSI PREPROCESSING

In [3]:
def openFile(file_name):
    with open(file_name, encoding="utf-8") as csvfile: 
        next(csvfile)
        rawArticles = csv.reader(csvfile, delimiter=',') 
        words = [] #array perkata akan menyimpan semua kalimat menjadi perkata
        sentences = [] #array perkalimat menyimpan semua kalimat menjadi perkalimat
        sentiment = []
        for row in rawArticles:
            sentences=row[3].lower()
            words.append(sentences.split())
            sentiment.append(row[5].lower())
    return words, sentiment 


In [4]:
def getStopWordsList(stopwordsfile):
    stopwords=[]
    file_stopwords = open(stopwordsfile,'r')
    row = file_stopwords.readline()
    while row:
        word = row.strip()
        stopwords.append(word)
        row = file_stopwords.readline()
    file_stopwords.close()
    return stopwords

In [5]:
def getEmojiHandling(review):
    emoji = []
    for word in review:
        #Smile -- :), : ), :-), (:, ( :, (-:, :')
        word = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))','POS',word)
        
        #Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        word = re.sub(r'(:\s?D|:-D|x-?D|X-?D)','POS',word)

        # Sad -- :-(, : (, :(, ):, )-:
        word = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' NEG ', word)

        # Cry -- :,(, :'(, :"(, T_T
        word = re.sub(r'(:,\(|:\'\(|:"\(|T_T)', ' NEG ', word)

        emoji.append(word)
    return emoji

In [6]:
def getPunctHandling(review):
    #menghilangkan tanda baca
    preprocess_review = []
    for word in review:
        word = word.strip('\'"?!,.():;')

        #mengkonversi huruf vocal lebih dari satu dan berurutan
        word_character = re.compile(r"(.)\1+", re.DOTALL)
        word = word_character.sub(r"\1\1", word)

        #menghilangkan tanda - & '
        word = re.sub(r'(-|\')','',word)

        preprocess_review.append(word.lower())
    return preprocess_review

In [7]:
def getStemmingSentence(review):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(review)


In [8]:
def getFeatureVector(review,stop_words_indo,stop_words_eng):
    feature_vector = []
    list_no = ['ga','engga','enggak','gak','nggak','ngga','tdk']
    for word in review:
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", word) #menghilangkan karakter selain huruf didalam kata
        if (word in stop_words_indo or val is None or word in stop_words_eng):
            continue
        else:
            if word in list_no:
                word = 'tidak'
            feature_vector.append(word)
    for_stemming = ' '.join(feature_vector)
    return feature_vector, for_stemming


In [9]:
def getNegativeHandling(review):
    negative_review = []
    for i in range(len(review)):
        word = review[i]
        if review[i-1] != 'tidak':
            negative_review.append(word)
        else:
            word = 'tidak_'+word
            negative_review.append(word)
    return negative_review

In [10]:
def preprocessReview(review):
    return getPunctHandling(getEmojiHandling(review))

In [11]:
def createFreqDict(reviewHandled):
    freqOfWord = {}
    for sentence in reviewHandled:
        for word in sentence:
            if word in freqOfWord:
                freqOfWord[word] += 1
            else:
                freqOfWord[word] = 1
    file_key = open('keys.txt','w')
    for key in freqOfWord.keys():
        file_key.write(str(key))
        file_key.write("\n")
    file_key.close()
    return freqOfWord

In [80]:
def getFeatureExtraction(review):
    words = set(review)
    features = {}
    for word in feature_list.keys():
        features['contains(%s)' % word] = (word in words) 
    return features

# BACA DATA

In [78]:
corpus_train, sentiments_train = openFile('50-Data-Train.csv')
reviews_test, sentiments_test = openFile('5-Data-Test.csv')
stop_words_indo = getStopWordsList('stopwordsindo.txt')
stop_words_eng = stopwords.words('english')

# TRAINING DATA

In [81]:
%%time
preprocess_reviews = []
tokens = []
reviews = []
handled_reviews = []
feature_list = []
for review in corpus_train:
    feature, review_for_stem = getFeatureVector(preprocessReview(review),stop_words_indo,stop_words_eng)
    preprocess_reviews.append(getStemmingSentence(review_for_stem))

for review in preprocess_reviews:
    tokens.append(nltk.word_tokenize(review))

for i in range(len(tokens)):
    neg_handled_rev = getNegativeHandling(tokens[i])
    handled_reviews.append(neg_handled_rev)
    reviews.append((neg_handled_rev,sentiments_train[i]))

feature_list = createFreqDict(handled_reviews)
training_set = nltk.classify.util.apply_features(getFeatureExtraction,reviews)
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

Wall time: 1min 15s


# VALIDASI DATA TESTING

In [84]:
prediction = []
validation_test = []
for review in reviews_test:
    feature_classification, review_test_for_stem = getFeatureVector(review,stop_words_indo,stop_words_eng)
    handled_reviews_test = getNegativeHandling(feature_classification)
    classify_result = NBClassifier.classify(getFeatureExtraction(handled_reviews_test))
    prediction.append((review,classify_result))
    validation_test.append(classify_result)

In [85]:
for sentiment in prediction:
    print(sentiment)

(['pembersih', 'yang', 'nagih', 'banget,', 'gak', 'ribet', 'pake', 'cleansing', 'milk', 'n', 'toner,', 'cetaphil', 'gentle', 'cleanser', 'ini', 'sengaja', 'aq', 'pake', 'buat', 'bersihin', 'muka', 'pake', 'kapas,', 'karena', 'kalo', 'pake', 'air', 'gak', 'ngluarin', 'busa', 'jd', 'berasa', 'ada', 'yang', 'kurang,', 'tp', 'aq', 'uda', 'repurchase', '3x', ',', 'suami', 'aq', 'juga', 'yang', 'awal', 'g', 'pernah', 'bersihin', 'muka,', 'sekarang', 'maunya', 'cuma', 'dibersihin', 'pake', 'cetaphil', 'gentle', 'cleanser', 'ini.'], 'positive')
(['bener-bener', 'produk', 'facial', 'wash', 'paling', 'oke', 'kalo', 'menurut', 'saya.', 'sesuai', 'dengan', 'tagline', 'cetaphil', ':', 'every', 'age,', 'every', 'stage,', 'every', 'day,', 'produk', 'ini', 'emang', 'cocok', 'untuk', 'setiap', 'umur,', 'jenis', 'kulit', 'dan', 'perluu', 'digunakan', 'setiap', 'hari', 'oleh', 'setiap', 'orang.', 'tekstur', 'pelembabnya', 'berminyak,', 'awalnya', 'kaget', 'karena', 'pembersih', 'wajah', 'biasa', 'kan', '

# HITUNG AKURASI

In [86]:
num_true = 0
for k,val in enumerate(validation_test):
    if val==sentiments_test[k]: 
        num_true+=1
accuracy = (num_true/len(reviews_test))*100
accuracy

100.0