In [2]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import csv
import random
import re
import sys
import nltk
from nltk.corpus import stopwords

# FUNGSI PREPROCESSING

In [17]:
def openFile(file_name):
    with open(file_name, encoding="utf-8") as csvfile: 
        next(csvfile)
        rawArticles = csv.reader(csvfile, delimiter=',') 
        all_reviews = [] 
        all_sentiment = []
        for row in rawArticles:
            all_reviews.append((row[3].lower()).split())
            all_sentiment.append(row[5].lower())
    n_reviews=len(all_reviews)
    randomize = random.sample(range(n_reviews), n_reviews)
    idx_train = randomize[:(int(n_reviews*0.8))]
    idx_test = randomize[(int(n_reviews*0.8)):]
    reviews_train = [all_reviews[idx] for idx in idx_train]
    sentiment_train = [all_sentiment[idx] for idx in idx_train]
    reviews_test = [all_reviews[idx] for idx in idx_test]
    sentiment_test = [all_sentiment[idx] for idx in idx_test]
    return reviews_train, sentiment_train, reviews_test, sentiment_test

In [5]:
def getStopWordsList(stopwordsfile):
    stopwords=[]
    file_stopwords = open(stopwordsfile,'r')
    row = file_stopwords.readline()
    while row:
        word = row.strip()
        stopwords.append(word)
        row = file_stopwords.readline()
    file_stopwords.close()
    return stopwords

In [6]:
def getEmojiHandling(review):
    emoji = []
    for word in review:
        #Smile -- :), : ), :-), (:, ( :, (-:, :')
        word = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))','POS',word)
        
        #Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        word = re.sub(r'(:\s?D|:-D|x-?D|X-?D)','POS',word)

        # Sad -- :-(, : (, :(, ):, )-:
        word = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' NEG ', word)

        # Cry -- :,(, :'(, :"(, T_T
        word = re.sub(r'(:,\(|:\'\(|:"\(|T_T)', ' NEG ', word)

        emoji.append(word)
    return emoji

In [7]:
def getPunctHandling(review):
    #menghilangkan tanda baca
    preprocess_review = []
    for word in review:
        word = word.strip('\'"?!,.():;')

        #mengkonversi huruf vocal lebih dari satu dan berurutan
        word_character = re.compile(r"(.)\1+", re.DOTALL)
        word = word_character.sub(r"\1\1", word)

        #menghilangkan tanda - & '
        word = re.sub(r'(-|\')','',word)

        preprocess_review.append(word.lower())
    return preprocess_review

In [8]:
def getStemmingSentence(review):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(review)


In [9]:
def getFeatureVector(review,stop_words_indo,stop_words_eng):
    feature_vector = []
    list_no = ['ga','engga','enggak','gak','nggak','ngga','tdk']
    for word in review:
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", word) #menghilangkan karakter selain huruf didalam kata
        if (word in stop_words_indo or val is None or word in stop_words_eng):
            continue
        else:
            if word in list_no:
                word = 'tidak'
            feature_vector.append(word)
    for_stemming = ' '.join(feature_vector)
    return feature_vector, for_stemming


In [10]:
def getNegativeHandling(review):
    negative_review = []
    for i in range(len(review)):
        word = review[i]
        if review[i-1] != 'tidak':
            negative_review.append(word)
        else:
            word = 'tidak_'+word
            negative_review.append(word)
    return negative_review

In [11]:
def preprocessReview(review):
    return getPunctHandling(getEmojiHandling(review))

In [12]:
def createFreqDict(reviewHandled):
    freqOfWord = {}
    for sentence in reviewHandled:
        for word in sentence:
            if word in freqOfWord:
                freqOfWord[word] += 1
            else:
                freqOfWord[word] = 1
    file_key = open('keys.txt','w')
    for key in freqOfWord.keys():
        file_key.write(str(key))
        file_key.write("\n")
    file_key.close()
    return freqOfWord

In [13]:
def getFeatureExtraction(review):
    words = set(review)
    features = {}
    for word in feature_list.keys():
        features['contains(%s)' % word] = (word in words) 
    return features

# BACA DATA

In [22]:
reviews_train, sentiments_train, reviews_test, sentiments_test = openFile('dataset.csv')
stop_words_indo = getStopWordsList('stopwordsindo.txt')
stop_words_eng = stopwords.words('english')

# TRAINING DATA

In [23]:
%%time
preprocess_reviews = []
tokens = []
reviews = []
handled_reviews = []
feature_list = []
for review in reviews_train:
    feature, review_for_stem = getFeatureVector(preprocessReview(review),stop_words_indo,stop_words_eng)
    preprocess_reviews.append(getStemmingSentence(review_for_stem))

Wall time: 9min 37s


In [24]:
%%time
for review in preprocess_reviews:
    tokens.append(nltk.word_tokenize(review))

Wall time: 83.8 ms


In [25]:
%%time
for i in range(len(tokens)):
    neg_handled_rev = getNegativeHandling(tokens[i])
    handled_reviews.append(neg_handled_rev)
    reviews.append((neg_handled_rev,sentiments_train[i]))

Wall time: 4.02 ms


In [26]:
%%time
feature_list = createFreqDict(handled_reviews)
training_set = nltk.classify.util.apply_features(getFeatureExtraction,reviews)
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

Wall time: 1.38 s


# VALIDASI DATA TESTING

In [27]:
%%time
prediction = []
validation_test = []
for review in reviews_test:
    feature_classification, review_test_for_stem = getFeatureVector(review,stop_words_indo,stop_words_eng)
    handled_reviews_test = getNegativeHandling(feature_classification)
    classify_result = NBClassifier.classify(getFeatureExtraction(handled_reviews_test))
    prediction.append((review,classify_result))
    validation_test.append(classify_result)

Wall time: 806 ms


In [28]:
for sentiment in prediction:
    print(sentiment)

(['ini', 'face', 'wash', 'yang', 'awalnya', 'aku', 'ga', 'suka,', 'tapi', 'sekarang', 'jadi', 'my', 'go-to', 'face', 'wash', 'sampe', 'repurchase', 'berulang', 'kali.', 'tiap', 'abis', 'cumuk', 'muka', 'rasanya', 'halus', 'tanpa', 'kerasa', 'kering', 'ketarik-tarik.', 'kalo', 'buat', 'ngurangin', 'komedo', 'kayaknya', 'ga', 'ngefek', 'sih', 'ya.', 'kulitku', 'emang', 'butuh', 'exfoliator', 'yang', 'lebih', 'strong', 'kalo', 'buat', 'komedo', 'mah.', 'dia', 'ga', 'ada', 'wanginya', 'juga.', 'ga', 'ngerti', 'sih', 'ini', 'hg', 'apa', 'engga', 'buat', 'aku,', 'tapi', 'kalo', 'lagi', 'ga', 'cocok', 'sama', 'face', 'wash', 'lain,', 'ujung-ujungnya', 'aku', 'balik', 'lagi', 'ke', 'hada', 'labo', 'tamagohada', 'ini.'], 'positive')
(['dari', 'semua', 'variant', 'hadalabo', 'ini', 'yang', 'paling', 'better.', 'gatau', 'kenapa', 'semua', 'orang', 'cocok', 'tapi', 'di', 'gue', 'akhirnya', 'malah', 'bikin', 'moist', 'parah', 'dan', 'rasanya', 'muka', 'kurang', 'bersih', 'aja', 'gitu,', 'untungnya'

# HITUNG AKURASI

In [29]:
num_true = 0
for k,val in enumerate(validation_test):
    if val==sentiments_test[k]: 
        num_true+=1
accuracy = (num_true/len(reviews_test))*100
accuracy

82.0