# <center> KLASIFIKASI SENTIMEN ULASAN PRODUK FEMALE DAILY </center>

## Import Library

In [1]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import csv
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

## Baca Data 

Data berupa CSV

In [2]:
def open_file(file_name):
    with open(file_name, encoding="utf-8") as csvfile: 
        next(csvfile)
        rawArticles = csv.reader(csvfile, delimiter=',') 
        all_reviews = [] 
        all_sentiments = []
        for row in rawArticles:
            all_reviews.append((row[3].lower()).split())
            all_sentiments.append(row[5].lower())
    return all_reviews, all_sentiments

## Preprocessing

### 1. Stop Word Removal

In [3]:
def get_stopword(stopwordsfile):
    stopwords=[]
    file_stopwords = open(stopwordsfile,'r')
    row = file_stopwords.readline()
    while row:
        word = row.strip()
        stopwords.append(word)
        row = file_stopwords.readline()
    file_stopwords.close()
    return stopwords


In [7]:
def stopword_removal(review,stop_words_indo,stop_words_eng):
    feature_vector = []
    list_no = ['ga','engga','enggak','gak','nggak','ngga','tdk']
    for word in review:
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", word) #menghilangkan karakter selain huruf didalam kata
        if (word in stop_words_indo or val is None or word in stop_words_eng):
            continue
        else:
            if word in list_no:
                word = 'tidak'
            feature_vector.append(word)
    for_stemming = ' '.join(feature_vector)
    return feature_vector, for_stemming

### 2. Emoji, Punctuation, and Symbol Handling

In [4]:
def emoji_handling(review):
    emoji = []
    for word in review:
        #Smile -- :), : ), :-), (:, ( :, (-:, :')
        word = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))','POS',word)
        
        #Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        word = re.sub(r'(:\s?D|:-D|x-?D|X-?D)','POS',word)

        # Sad -- :-(, : (, :(, ):, )-:
        word = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' NEG ', word)

        # Cry -- :,(, :'(, :"(, T_T
        word = re.sub(r'(:,\(|:\'\(|:"\(|T_T)', ' NEG ', word)

        emoji.append(word)
    return emoji

In [5]:
def punct_handling(review):
    #menghilangkan tanda baca
    preprocess_review = []
    for word in review:
        word = word.strip('\'"?!,.():;')

        #mengkonversi huruf vocal lebih dari satu dan berurutan
        word_character = re.compile(r"(.)\1+", re.DOTALL)
        word = word_character.sub(r"\1\1", word)

        #menghilangkan tanda - & '
        word = re.sub(r'(-|\')','',word)

        preprocess_review.append(word.lower())
    return preprocess_review

In [68]:
def preprocess_review(review):
    return punct_handling(emoji_handling(review))

### 3. Stemming

In [6]:
def stem_sentences(review):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(review)


### 4. Negative Handling

In [8]:
def negative_handling(review):
    negative_review = []
    for i in range(len(review)):
        word = review[i]
        if review[i-1] != 'tidak':
            negative_review.append(word)
        else:
            word = 'tidak_'+word
            negative_review.append(word)
    return negative_review


## Fitur Klasifikasi

In [58]:
def create_freqwords(reviewHandled):
    freqOfWord = {}
    for sentence in reviewHandled:
        for word in sentence:
            if word in freqOfWord:
                freqOfWord[word] += 1
            else:
                freqOfWord[word] = 1
    return freqOfWord

In [59]:
def get_featureextract(review):
    words = set(review)
    features = {}
    for word in feature_list.keys():
        features['contains(%s)' % word] = (word in words) 
    return features

## Klasifikasi Naive Bayes

### Import data

In [62]:
all_reviews, all_sentiments = open_file('dataset.csv')

In [64]:
stop_words_indo = get_stopword('stopwordsindo.txt')
stop_words_eng = stopwords.words('english')

### Preprocessing

In [71]:
%%time
preprocess_reviews = []
for review in all_reviews:
    feature, review_for_stem = stopword_removal(preprocess_review(review),stop_words_indo,stop_words_eng)
    preprocess_reviews.append(stem_sentences(review_for_stem))

Wall time: 12min 36s


In [100]:
%%time
tokens = []
for review in preprocess_reviews:
    tokens.append(nltk.word_tokenize(review))

Wall time: 104 ms


In [101]:
%%time
handled_reviews = []
reviews = []
for i in range(len(tokens)):
    neg_handled_rev = negative_handling(tokens[i])
    handled_reviews.append(neg_handled_rev)
    reviews.append((neg_handled_rev,all_sentiment[i]))

Wall time: 3.99 ms


In [102]:
n_reviews=len(all_reviews)
randomize = random.sample(range(n_reviews), n_reviews)
idx_train = randomize[:(int(n_reviews*0.8))]
idx_test = randomize[(int(n_reviews*0.8)):]
reviews_train = [reviews[idx] for idx in idx_train]
sentiment_train = [all_sentiment[idx] for idx in idx_train]
handled_review_train=[handled_reviews[idx] for idx in idx_train]
handled_reviews_test=[handled_reviews[idx] for idx in idx_test]
reviews_test = [all_reviews[idx] for idx in idx_test]
sentiment_test = [all_sentiment[idx] for idx in idx_test]

## Dengan Ekstraksi Fitur Contains Words

In [107]:
%%time
feature_list = []
feature_list = create_freqwords(handled_review_train)
training_set = nltk.classify.util.apply_features(get_featureextract,reviews_train)
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

Wall time: 1.36 s


In [108]:
%%time
prediction = []
validation_test = []
for handled_reviews in handled_reviews_test:
    classify_result = NBClassifier.classify(get_featureextract(handled_reviews))
    prediction.append((handled_reviews,classify_result))
    validation_test.append(classify_result)

Wall time: 726 ms


In [109]:
num_true = 0
for k,val in enumerate(validation_test):
    if val==sentiment_test[k]: 
        num_true+=1
accuracy = (num_true/len(reviews_test))*100
accuracy

91.0

## Dengan TF-IDF

In [129]:
labels = np.zeros(500)
for i in idx_test:
    labels[i]=1
kf = StratifiedKFold(n_splits=10)
totalNB = 0
totalMatNB = np.zeros((2,2))


In [130]:
for train_idx, test_idx in kf.split(all_reviews,all_sentiments):
    X_train = [all_reviews[i] for i in train_idx]
    X_test = [all_reviews[i] for i in test_idx]
    y_train,y_test = labels[train_idx], labels[test_idx]
    vectorizer = TfidfVectorizer(min_df=0.0, max_df=1.0, sublinear_tf=True, use_idf=True, stop_words='english')
    X_train_tf_idf = vectorizer.fit_transform(X_train)
    X_test_tf_idf = vectorizer.transform(X_test)

    model = MultinomialNB()
    model.fit(X_train_tf_idf, y_train)
    result = model.predict(X_test_tf_idf)

    totalMatNB = totalMatNB + confusion_matrix(y_test, result)
    totalNB = totalNB + sum(y_test==result)
    print(train_idx,test_idx)

AttributeError: 'list' object has no attribute 'lower'