In [1]:
from pprint import pprint
from nltk.tag import hmm
from sklearn.externals import joblib
from nltk.tag.hmm  import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
from nltk.probability import LidstoneProbDist
from nltk.tokenize import sent_tokenize

import nltk
import dill
import pickle
import os
import pandas as pd
import numpy as np
import sys

import re
import gensim
import nltk
import math
import pickle
from dataPreprocessing import getData

sys.path.append("lib/uuparser/barchybrid/src/")
import utils
from arc_hybrid import ArcHybridLSTM

In [2]:
# Corpus files whose contents are merged by dataGetter() below.
source_list = [
    'dataset/training_text.csv',
    'dataset/Indonesian_Tweets.tsv',
]

def dataGetter(source_list):
    """Load every source with ``getData`` and concatenate the results.

    Args:
        source_list: iterable of source identifiers; each one is passed
            unchanged to ``getData``.

    Returns:
        list: the items returned by ``getData`` for each source, in the
        order the sources were given. Empty when ``source_list`` is empty.
    """
    corpus = []
    for source in source_list:
        # extend() grows the list in place. The previous
        # `corpus = corpus + getData(source)` re-copied everything gathered
        # so far on every iteration.
        corpus.extend(getData(source))
    return corpus

In [3]:
class similarity:
    """Word2Vec-backed helper that matches words against aspect and polarity anchors.

    Attributes set by ``__init__``:
        model: the Word2Vec model returned by ``getWord2Vec``.
        aspects: anchor words compared against in ``most_similar_aspect``.
        polarities: anchor words compared against in ``most_similar_polarities``.
    """

    def __init__(self, source_list):
        """Train the embedding model on the corpus gathered from ``source_list``."""
        self.model = self.getWord2Vec(dataGetter(source_list))
        self.aspects = ['makanan', 'pelayanan', 'harga', 'suasana']
        self.polarities = ['baik', 'buruk']

    def getWord2Vec(self, toFeed, dim=50):
        """Return a gensim Word2Vec model with ``dim``-sized vectors trained on ``toFeed``."""
        trained = gensim.models.Word2Vec(toFeed, min_count=1, size=dim)
        return trained

    def _highest_scoring(self, candidates, score_of):
        """Return ``(score, candidate)`` for the best candidate, or ``(0, "")``.

        A candidate only replaces the running best when its score is strictly
        greater, and the running best starts at 0. Ties therefore keep the
        earlier candidate, and candidates scoring 0 or less are never chosen.
        """
        top_score, top_item = 0, ""
        for candidate in candidates:
            candidate_score = score_of(candidate)
            if candidate_score > top_score:
                top_score, top_item = candidate_score, candidate
        return (top_score, top_item)

    def most_similar_aspect(self, word):
        """Return ``(score, aspect)`` for the aspect anchor most similar to ``word``."""
        return self._highest_scoring(
            self.aspects,
            lambda aspect: self.model.wv.similarity(word, aspect),
        )

    def most_similar_polarities(self, word):
        """Return ``(score, polarity)`` for the polarity anchor most similar to ``word``."""
        return self._highest_scoring(
            self.polarities,
            lambda polarity: self.model.wv.similarity(word, polarity),
        )

    def most_similar_word_with_aspect(self, sentence, aspect):
        """Return ``(score, word)`` for the whitespace-split word of ``sentence`` closest to ``aspect``."""
        return self._highest_scoring(
            sentence.split(),
            lambda token: self.model.wv.similarity(token, aspect),
        )
    
def save_model(filename, obj):
    """Serialize ``obj`` with pickle (highest protocol) into the file ``filename``.

    The file is opened for binary writing, so an existing file is overwritten.
    """
    with open(filename, 'wb') as out_file:
        pickle.dump(obj, out_file, pickle.HIGHEST_PROTOCOL)
            
def load_model(filename):
    """Return the object unpickled from the file ``filename``.

    Errors from ``open`` or ``pickle.load`` propagate to the caller.
    """
    # Unpickling can execute arbitrary code; only use this on trusted files.
    with open(filename, 'rb') as in_file:
        return pickle.load(in_file)
    

In [4]:
# Reuse the cached similarity model when it can be loaded; otherwise train a
# new one from `source_list` and cache it for the next run.
sim = None
try:
    sim = load_model('model.pickle')
except Exception:
    # Any ordinary failure (missing file, unreadable/incompatible pickle, ...)
    # triggers a rebuild. `Exception` is used instead of a bare `except:` so
    # that KeyboardInterrupt / SystemExit still stop the cell instead of
    # silently starting a retrain.
    sim = similarity(source_list)
    save_model('model.pickle', sim)

In [5]:
# Locations of the serialized tagger / parser files loaded by the next cells.
# MODELS_DIR keeps its trailing slash because file paths are built from it by
# plain string concatenation.
MODELS_DIR = 'models/id_gsd/'
TAGGER_FILE_NAME = 'tagger.dill'
PARSER_FILE = "barchybrid.model"
PARAMS_FILE = MODELS_DIR + "params.pickle"

In [6]:
# Restore the serialized tagger object; examine_polarity() later calls
# `hmm_tagger.tag(...)` on it.
# NOTE(review): dill, like pickle, can run code while loading -- only load
# tagger files from a trusted source.
with open(MODELS_DIR + TAGGER_FILE_NAME, 'rb') as f:
    hmm_tagger = dill.load(f)

In [7]:
# Rebuild the dependency parser: restore the saved constructor arguments from
# the params pickle, then load the trained model file.
# Pickle streams are binary, so the file is opened with 'rb'. Text mode ('r')
# fails under Python 3 and can corrupt the stream on platforms that translate
# newlines.
with open(PARAMS_FILE, 'rb') as paramsfp:
    words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(paramsfp)

# The file handle is only needed for the unpickling above, so the parser is
# constructed after it has been closed.
parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i,
                       ch, stored_opt)
model = os.path.join(MODELS_DIR, PARSER_FILE)
parser.Load(model)

Loading model from models/id_gsd/barchybrid.model


# PROCESS INDIVIDUAL REVIEW

Example review (contoh review):

sapi bakarnya enak banget harganya juga lumayan murah

In [190]:
# Hand-picked seed adjectives, grouped by aspect (food, price, service,
# ambience) and by polarity.
FOOD_POSITIVE_ADJ = ['enak', 'banyak', 'lembut']
FOOD_NEGATIVE_ADJ = ['pahit']

PRICE_POSITIVE_ADJ = ['murah', 'banyak']
PRICE_NEGATIVE_ADJ = ['mahal', 'sedikit']

SERVICE_POSITIVE_ADJ = ['cepat', 'baik', 'bagus', 'ramah']
SERVICE_NEGATIVE_ADJ = ['kasar', 'lambat']

AMBIENCE_POSITIVE_ADJ = ['nyaman', 'adem']
AMBIENCE_NEGATIVE_ADJ = ['berantakan', 'panas']

# Every seed adjective in one flat list, in the order the groups appear above.
# Words listed in more than one group (e.g. 'banyak') appear more than once.
MERGED = (
    FOOD_POSITIVE_ADJ + FOOD_NEGATIVE_ADJ
    + PRICE_POSITIVE_ADJ + PRICE_NEGATIVE_ADJ
    + SERVICE_POSITIVE_ADJ + SERVICE_NEGATIVE_ADJ
    + AMBIENCE_POSITIVE_ADJ + AMBIENCE_NEGATIVE_ADJ
)

In [184]:
# Sample review text. The doubled quotes around the reported speech
# (""maaf ... mesen"") are read by Python as adjacent string literals, so the
# pieces are concatenated and no quote characters end up in the string.
# NOTE(review): `review` is rebound by a later cell before
# extract_review_polarities() is called.
review = "Tadi gue first time lewat Beji pas lewat dpn cafe ini, gue lgsng trtarik buat dtg Karna baru bgt buka. Gue mesen beberapa makanan yaitu roti choco crunch , sossis and potato pke sauce bbq apa tau namanya lupa, sm pizza mie pake topping smoked beef. Gilak gue rasa utk servis msh hrs byk belajar deh, gak kyk td pelayannya gak peka. Masih kaku gt. Terus masa gue dah hampir 30menit lama gak ada satupun mknan yg dtg. Trs cwok gue manggil waittersnya nanya mknannya dah jd apa belum, dan mas2 nya blg ""maaf ya mas td kita bingung cari yg mesen"" buset deh trnyata bener meja sblah gue kyknya dah mulai bosen. Pelayannya pun terlihat pd masih kebingungan. So klo mnrt gue dri segi makanan sih lumayan, service dan prosedur pelayanannya sih yg mesti jd concern, kliatan bgt blm ready. Pdhal tempatnya dah keren, luas bgt. Interiornya keren. Smga bisa terus maju."

In [185]:
# Part-of-speech tag groups. The base groups come first; the combined
# `point1` / `point2` lists are derived from them.
verb = ["VBD", "VB", "VBG", "VBN", "VBP", "VBZ"]
noun = ["NN", "NNS", "NNP", "NNPS", "Z"]
adjective = ["JJ", "JJR", "JJS"]
adverb = ["RB", "RBR", "RBS"]

# point2: adjective tags followed by adverb tags.
point2 = adjective + adverb
# point1: verb tags followed by everything in point2.
point1 = verb + point2

In [244]:
def _score_ancestor_aspects(tree, parent_id, max_hops=3):
    """Sum aspect-similarity scores over up to ``max_hops`` ancestors of a token."""
    aspects = {
        'makanan': 0,
        'harga': 0,
        'pelayanan': 0,
        'suasana': 0,
    }
    hops_left = max_hops
    # An id of None or 0 ends the walk (the original loop used the same test).
    while parent_id is not None and parent_id > 0 and hops_left > 0:
        try:
            parent = tree[parent_id]
            value, aspect = sim.most_similar_aspect(parent.form)
        except (KeyError, IndexError):
            # Ancestor cannot be looked up or scored (e.g. the word is not in
            # the embedding vocabulary): stop climbing, keep what we have.
            break
        if aspect not in aspects:
            # most_similar_aspect() returns "" when nothing scored above 0.
            break
        aspects[aspect] += value
        parent_id = parent.pred_parent_id
        hops_left -= 1
    return aspects


def examine_polarity(sentence):
    """
        Score the food, price, service and ambience aspects of one sentence.

        The sentence is POS-tagged and dependency-parsed. Every token that is
        tagged 'JJ' or listed in MERGED gets a weight of -1 (closest polarity
        anchor is 'buruk' with similarity above 0.5) or +1 (otherwise). That
        weight is added to the aspect that scores highest over the token's
        nearest ancestors in the predicted dependency tree (at most three).

        Args:
            sentence: the sentence text; it is split on whitespace.

        Returns:
            tuple: (makanan, harga, pelayanan, suasana) integer scores.
    """
    polarities = {
        'makanan': 0,
        'harga': 0,
        'pelayanan': 0,
        'suasana': 0,
    }

    words = sentence.split()
    if not words:
        # Nothing to tag or parse.
        return polarities['makanan'], polarities['harga'], polarities['pelayanan'], polarities['suasana']

    # Tag the sentence that was passed in. The previous version tagged the
    # notebook-level `review`, so every call re-scored the whole review.
    tagged_sentence = ' '.join('{0}/{1}'.format(word, tag)
                               for word, tag in hmm_tagger.tag(words))
    data = utils.read_conll_text(tagged_sentence)
    parsed = list(parser.Predict(data))
    if not parsed:
        return polarities['makanan'], polarities['harga'], polarities['pelayanan'], polarities['suasana']
    depedency_tree = parsed[0]

    for p in depedency_tree:
        if not (p.pos == 'JJ' or p.form in MERGED):
            continue
        try:
            pol = sim.most_similar_polarities(p.form)
        except KeyError:
            # Word is not in the embedding vocabulary, so no polarity can be
            # derived. Skip it rather than reuse values from an earlier token.
            continue
        weight = -1 if (pol[1] == 'buruk' and pol[0] > 0.5) else 1

        aspects = _score_ancestor_aspects(depedency_tree, p.pred_parent_id)
        # Highest accumulated score wins; ties go to the first key in dict
        # order, as the stable descending sort used previously did.
        # NOTE(review): when no ancestor contributed, every score is 0 and
        # the weight still goes to the first aspect -- confirm this is wanted.
        best_aspect = max(aspects.items(), key=lambda item: item[1])[0]
        polarities[best_aspect] += weight

    return polarities['makanan'], polarities['harga'], polarities['pelayanan'], polarities['suasana']

In [245]:
def extract_review_polarities(review):
    """Label each aspect of a review as 'POSITIVE' or 'NEGATIVE'.

    The review is split into sentences with ``sent_tokenize``. Each sentence
    is scored with ``examine_polarity`` and the per-aspect scores are summed
    over all sentences.

    Args:
        review: the full review text.

    Returns:
        dict: maps aspect name ('makanan', 'harga', 'pelayanan', 'suasana')
        to 'POSITIVE' (total > 0) or 'NEGATIVE' (total < 0). Aspects whose
        total is exactly 0 are left out.
    """
    totals = {
        'makanan': 0,
        'harga': 0,
        'pelayanan': 0,
        'suasana': 0,
    }
    for sentence in sent_tokenize(review):
        pol_food, pol_price, pol_service, pol_ambience = examine_polarity(sentence)
        totals['makanan'] += pol_food
        totals['harga'] += pol_price
        totals['pelayanan'] += pol_service
        totals['suasana'] += pol_ambience

    # Build the result in a fresh dict. Deleting keys from the dict being
    # iterated raises RuntimeError on Python 3.
    labelled = {}
    for key, value in totals.items():
        if value > 0:
            labelled[key] = 'POSITIVE'
        elif value < 0:
            labelled[key] = 'NEGATIVE'
    return labelled


In [247]:
# Demo: rebind `review` to a short multi-sentence example and show the
# per-aspect labels produced for it (the cell output is the returned dict).
review = "harga lumayan murah banget, Sengaja macet2an kesini cuman buat nyobain nasi goreng cakalang yang orang2 bilang enak. Dan emang beneran enak sih nasi gorengnya wkkw suasana nya juga enak buat makan ramai2 gitu."
extract_review_polarities(review)

Time: 0.2s
Time: 0.16s


{'harga': 'POSITIVE', 'makanan': 'POSITIVE', 'suasana': 'POSITIVE'}

In [169]:
# Reload the cached similarity model from disk, rebinding the `sim` created by
# the earlier cache cell.
# NOTE(review): this cell's execution count is out of sequence with the cells
# above; it looks like a leftover reload -- confirm whether it is still needed.
sim = load_model('model.pickle')