In [29]:
import re
import gensim
import nltk
import math
import pickle
from dataPreprocessing import getData

In [30]:
# Corpus files that dataGetter reads to build the Word2Vec training corpus.
source_list = [
    'dataset/training_text.csv',
    'dataset/Indonesian_Tweets.tsv',
]

def dataGetter(source_list):
    """Load every source file and concatenate the results into one corpus.

    Args:
        source_list: iterable of file paths, each handed to ``getData``.

    Returns:
        A single list holding the items of every loaded source, in the
        order the sources were given. An empty ``source_list`` yields ``[]``.
    """
    corpus = []
    for source in source_list:
        # The original called getTweetData, which is never imported or defined
        # in this notebook (only getData is imported from dataPreprocessing),
        # so a fresh kernel failed with NameError.
        # NOTE(review): assumes getData(path) returns an iterable of tokenised
        # documents -- confirm against dataPreprocessing.
        corpus.extend(getData(source))
    return corpus

# Wrapper class around the Word2Vec model for similarity lookups

In [31]:
class similarity:
    """Wrapper around a Word2Vec model that maps words to seed labels.

    The model is trained on the corpus returned by ``dataGetter``. Words are
    then matched against two fixed lists of Indonesian seed words: the
    aspects (``self.aspects``) and the polarities (``self.polarities``).
    """

    def __init__(self, source_list):
        """Train the model on the corpus built from ``source_list``."""
        corpus = dataGetter(source_list)
        self.model = self.getWord2Vec(corpus)
        self.aspects = ['makanan', 'pelayanan', 'harga', 'suasana']
        self.polarities = ['baik', 'buruk']

    def getWord2Vec(self, toFeed, dim=50):
        """Train and return a Word2Vec model with ``dim``-dimensional vectors.

        Args:
            toFeed: the training corpus handed to ``gensim.models.Word2Vec``.
            dim: dimensionality of the word vectors.
        """
        # Local import: only needed here, to pick the right keyword name.
        import inspect

        # gensim 4 renamed the ``size`` keyword to ``vector_size``. Pick the
        # name the installed version accepts so the notebook runs on both.
        accepted = inspect.signature(gensim.models.Word2Vec.__init__).parameters
        dim_keyword = 'vector_size' if 'vector_size' in accepted else 'size'
        return gensim.models.Word2Vec(toFeed, min_count=1, **{dim_keyword: dim})

    def _best_match(self, word, candidates):
        """Return ``(score, candidate)`` for the candidate most similar to ``word``.

        The first candidate wins a tie. ``(0, "")`` is returned only when
        ``candidates`` is empty.
        """
        scored = [
            (self.model.wv.similarity(word, candidate), candidate)
            for candidate in candidates
        ]
        if not scored:
            return (0, "")
        # gensim's similarity is a cosine similarity, so it can be zero or
        # negative. The original compared against a (0, "") sentinel with
        # ``>`` and therefore returned the empty label whenever no score was
        # positive, which callers cannot map to anything. Always return the
        # genuinely best candidate instead.
        return max(scored, key=lambda pair: pair[0])

    def most_similar_aspect(self, word):
        """Return ``(score, aspect)`` for the aspect seed closest to ``word``."""
        return self._best_match(word, self.aspects)

    def most_similar_polarities(self, word):
        """Return ``(score, polarity)`` for the polarity seed closest to ``word``."""
        return self._best_match(word, self.polarities)
    
def save_model(filename, obj):
    """Pickle ``obj`` into the file at ``filename``.

    The file is opened in binary write mode, so existing content is
    replaced. The highest pickle protocol available is used.
    """
    with open(filename, 'wb') as out_file:
        pickler = pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)
            
def load_model(filename):
    """Return the object unpickled from the file at ``filename``.

    The exception raised by ``open`` for a missing file is not caught here
    and reaches the caller.

    NOTE: unpickling can execute arbitrary code stored in the file, so only
    load pickles that come from a trusted source.
    """
    with open(filename, 'rb') as in_file:
        return pickle.load(in_file)
    

In [32]:
# Load-or-train: reuse the pickled wrapper when 'model.pickle' exists,
# otherwise train a new one from source_list and cache it for the next run.
# NOTE(review): the cache is keyed only on the file's existence, so a change
# to source_list does not trigger retraining -- delete the pickle to rebuild.
sim = None
try:
    sim = load_model('model.pickle')
except FileNotFoundError:
    # No cached model on disk yet: build it, then persist it.
    sim = similarity(source_list)
    save_model('model.pickle', sim)

In [41]:
# Which aspect seed word is closest to 'lokasi'? Displays (score, aspect).
sim.most_similar_aspect('lokasi')

(0.5582087723073736, 'pelayanan')

In [48]:
# Raw model similarity between two arbitrary words, bypassing the seed lists.
sim.model.wv.similarity('lokasi', 'cozy')

0.027694934614891756

In [38]:
# Example input for most_polar: each pair is (term matched against the aspect
# seeds, term matched against the polarity seeds).
term_example = [
    ('bakso', 'enak'),
    ('tahu', 'enek'),
    ('harga', 'mahal'),
    ('bon', 'murah'),
    ('lokasi', 'ujung'),
    ('pelayan', 'ramah'),
]

# Choose the best polarity for every aspect

In [39]:
# Indonesian seed word -> English aspect label used in the results.
aspects_map = {
    'makanan': 'FOOD',
    'pelayanan': 'SERVICE',
    'harga': 'PRICE',
    'suasana': 'AMBIANCE',
}
# Indonesian seed word -> English polarity label used in the results.
polarities_map = {
    'baik': 'POSITIVE',
    'buruk': 'NEGATIVE',
}

def most_polar(term_list):
    """Tally polarity scores per aspect for a list of term pairs.

    For each pair, ``term[0]`` is matched to an aspect seed and ``term[1]``
    to a polarity seed through the global ``sim`` wrapper. The polarity
    similarity score is then added to the running tally of that
    (aspect, polarity) cell. The aspect similarity score itself is not used.

    Args:
        term_list: iterable of indexable pairs ``(aspect_term, opinion_term)``.

    Returns:
        ``{aspect_label: {polarity_label: (score_sum, count)}}`` with one
        entry for every label in ``aspects_map`` / ``polarities_map``; cells
        that received no term stay at ``(0, 0)``.
    """
    # Build the empty tally from the label maps so the two cannot drift apart.
    res = {
        aspect_label: {polarity_label: (0, 0) for polarity_label in polarities_map.values()}
        for aspect_label in aspects_map.values()
    }
    for term in term_list:
        _, aspect_seed = sim.most_similar_aspect(term[0])
        score, polarity_seed = sim.most_similar_polarities(term[1])
        # The lookups can return a label that is not a seed word (the ""
        # sentinel). The original indexed the maps blindly and died with
        # KeyError; such a term carries no usable evidence, so skip it.
        if aspect_seed not in aspects_map or polarity_seed not in polarities_map:
            continue
        aspect = aspects_map[aspect_seed]
        polarity = polarities_map[polarity_seed]
        running_total, running_count = res[aspect][polarity]
        res[aspect][polarity] = (score + running_total, running_count + 1)
    return res

def determine_polarity(polar_list):
    """Reduce per-aspect polarity tallies to a single label per aspect.

    Args:
        polar_list: mapping ``aspect -> {'POSITIVE': (score_sum, count),
            'NEGATIVE': (score_sum, count)}``. It is not modified.

    Returns:
        A new dict ``aspect -> 'POSITIVE' | 'NEGATIVE' | None``. The label
        with the higher mean score (``score_sum / count``) wins. A label with
        ``count == 0`` has no evidence and loses to any label that has some.
        ``None`` means neither label has evidence, or both means are equal.
    """
    def _mean(tally):
        # Guard on the count, not on the sum: a zero count is what would make
        # the division fail. None marks "no observations".
        return tally[0] / tally[1] if tally[1] else None

    # The original overwrote polar_list in place, so calling it a second time
    # on the same dict (e.g. re-running the cell) crashed. Build a fresh
    # result instead.
    verdicts = {}
    for aspect, tallies in polar_list.items():
        positive = _mean(tallies['POSITIVE'])
        negative = _mean(tallies['NEGATIVE'])
        if positive is None or negative is None:
            # At most one side was observed; it wins regardless of sign, so a
            # negative mean is not outranked by an empty tally's implicit 0.
            if positive is not None:
                verdict = 'POSITIVE'
            elif negative is not None:
                verdict = 'NEGATIVE'
            else:
                verdict = None
        elif positive > negative:
            verdict = 'POSITIVE'
        elif positive < negative:
            verdict = 'NEGATIVE'
        else:
            verdict = None
        verdicts[aspect] = verdict
    return verdicts

In [40]:
# Tally the example terms per aspect and polarity, then reduce every aspect
# to a single polarity label (the cell displays the resulting dict).
lists = most_polar(term_example)
determine_polarity(lists)

{'AMBIANCE': None,
 'FOOD': 'POSITIVE',
 'PRICE': 'NEGATIVE',
 'SERVICE': 'POSITIVE'}