# In [27]:
import spacy
import pandas as pd
import numpy as np
import pickle
import string
from gensim.models import KeyedVectors
from scipy import spatial

# Pre-trained embedding back-ends, loaded once at module level and used by
# run() to turn review tokens into vectors.

# GloVe vectors read through gensim's word2vec-format loader (the file name
# suggests the 300-dimensional 6B-token release — confirm against the file).
glove = KeyedVectors.load_word2vec_format('glove.6B.300d.w2vformat.txt')
# word2vec vectors stored in the binary word2vec format (hence binary=True).
w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# spaCy pipeline loaded from "model_wiki"; run() uses its document vectors for
# the 'fasttext' setting (presumably fastText wiki vectors — confirm).
nlp = spacy.load("model_wiki")

def cosine(x, y):
    """Return the cosine similarity of vectors ``x`` and ``y``.

    SciPy exposes the cosine *distance*; the similarity is its complement
    with respect to 1.
    """
    distance = spatial.distance.cosine(x, y)
    return 1 - distance

def rem_punct(word):
    """Return ``word`` with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return word.translate(deletion_table)

def load_category(category, tipe):
    """Load the pickled keyword vectors for one aspect category.

    The file read is ``../vector_keyword_aspek/<tipe>/<category>.pickle``,
    resolved relative to the current working directory.

    Args:
        category: Base name of the pickle file (without extension).
        tipe: Sub-directory name; run() passes the embedding back-end name.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    path = "../vector_keyword_aspek/" + tipe + '/' + category + ".pickle"
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # produced by a trusted process.
    # The context manager closes the handle even if unpickling fails.
    with open(path, "rb") as pickle_in:
        return pickle.load(pickle_in)

def similarity(category, word):
    """Return the mean cosine similarity between ``word`` and a category.

    Args:
        category: Iterable of vectors representing one category.
        word: Vector compared against every member of ``category``.

    Returns:
        The NumPy mean of the per-member cosine similarities.
    """
    scores = [cosine(member, word) for member in category]
    return np.mean(scores)

def sim_result(word):
    """Score ``word`` against the four aspect categories.

    Reads the module-level names ``ambience``, ``food``, ``service`` and
    ``value``; they must be defined at module scope before this is called.

    Returns:
        A 4-tuple of mean similarities in the order
        (ambience, food, service, value).
    """
    aspect_sets = (ambience, food, service, value)
    return tuple(similarity(aspect, word) for aspect in aspect_sets)

def cat_str(a, b, c, d):
    """Return the 1-based position of the strictly largest argument.

    Returns 1, 2, 3 or 4 when ``a``, ``b``, ``c`` or ``d`` respectively is
    strictly greater than the other three, and 5 when no argument is a
    strict maximum (for example on ties).
    """
    scores = (a, b, c, d)
    for position, candidate in enumerate(scores, start=1):
        rivals = scores[:position - 1] + scores[position:]
        if all(candidate > rival for rival in rivals):
            return position
    return 5
    
def str_cat(num):
    """Translate a numeric category code into its label.

    Codes 1-4 map to 'AMBIENCE', 'FOOD', 'SERVICE' and 'PRICES'; any other
    value yields the empty string.
    """
    code_labels = (
        (1, 'AMBIENCE'),
        (2, 'FOOD'),
        (3, 'SERVICE'),
        (4, 'PRICES'),
    )
    for code, label in code_labels:
        if num == code:
            return label
    return ''

# Embedding back-ends that run() knows how to query.
_SUPPORTED_SIMS = ('fasttext', 'w2v', 'glove')


def _token_vector(token, sim):
    """Return the embedding of ``token`` for back-end ``sim``, or None if OOV."""
    cleaned = rem_punct(token)
    if sim == 'fasttext':
        return nlp(cleaned).vector
    model = w2v if sim == 'w2v' else glove
    try:
        return model[cleaned]
    except KeyError:
        # gensim raises KeyError for out-of-vocabulary keys; anything else
        # is a real error and must not be swallowed.
        return None


def _predict_aspect(token, sim, categories):
    """Return the aspect label predicted for ``token`` ('' when undecided)."""
    vector = _token_vector(token, sim)
    if vector is None:
        # Out-of-vocabulary tokens score zero everywhere, which cat_str
        # treats as "no strict winner".
        scores = (0, 0, 0, 0)
    else:
        scores = tuple(similarity(vectors, vector) for vectors in categories)
    return str_cat(cat_str(*scores))


def run(input_file, output_file, sim):
    """Predict an aspect category for every term of every review in a CSV.

    For each input row, each non-empty ``|``-separated entry of the ``term``
    column is looked up (by substring match) among the space-separated,
    lower-cased tokens of ``review``. The first matching token is embedded
    with the back-end selected by ``sim`` and scored against the ambience,
    food, service and price keyword vectors loaded for that back-end. The
    resulting labels are joined with ``|`` into the ``predict`` column.

    The category vectors are passed explicitly to the scoring helper, so the
    vectors loaded for ``sim`` are the ones actually used.

    Args:
        input_file: CSV path with columns ``id``, ``review``, ``target``,
            ``category``, ``polarity`` and ``term``.
        output_file: CSV path to write. It is written once, after all rows
            are processed; an input with no rows yields a header-only file.
        sim: One of 'fasttext', 'w2v' or 'glove'.

    Raises:
        ValueError: If ``sim`` is not a supported back-end.
    """
    if sim not in _SUPPORTED_SIMS:
        raise ValueError(
            "sim must be one of %s, got %r" % (', '.join(_SUPPORTED_SIMS), sim)
        )

    # Order matters: it must line up with the codes understood by
    # cat_str/str_cat (1=ambience, 2=food, 3=service, 4=price).
    categories = tuple(
        load_category(name, sim)
        for name in ('ambience', 'food', 'service', 'price')
    )

    df = pd.read_csv(input_file)
    rows = []
    for record in df.to_dict('records'):
        predictions = []
        raw_terms = record['term']
        if isinstance(raw_terms, str):
            terms = raw_terms.lower().split('|')
            tokens = record['review'].lower().replace('  ', ' ').strip().split(' ')
            for term in terms:
                if term == '':
                    continue
                # Only the part before '!' is matched (presumably the term
                # text of a "term!annotation" entry — confirm).
                needle = term.split('!')[0]
                match = next((tok for tok in tokens if needle in tok), None)
                if match is not None:
                    predictions.append(_predict_aspect(match, sim, categories))
        else:
            # Missing term cell (NaN): keep the row but predict nothing.
            terms = ''

        rows.append({'id': record['id'],
                     'review': record['review'],
                     'target': record['target'],
                     'label': record['category'],
                     'predict': '|'.join(predictions),
                     'term': '|'.join(terms),
                     'polarity': record['polarity']})

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copies the whole frame on every call.
    columns = ['id', 'review', 'target', 'label', 'predict', 'term', 'polarity']
    pd.DataFrame(rows, columns=columns).to_csv(output_file)
        
# Run the pipeline once per embedding back-end, each writing its own CSV.
for output_name, backend in (
    ('output-glove.csv', 'glove'),
    ('output-w2v.csv', 'w2v'),
    ('output-fasttext.csv', 'fasttext'),
):
    run('input.csv', output_name, backend)