In [5]:
import pandas as pd
import numpy as np

from polyglot.text import Text

In [6]:
# Aspect keyword lists. posTaggerFeature counts a NOUN/VERB token toward an
# aspect when the token is found in that aspect's list. Lookup is a plain
# `in` test, so matching is exact and case-sensitive (all entries are lowercase).
# Lists are consulted in the order FOOD, SERVICE, PRICE, AMBIANCE; first hit wins.
# Note: 'keju' is listed twice in FOOD; harmless for membership tests.
FOOD = ['food', 'menu', 'nasi', 'bebek', 'daging', 'sambel', 'sate', 'teh', 'semur', 'iga', 'sapi', 'dessert', 'desserts', 'hamburger', 'alfredo', 'carbonara', 'waffle', 'kelapa', 'coco', 'ramen', 'broth', 'makanan', 'roti', 'sosis', 'potato', 'sauce', 'bbq', 'pizza', 'mie', 'beef', 'cake', 'chicken', 'bingsu', 'ice', 'bean', 'seafood', 'smoothie', 'coffee', 'snack', 'snacks', 'escargot', 'macaron', 'profiteroles', 'almond', 'chocolate', 'coconut', 'mango', 'tea', 'roll', 'salmon', 'teriyaki', 'onigiri', 'tuna', 'mochi', 'chip', 'soto', 'cuisine', 'ayam', 'dish', 'dishes', 'dimsum', 'brunch', 'bumbu', 'adonan', 'topping', 'martabak', 'keju', 'ovomaltine', 'keju', 'oreo', 'cream', 'cheese', 'kitkat', 'udang', 'mayo', 'lumpia', 'tahu', 'dumpling', 'hakau', 'goreng', 'kulit', 'cumi', 'hainam', 'bakso', 'kuah', 'dori', 'rasa', 'crab']
SERVICE = ['service', 'request', 'servis', 'pelayanan', 'pelayanannya', 'waiter', 'waiters', 'staff', 'serving', 'penyajian', 'penyajiannya', 'mas', 'portion', 'experience', 'operational']
PRICE =  ['price', 'harga', 'harganya', 'diskon']
AMBIANCE = ['place', 'suasana', 'suasananya', 'tempat', 'tempatnya', 'kios', 'kiosnya', 'ambiance', 'interior', 'interiornya', 'cafe', 'parkiran', 'here', 'sini', 'kesini', 'outlet', 'vibe', 'orang', 'penataan', 'penataannya', 'meja', 'kursi', 'dekorasi']
# Negation words: each occurrence inside the context window around an aspect
# keyword flips the sign of that window's polarity (see check_polarity).
NEGATE = ['not', 'tidak', 'jangan', 'no', 'nggak', 'gk', 'gak', 'kurang']

In [None]:
# Load the labelled reviews, keep the text plus the four aspect label columns,
# and store each label column as a pandas categorical.
data = (
    pd.read_csv('dataset/preProcessSentiment.csv', index_col=0)
    [['text', 'FOOD', 'PRICE', 'SERVICE', 'AMBIANCE']]
    .astype({
        'FOOD': 'category',
        'PRICE': 'category',
        'SERVICE': 'category',
        'AMBIANCE': 'category',
    })
)
data.head()

In [7]:
def posTaggerFeature(sentence):
    """Build adjective-count and per-aspect keyword/polarity features.

    Parameters
    ----------
    sentence : object
        Must expose ``pos_tags``, a list of ``(word, tag)`` pairs. In this
        notebook it is a polyglot ``Text``.

    Returns
    -------
    dict
        ``n_adj`` (number of tokens tagged ``ADJ``) plus, for each of food,
        service, price and ambiance, ``<aspect>_word_occ`` (count of
        NOUN/VERB tokens found in that aspect's keyword list) and
        ``<aspect>_polarity`` (sum of the context-window polarities of those
        tokens). All values are 0 if ``pos_tags`` cannot be obtained.
    """
    features = {
        'n_adj' : 0,
        'food_word_occ' : 0,
        'food_polarity' : 0,
        'service_word_occ' : 0,
        'service_polarity' : 0,
        'price_word_occ' : 0,
        'price_polarity' : 0,
        'ambiance_word_occ' : 0,
        'ambiance_polarity' : 0
    }

    try:
        lstPosTag = sentence.pos_tags
    except Exception:
        # Best-effort: if tagging fails, return the zeroed feature dict.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long run can still be interrupted.
        return features

    def check_polarity(idx, length=3):
        """Polarity of the up-to-`length` tokens on each side of token `idx`.

        The keyword token itself is excluded. Every word of the window found
        in NEGATE flips the sign once. Returns 0 if the polarity lookup fails.
        """
        before = lstPosTag[max(0, idx - length):idx]
        after = lstPosTag[idx + 1:idx + length + 1]  # slicing clamps at the end
        negate = 1
        aroundText = ''
        for word, _ in before + after:
            if word in NEGATE:
                negate *= -1
            aroundText += word + ' '
        try:
            return Text(aroundText).polarity * negate
        except Exception:
            # Treat a failed polarity lookup as a neutral window.
            return 0

    # Checked in this order; a token counts toward the first matching aspect only.
    # NOTE(review): lookup is case-sensitive while the keyword lists are all
    # lowercase -- confirm the input text is lowercased upstream.
    aspect_keywords = (
        ('food', FOOD),
        ('service', SERVICE),
        ('price', PRICE),
        ('ambiance', AMBIANCE),
    )

    for i, (word, posTag) in enumerate(lstPosTag):
        if posTag == 'ADJ':
            features['n_adj'] += 1
        elif posTag in ('NOUN', 'VERB'):
            for aspect, keywords in aspect_keywords:
                if word in keywords:
                    features[aspect + '_word_occ'] += 1
                    features[aspect + '_polarity'] += check_polarity(i)
                    break
    return features

def languageFeature(sentence):
    """Return the detected language code and detection confidence.

    Reads ``sentence.language.code`` and ``sentence.language.confidence`` and
    returns them under the keys ``language`` and ``language_confidence``.
    """
    code = sentence.language.code
    confidence = sentence.language.confidence
    return dict(language=code, language_confidence=confidence)

def sentimentFeature(sentence):
    """Return the whole-text polarity as ``{'text_polarity': value}``.

    Reads ``sentence.polarity``. If that lookup raises, the polarity falls
    back to 0 (best-effort). Only ``Exception`` subclasses are absorbed, so
    KeyboardInterrupt/SystemExit still propagate.
    """
    try:
        polarity = sentence.polarity
    except Exception:
        # Deliberate best-effort default: treat a failed lookup as neutral.
        polarity = 0

    return {'text_polarity' : polarity}

In [None]:
def tokenization(document):
    """Wrap every entry of `document` in a polyglot Text object."""
    return [Text(sentences) for sentences in document]

tokenize = tokenization(data['text'])

# One merged feature dict per review: language, POS/aspect, then sentiment keys.
features = []
for text in tokenize:
    merged = {}
    merged.update(languageFeature(text))
    merged.update(posTaggerFeature(text))
    merged.update(sentimentFeature(text))
    features.append(merged)
features

In [8]:
# Load the test reviews, using the CSV's first column as the index.
# This rebinds `data`, replacing any frame loaded by an earlier cell.
data = pd.read_csv('dataset/preprocessTest.csv', index_col=0)
data.head()

Unnamed: 0,text
0,Bakmie jurangmangu ini penyelamat anak kos dar...
1,"It was ALL GOOD. The food, the interior, the p..."
10,"Kesini pas jam 8, service nya kurang, pelayana..."
100,Chinese food yg wkt itu yg bayar lumayan oke.....
1000,Pertama kali makan disini. Setelah sering seka...


In [9]:
def tokenization(document):
    """Wrap every entry of `document` in a polyglot Text object."""
    return [Text(sentences) for sentences in document]

tokenize = tokenization(data['text'])

# One merged feature dict per review: language, POS/aspect, then sentiment keys.
features = []
for text in tokenize:
    merged = {}
    merged.update(languageFeature(text))
    merged.update(posTaggerFeature(text))
    merged.update(sentimentFeature(text))
    features.append(merged)

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

In [17]:
def processFeature(feature):
    """Turn one feature dict into a per-aspect sentiment label.

    For each of FOOD, SERVICE, PRICE and AMBIANCE:
      * 'UNKNOWN'  when the aspect's ``*_word_occ`` count is not above zero;
      * 'NEGATIVE' when the aspect was mentioned and its polarity is below zero;
      * 'POSITIVE' otherwise (positive or exactly neutral polarity).

    Returns a dict keyed by aspect name, in the order listed above.
    """
    aspect_keys = (
        ('FOOD', 'food_word_occ', 'food_polarity'),
        ('SERVICE', 'service_word_occ', 'service_polarity'),
        ('PRICE', 'price_word_occ', 'price_polarity'),
        ('AMBIANCE', 'ambiance_word_occ', 'ambiance_polarity'),
    )

    labels = {}
    for aspect, occ_key, polarity_key in aspect_keys:
        if not feature[occ_key] > 0:
            labels[aspect] = 'UNKNOWN'
            continue
        # A mentioned aspect defaults to POSITIVE unless polarity is negative.
        labels[aspect] = 'NEGATIVE' if feature[polarity_key] < 0 else 'POSITIVE'
    return labels


In [21]:
# Label every review's feature dict, keeping the same order as `features`.
labels = list(map(processFeature, features))

Unnamed: 0,text,AMBIANCE,FOOD,PRICE,SERVICE
0,Bakmie jurangmangu ini penyelamat anak kos dar...,,,,
1,"It was ALL GOOD. The food, the interior, the p...",,,,
10,"Kesini pas jam 8, service nya kurang, pelayana...",,,,
100,Chinese food yg wkt itu yg bayar lumayan oke.....,,,,
1000,Pertama kali makan disini. Setelah sering seka...,,,,
1001,"Paling enak nih kesini,apalagi buat nge date,s...",,,,
1002,This is my favourite place when i want to drin...,,,,
1003,Niatnya mau makan ditempat lain cuma kalo lewa...,,,,
1004,"Klo lagi pingin sop duren, pasti datengnya ke ...",,,,
1005,First time coming here on Saturday afternoon. ...,,,,


In [10]:
import xml.etree.ElementTree as ET

In [34]:
# Build the <corpus> tree: one <review rid=...> per row of `data`, holding the
# review text and an <aspects> list taken from the matching entry of `labels`.
corpus = ET.Element('corpus')
for i, (index, row) in enumerate(data.iterrows()):
    review = ET.SubElement(corpus, 'review', {'rid' : str(index)})
    text = ET.SubElement(review, 'text')
    text.text = row['text']
    aspects = ET.SubElement(review, 'aspects')

    # Keep only the aspects that received a label other than UNKNOWN.
    detected = [
        (aspect, labels[i][aspect])
        for aspect in ['FOOD', 'SERVICE', 'PRICE', 'AMBIANCE']
        if labels[i][aspect] != 'UNKNOWN'
    ]
    # A review with no detected aspect falls back to a single FOOD/POSITIVE entry.
    if not detected:
        detected = [('FOOD', 'POSITIVE')]

    for category, polarity in detected:
        aspectelem = ET.SubElement(aspects, 'aspect', {
            'category' : category,
            'polarity' : polarity
        })

In [35]:
# Serialise the corpus tree to res.xml. The file is opened in a `with` block so
# the handle is flushed and closed even if writing fails.
xmlFile = ET.ElementTree(corpus)
with open('res.xml', 'wb') as xml_out:
    xmlFile.write(xml_out)