In [39]:
import re
from nltk import Tree
from pycorenlp import StanfordCoreNLP
import json
import nltk
from nltk.sem.logic import *
import requests
from nltk.corpus import sentiwordnet as swn
nltk.download('sentiwordnet')


read_expr = nltk.sem.Expression.fromstring
nlp = StanfordCoreNLP('http://localhost:9000')
dependency_parser = nlp.annotate

[nltk_data] Downloading package sentiwordnet to C:\Users\Farza
[nltk_data]     Nurifan\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


Babelfy WSD method

In [40]:
def babelfy(sentence):
    response = []
    token_word = nltk.word_tokenize(sentence)
    url = 'https://babelfy.io/v1/disambiguate?text='+sentence+'&annRes=WN&lang=en&key=76787a80-1771-41d6-8879-2e5064008923'
    r = requests.get(url)
    
    res = r.json()
    
    for x in res:
        tokenFragment = x['tokenFragment']
        startTknFragment = tokenFragment['start']
        endTknFragment = tokenFragment['end']
        babelSynsetID = x['babelSynsetID'];
        response.append((token_word[startTknFragment], babel_info(babelSynsetID)))
    
    return response

def babel_info(synset_id):
    url = 'https://babelnet.io/v5/getSynset?id='+synset_id+'&key=76787a80-1771-41d6-8879-2e5064008923'
    r = requests.get(url)
    res = r.json()
    return '.'.join(res['mainSense'].split('#'))
    

def get_score(sentence):
    resp = babelfy(sentence)
    ss = []
    for (w, sy) in resp:
        swn_senti = swn.senti_synset(sy)
        ss.append((w, sy, swn_senti.pos_score(), swn_senti.neg_score() ))
        
    return ss

In [41]:
intensifier_adverb = ['absolutely', 'completely', 'extremely', 'highly', 'rather', 'really', 'very', 'so', 'too', 'totally', 'utterly', 'at all']
negate_adverb = ['no', 'not', 'never', 'none', 'nobody']

In [122]:
def parser(expression):
    old = expression.replace(' ', '_').replace('>_(', '> (').replace(')_(', ') (').replace(')_)', ') )').replace(')_)', ') )').replace(' (', '(')
    new = ''
    flag = False
    for x in range(0, len(old) - 1):        
        if old[x] == '<':
            flag = True
        if old[x] == '>':
            flag = False
            
        if flag == True:
            if old[x] == '(':
                new += '{'
            elif old[x] == ')':
                new += '}'
            else:
                new += old[x]
        else:
            new += old[x]
    new += old[len(old)-1]
    return new

def pos_tag(sentence):
    result = dependency_parser(sentence, properties={"outputFormat": "json", "annotators": "pos"})['sentences'][0]['tokens']
    res = []
    for pos in result:
        res.append(pos['pos'])
    return res


def insert_pos_tag(exp, pos, nltk_pos):
    count = 0
    res = ''
    for x in range(0, len(exp)):
        if exp[x] == 'S' and exp[x+1]==' ' and exp[x+2] == 'P':
            res += 'S '
            res += pos[count] + ' ' + nltk_pos[count][1]
            count += 1
            x += 4
        else:
            res += exp[x]  
    return res


def direction(exp):
    cont = False
    for x in exp:
        if x == '{':
            cont = True
        elif x == '}':
            cont = False
            continue
        if cont == True:
            continue
        if x == '/':
            return '/'
        elif x == '\\':
            return '\\'
    return False


def is_type_raising(tree):
    tree_string = str(tree)
    
    # check type raising
    exp = tree_string.split('_')[1]
    pattern_1 = r'(.*?)\\(.*?){(.*?)/(.*?)}'
    pattern_2 = r'(.*?)/(.*?){(.*?)\\(.*?)}'
    
    match = False
    if re.search(pattern_1, exp):
        match = True
    elif re.search(pattern_2, exp):
        match = True
        
    sub = []
    for subtree in tree:
        sub.append(subtree)
    if len(sub) == 1 and match:
        return True
    else:
        return False

def map_wnpos_to_pennpos(pos):
    if(pos == 'n'):
        return 'NN'
    elif(pos == 'a'):
        return 'JJ'
    elif(pos == 'v'):
        return 'VB'
    elif(pos == 'r'):
        return 'RB'
    
def find_word_in_swn(swn_score, word):
    for (wordd, synset, pos_score, neg_score) in swn_score:
        if(wordd == word):
            return (wordd, synset, pos_score, neg_score)
        
def polarity_with_score(pos, neg):
    if(pos > neg):
        return ('P', round(pos * 10))
    elif(pos == neg):
        return ('Ne', round(neg*10))
    else:
        return ('N', 0)
    
def adverb_type(word):
    if (word in intensifier_adverb):
        return 'I'
    elif(word in negate_adverb):
        return 'Ne'
    else:
        return 'N'
    
def pos_majority_voting(corenlp, nltk, babelfy):
    pos = {}
    
    if(corenlp in pos):
        pos[corenlp] += 1
    else:
        pos[corenlp] = 0
        
    
    if(nltk in pos):
        pos[nltk] += 1
    else:
        pos[nltk] = 0
    
    
    if(babelfy in pos):
        pos[babelfy] += 1
    else:
        pos[babelfy] = 0
        
        
    #find biggeest counter in pos 
    return sorted(pos.items(), key=lambda x: x[1], reverse=True)[0][0];

    
def lambda_calculus(tree, swn_score):
    tree_string = str(tree)
    
    if tree_string[2] == 'L':
        corenlp_pos = tree_string.split('_')[3]
        nltk_pos = tree_string.split('_')[4]
        word = tree_string.split('_')[6]
        #                        #
        # masukin rulenya disini #
        #                        #
        r_word = ['PRP', 'FW', 'NN', 'LS']
        word_swn_score = find_word_in_swn(swn_score, word)
        pos_score = 0
        neg_score = 0
        
        babelfy_pos = None;
        
        if(word_swn_score):
            word, synset, pos_score, neg_score = word_swn_score
            splitted_syns = synset.split('.');
            babelfy_pos = map_wnpos_to_pennpos(splitted_syns[1]);
            
        
        polarity, score = polarity_with_score(pos_score, neg_score)
        
        pos = pos_majority_voting(corenlp_pos, nltk_pos, babelfy_pos)
            
        if pos == 'CC':
            return read_expr(r'CC')
        elif pos in r_word:
            return read_expr(word + '_' + pos + '_' + polarity + '_' + str(score))
        elif 'JJ' in pos:
            return read_expr(word + '_' + 'JJ' + '_' + polarity + '_' + str(score))
        elif 'VB' in pos:
            if '{S[dcl]\\NP}/{S[adj]\\NP}' in tree_string.split('_')[1]:
                return read_expr(r'\x.x')
            else:
                return read_expr(r'\X.' + word + '_' + 'VB' + '_' + polarity + '_' + str(score) +'(X)')
        elif 'RB' in pos:
            #Adverb has three types. I: Intensifier, Ne: Negation, N: no affection toward sentiment
            return read_expr(word + '_' + 'RB' + '_' + adverb_type(word))
        elif word == 'of':
            return read_expr('of')
        else:
#             print(word, tree_string, '--')
            return read_expr(r'\x.x')
    
    # ini cuma masukin ke array 
    chunk = []
    chunk3 = []
    sub = []
    
    for subtree in tree:
        if type(subtree) == nltk.tree.Tree:
            sub.append(subtree)
            
            # chunk temp array
            subtree_str_array = str(subtree).split('_')
            if subtree_str_array[0][2] == 'L':
                if subtree_str_array[3] == 'NN':
                    chunk.append(subtree_str_array[6])
                    if len(chunk3) == 0:
                        chunk3.append(subtree_str_array[6])
                    
            if len(chunk3) == 1:
                for sub in subtree:
                    # chunk temp array
                    subtree_str_array3 = str(sub).split('_')
                    if subtree_str_array3[0][2] == 'L':
                        if subtree_str_array3[3] == 'NN':
                            chunk3.append(subtree_str_array3[6])
    
    
    # chunk noun phrase
    if len(chunk) == 2:
        chunk_str = '+'.join(chunk)
        return read_expr(r'(' + chunk_str + '_NN_Ne_0)')
    
    if len(chunk3) == 3:
        chunk_str = '+'.join(chunk3)
        return read_expr(r'(' + chunk_str + '_NN_Ne_0)')
    
    # error anak 1
    if len(sub) == 1:            
        return lambda_calculus(sub[0], swn_score)
                        
    # urutan operasi lambda calculusnya    
    first = sub[0]
    second = sub[1]
    
    if is_type_raising(first):
        if direction(str(first).split('_')[1]) == '/':
            x = read_expr(r'\F x.F(x, ' + str(lambda_calculus(first, swn_score)) + ')')
            y = lambda_calculus(second, swn_score)
            return ApplicationExpression(x, y).simplify()
    
    if is_type_raising(second):
        if direction(str(second).split('_')[1]) == '/':
            x = lambda_calculus(first, swn_score)
            y = read_expr(r'\F x.F(x, ' + str(lambda_calculus(second, swn_score)) + ')')
            return ApplicationExpression(x, y).simplify()
    
        
    length_1 = len(str(sub[0]).split('_')[1].replace('\\', '/').split('/'))
    length_2 = len(str(sub[1]).split('_')[1].replace('\\', '/').split('/'))
    if length_2 > length_1:
        first = sub[1]
        second = sub[0]
    # rekursi
    return deduction(lambda_calculus(first, swn_score), lambda_calculus(second, swn_score))

In [124]:
def deduction(a, b):
    str_a = str(a)
    str_b = str(b)
    print('old a ' + str_a)
    print('old b ' + str_b)
    
    if str_a == 'of':
        a = read_expr(r'\x.x')
        b = read_expr(r'\x.x')
    
    #change identity function for adjective
    is_adj_exist_in_a = re.search(r'JJ.*', str_a)
    is_adj_exist_in_b = re.search(r'JJ.*', str_b)
    
    is_adverb_exist_in_a = re.search(r'RB.*', str_a)
    is_adverb_exist_in_b = re.search(r'RB.*', str_b)
    
    
    is_noun_exist_in_b = re.search(r'\w*?\+?\w*?\+?\w*_NN_\w*', str_b)
    is_verb_exist_in_b = re.search(r'VB_\w*_\d*', str_b)
    
    
    if is_adverb_exist_in_a and (is_noun_exist_in_b and (not is_verb_exist_in_b)):
        print(is_adverb_exist_in_a)
        print(is_noun_exist_in_b)
        print(is_verb_exist_in_b)
        a = read_expr(r'\x.x')
        b = read_expr(r'\x.x')

#     elif( is_adj_exist_in_a and is_noun_exist_in_b):
# #         adjective_score = str_a.split('_')[3]
# #         sentiment_polarity = str_a.split('_')[2]
#         pattern = '(\w*_JJ_\w_\d)'
#         adjective_score = re.findall(pattern, str_a)[0].split('_')[3]
#         sentiment_polarity = re.findall(pattern, str_a)[0].split('_')[2]
#         print(adjective_score, sentiment_polarity)
#         #get noun
#         noun_str_b = str_b[ is_noun_exist_in_b.start() : is_noun_exist_in_b.end() ]
#         #change sentiment and polarity
#         def mapFunction(data):
#             idx, x = data
#             if(idx == 2):
#                 return sentiment_polarity
#             elif(idx == 3):
#                 return adjective_score
#             else:
#                 return x
            
#         noun_update_str_b = '_'.join( list(map(mapFunction, enumerate(noun_str_b.split('_')))) )
#         #change str_b for noun filtered with x
#         list_str_b = list(str_b)
#         list_str_b[is_noun_exist_in_b.start() : is_noun_exist_in_b.end()] = 'x'
#         str_b = "".join(list_str_b)
#         print(str_b, noun_update_str_b, is_noun_exist_in_b)
#         str_b = str(ApplicationExpression(read_expr(r"\x." + str_b), read_expr(noun_update_str_b)).simplify())
        
#         a = read_expr(r'\x.x')
#         b = read_expr(str_b)
    
    #change identity function for adverb.
    elif( is_adverb_exist_in_a and is_adj_exist_in_b or is_adverb_exist_in_a and is_verb_exist_in_b ):
        #adverb modify adjective
        #example very_RB_I excellent_JJ_P_10 = excellent_JJ_P_20
        #I Intensifier must *2
        #N Negate must *-1
        adverb_type = str_a.split('_')[2]
        if(adverb_type == 'N'):
            a = read_expr(r'\x.x')
        elif(adverb_type == 'Ne'):
            #for negation adverb just change polairty
            def mapFunction(data):
                idx, x = data
                if(idx == 2):
                    if(x == 'P'):
                        return 'Ne'
                    elif(x == 'Ne'):
                        return 'P'
                    else:
                        return x
                else:
                    return x

            str_b = '_'.join( list(map(mapFunction, enumerate(str_b.split('_')))) )
            a = read_expr(r'\x.x')
            b = read_expr(str_b)
            
        elif(adverb_type == 'I'):
            #for intensifier adverb. scale adjective value 
            vb_score = re.findall('VB_\w*_\d*', str_b)[0].split('_')[2]
            def mapFunction(data):
                if(data == vb_score):
                    return str(int(vb_score) * 2);
                else:
                    return data;
            list_str_b = list(str_b)
            str_b = ''.join( list(map(mapFunction, list_str_b)))
            print(str_b)
            a = read_expr(r'\x.x')
            b = read_expr(str_b)
        
    elif( is_adverb_exist_in_a and is_adverb_exist_in_b):
        #adverb modify other adverb
        #kondisi yang jarang bertemu
        pass
    
    if(str_a == 'CC'):
#         print('m')
        pass
    if(str_b == 'CC'):
        a = read_expr(str_b)
        b = read_expr(str_a)
    if (re.search(r'CC\(', str_a)) and (re.search(r',', str_a)) and is_noun_exist_in_b:
        a = read_expr(r'\x.x')
        b = read_expr(replacer(str_a, str_b))
        
    r_word = ['PRP', 'FW', 'NN', 'LS', 'JJ']
    
    if '(' not in str_a and '(' not in str_b:
            x = False
            y = False
            for r in r_word:
                if r in str_a:
                    x = True
                if r in str_b:
                    y = True
                    
            if x and y:
                a = read_expr(r'\x.x')
                b = read_expr('seq(' + str_a + ',' + str_b + ')')
                
    str_a = str(a)
    str_b = str(b)
    print('new a ' + str_a)
    print('new b ' + str_b)
    print('hasil ' + str(ApplicationExpression(a, b).simplify()))
    return ApplicationExpression(a, b).simplify()


In [119]:
def replacer(a, b):
#     print(a, b)
    pattern = '(\w*_NN_)'
    mereplace = re.findall(pattern, b)
    res = ''
    for mreplace in mereplace:
        pattern = '(\w*_JJ_)'
        hasil = re.findall(pattern, a)
        if hasil:
            res = a.replace(hasil[0], mreplace)
        if len(hasil) > 1:
            for index in range(1, len(hasil)):
                res = res.replace(hasil[index], mreplace)
    return res

In [112]:
def glue_process(sent):
    url = "http://localhost:5000/ccgParsing"
    data = {"sent": sent}
    r = requests.post(url, data=data)

    res = r.json()
    
    from_res = res['tree']
    
    text = nltk.word_tokenize(sent)
    nltk_pos = nltk.pos_tag(text)
    
    pos_tagged = insert_pos_tag(from_res, pos_tag(data['sent']), nltk_pos)

    hasil = parser(pos_tagged)

    tree = Tree.fromstring(hasil)
    swn_score = get_score(sent)
    
#     print(tree)
    return lambda_calculus(tree, swn_score)

In [113]:
# glue_process('the breakfast that the restaurant served daily was excellent')

In [114]:
# glue_process('i bought my canon g3 about a month ago and i have to say i am very satisfied')

In [115]:
glue_process("The bowl of squid eyeball stew is hot and delicious")

old a delicious_JJ_P_8
old b CC
new a CC
new b delicious_JJ_P_8
hasil CC(delicious_JJ_P_8)
old a CC(delicious_JJ_P_8)
old b hot_JJ_Ne_0
new a CC(delicious_JJ_P_8)
new b hot_JJ_Ne_0
hasil CC(delicious_JJ_P_8,hot_JJ_Ne_0)
old a \x.x
old b CC(delicious_JJ_P_8,hot_JJ_Ne_0)
new a \x.x
new b CC(delicious_JJ_P_8,hot_JJ_Ne_0)
hasil CC(delicious_JJ_P_8,hot_JJ_Ne_0)
old a of
old b squid+eyeball+stew_NN_N_0
new a \x.x
new b \x.x
hasil \x.x
old a \x.x
old b bowl_NN_Ne_0
new a \x.x
new b bowl_NN_Ne_0
hasil bowl_NN_Ne_0
old a \x.x
old b bowl_NN_Ne_0
new a \x.x
new b bowl_NN_Ne_0
hasil bowl_NN_Ne_0
old a CC(delicious_JJ_P_8,hot_JJ_Ne_0)
old b bowl_NN_Ne_0
CC(delicious_JJ_P_8,hot_JJ_Ne_0) bowl_NN_Ne_0 -- ['bowl_NN_']
new a \x.x
new b CC(bowl_NN_P_8,bowl_NN_Ne_0)
hasil CC(bowl_NN_P_8,bowl_NN_Ne_0)


<ApplicationExpression CC(bowl_NN_P_8,bowl_NN_Ne_0)>

In [116]:
pattern = '(\w*?\+?\w*_NN_[P|N]_\d)'
re.findall(pattern, 'CC(bowl_NN_P_8,bowl_NN_N_0)')

['bowl_NN_P_8', 'bowl_NN_N_0']

In [117]:
import pandas as pd


def semua(collection):
    df = pd.DataFrame(columns=['sentence', 'lambda', 'raw_aspect', 'aspect', 'sentiment'])
    for data in collection:
        hasil = glue_process(data)
        
        pattern = '(\w*?\+?\w*_NN_[P|N]_\d)'
        aspek = re.findall(pattern, str(hasil))
        aspect = []
        sentiment = []
        temp = ''
        for asp in aspek:
            temp = asp.split('_')
            aspect.append(temp[0].replace('+', ' '))
            sentiment.append(1 if temp[2] == 'P' else 0)
            
        df = df.append({'sentence': data, 'lambda': hasil,'raw_aspect': aspek, 'aspect': aspect, 'sentiment': sentiment}, ignore_index=True)
    df.to_csv('hasil_ccg.csv')
    return df

In [125]:
# input file
df = pd.read_csv("dataset.csv")
# preprocess
sentences = df['review']
labels = df["target"]

semua(sentences[0:10])

old a \x.x
old b purchase_NN_Ne_0
new a \x.x
new b purchase_NN_Ne_0
hasil purchase_NN_Ne_0
old a \x.x
old b purchase_NN_Ne_0
new a \x.x
new b purchase_NN_Ne_0
hasil purchase_NN_Ne_0
old a \X.satisfied_VB_P_5(X)
old b purchase_NN_Ne_0
new a \X.satisfied_VB_P_5(X)
new b purchase_NN_Ne_0
hasil satisfied_VB_P_5(purchase_NN_Ne_0)
old a extremely_RB_I
old b satisfied_VB_P_5(purchase_NN_Ne_0)
satisfied_VB_P_10(purchase_NN_Ne_0)
new a \x.x
new b satisfied_VB_P_10(purchase_NN_Ne_0)
hasil satisfied_VB_P_10(purchase_NN_Ne_0)
old a \x.x
old b satisfied_VB_P_10(purchase_NN_Ne_0)
new a \x.x
new b satisfied_VB_P_10(purchase_NN_Ne_0)
hasil satisfied_VB_P_10(purchase_NN_Ne_0)
old a satisfied_VB_P_10(purchase_NN_Ne_0)
old b CC
new a CC
new b satisfied_VB_P_10(purchase_NN_Ne_0)
hasil CC(satisfied_VB_P_10(purchase_NN_Ne_0))
old a \x.x
old b canon+powershot+g3_NN_Ne_0
new a \x.x
new b canon+powershot+g3_NN_Ne_0
hasil canon+powershot+g3_NN_Ne_0
old a \X.purchased_VB_Ne_0(X)
old b canon+powershot+g3_NN_Ne_0


old a \x.x
old b camera_NN_Ne_0
new a \x.x
new b camera_NN_Ne_0
hasil camera_NN_Ne_0
old a \x.x
old b camera_NN_Ne_0
new a \x.x
new b camera_NN_Ne_0
hasil camera_NN_Ne_0
old a \x.x
old b picture_NN_Ne_0
new a \x.x
new b picture_NN_Ne_0
hasil picture_NN_Ne_0
old a \X.took_VB_Ne_0(X)
old b picture_NN_Ne_0
new a \X.took_VB_Ne_0(X)
new b picture_NN_Ne_0
hasil took_VB_Ne_0(picture_NN_Ne_0)
old a camera_NN_Ne_0
old b took_VB_Ne_0(picture_NN_Ne_0)
new a camera_NN_Ne_0
new b took_VB_Ne_0(picture_NN_Ne_0)
hasil camera_NN_Ne_0(took_VB_Ne_0(picture_NN_Ne_0))
old a camera_NN_Ne_0(took_VB_Ne_0(picture_NN_Ne_0))
old b i_FW_Ne_0
new a camera_NN_Ne_0(took_VB_Ne_0(picture_NN_Ne_0))
new b i_FW_Ne_0
hasil camera_NN_Ne_0(took_VB_Ne_0(picture_NN_Ne_0),i_FW_Ne_0)
old a \x.x
old b camera_NN_Ne_0(took_VB_Ne_0(picture_NN_Ne_0),i_FW_Ne_0)
new a \x.x
new b camera_NN_Ne_0(took_VB_Ne_0(picture_NN_Ne_0),i_FW_Ne_0)
hasil camera_NN_Ne_0(took_VB_Ne_0(picture_NN_Ne_0),i_FW_Ne_0)
old a of
old b us_PRP_Ne_0
new a \x.x
ne

LogicalExpressionException: End of input found.  Expression expected.

^

In [121]:
glue_process('i recently purchased the canon powershot g3 and am extremely satisfied with the purchase')

old a \x.x
old b purchase_NN_Ne_0
new a \x.x
new b purchase_NN_Ne_0
hasil purchase_NN_Ne_0
old a \x.x
old b purchase_NN_Ne_0
new a \x.x
new b purchase_NN_Ne_0
hasil purchase_NN_Ne_0
old a \X.satisfied_VB_P_5(X)
old b purchase_NN_Ne_0
new a \X.satisfied_VB_P_5(X)
new b purchase_NN_Ne_0
hasil satisfied_VB_P_5(purchase_NN_Ne_0)
old a extremely_RB_I
old b satisfied_VB_P_5(purchase_NN_Ne_0)
satisfied_VB_P_10(purchase_NN_Ne_0)
new a \x.x
new b satisfied_VB_P_10(purchase_NN_Ne_0)
hasil satisfied_VB_P_10(purchase_NN_Ne_0)
old a \x.x
old b satisfied_VB_P_10(purchase_NN_Ne_0)
new a \x.x
new b satisfied_VB_P_10(purchase_NN_Ne_0)
hasil satisfied_VB_P_10(purchase_NN_Ne_0)
old a satisfied_VB_P_10(purchase_NN_Ne_0)
old b CC
new a CC
new b satisfied_VB_P_10(purchase_NN_Ne_0)
hasil CC(satisfied_VB_P_10(purchase_NN_Ne_0))
old a \x.x
old b canon+powershot+g3_NN_N_0
new a \x.x
new b canon+powershot+g3_NN_N_0
hasil canon+powershot+g3_NN_N_0
old a \X.purchased_VB_Ne_0(X)
old b canon+powershot+g3_NN_N_0
new 

<ApplicationExpression CC(satisfied_VB_P_10(purchase_NN_Ne_0),purchased_VB_Ne_0(canon+powershot+g3_NN_N_0),i_LS_Ne_0)>

In [100]:
p = re.search(r'VB_\w*_\d*', 'purchased_VB_Ne_0(canon+powershot+g3_NN_N_0)')
print(p)

<_sre.SRE_Match object; span=(10, 17), match='VB_Ne_0'>
