In [1]:
import re
from nltk import Tree
from pycorenlp import StanfordCoreNLP
import json
import nltk
from nltk.sem.logic import *
import requests
from nltk.corpus import sentiwordnet as swn
nltk.download('sentiwordnet')


# Shortcut for turning a logic-expression string into an NLTK Expression object.
read_expr = nltk.sem.Expression.fromstring
# CoreNLP client; a server is expected to be listening on localhost:9000.
nlp = StanfordCoreNLP('http://localhost:9000')
# Alias for the client's annotate call. Despite the name, pos_tag() below only
# requests the "pos" annotator through it.
dependency_parser = nlp.annotate

[nltk_data] Downloading package sentiwordnet to C:\Users\Farza
[nltk_data]     Nurifan\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


Babelfy WSD method

In [19]:
def babelfy(sentence):
    """Disambiguate ``sentence`` with the Babelfy HTTP API.

    Args:
        sentence: Raw sentence text to send to Babelfy.

    Returns:
        A list of ``(word, synset_name)`` tuples, one per Babelfy annotation,
        where ``word`` is the NLTK token at the annotation's start token index
        and ``synset_name`` is whatever ``babel_info`` returns for the
        annotation's BabelNet synset id.

    Raises:
        requests.HTTPError: if Babelfy answers with an error status.
    """
    import os  # local import so this cell does not depend on a new top-level import

    # NOTE(review): the API key is committed in the notebook. It is kept as a
    # fallback so existing runs keep working, but it should be rotated and
    # supplied through the BABELFY_API_KEY environment variable instead.
    api_key = os.environ.get('BABELFY_API_KEY', '96f82706-02ea-4c2f-8e0c-f34f8778139d')

    token_word = nltk.word_tokenize(sentence)

    # Let requests build the query string so spaces, '&', '#', '+' ... in the
    # sentence are URL-encoded instead of corrupting the request.
    r = requests.get(
        'https://babelfy.io/v1/disambiguate',
        params={'text': sentence, 'annRes': 'WN', 'lang': 'en', 'key': api_key},
    )
    r.raise_for_status()

    response = []
    for annotation in r.json():
        start = annotation['tokenFragment']['start']
        # Babelfy's token index is applied to NLTK's tokenisation; skip the
        # annotation when the two disagree and the index falls off the end.
        if start >= len(token_word):
            continue
        response.append((token_word[start], babel_info(annotation['babelSynsetID'])))

    return response

def babel_info(synset_id):
    """Look up a BabelNet synset and return its main sense in dotted form.

    Args:
        synset_id: BabelNet synset identifier, as returned by Babelfy.

    Returns:
        The ``mainSense`` field of the BabelNet response with every ``#``
        replaced by ``.``.

    Raises:
        requests.HTTPError: if BabelNet answers with an error status.
    """
    import os  # local import so this cell does not depend on a new top-level import

    # NOTE(review): committed credential kept only as a fallback; prefer the
    # BABELFY_API_KEY environment variable and rotate this key.
    api_key = os.environ.get('BABELFY_API_KEY', '96f82706-02ea-4c2f-8e0c-f34f8778139d')

    # params= URL-encodes the id (it contains ':' and may contain other
    # reserved characters) instead of splicing it into the URL by hand.
    r = requests.get(
        'https://babelnet.io/v5/getSynset',
        params={'id': synset_id, 'key': api_key},
    )
    r.raise_for_status()
    res = r.json()

    return res['mainSense'].replace('#', '.')
    

def get_score(sentence):
    """Attach SentiWordNet scores to every word Babelfy disambiguates.

    Returns a list of ``(word, synset_name, positive_score, negative_score)``
    tuples in the order ``babelfy(sentence)`` produced them.
    """
    def _scored_row(word, synset_name):
        # One SentiWordNet lookup per disambiguated word.
        senti = swn.senti_synset(synset_name)
        return (word, synset_name, senti.pos_score(), senti.neg_score())

    disambiguated = babelfy(sentence)
    return [_scored_row(word, synset_name) for (word, synset_name) in disambiguated]

In [3]:
# Adverbs that adverb_type() labels 'I' (intensifier).
# NOTE(review): 'at all' is two words; presumably it can never equal a single
# token passed to adverb_type() - confirm.
intensifier_adverb = ['absolutely', 'completely', 'extremely', 'highly', 'rather', 'really', 'very', 'so', 'too', 'totally', 'utterly', 'at all']
# Words that adverb_type() labels 'Ne' (negation).
negate_adverb = ['no', 'not', 'never', 'none', 'nobody']

In [4]:
def parser(expression):
    """Rewrite a bracketed parse string so it can be split on underscores.

    Spaces become underscores (except the few separators restored by the
    replacement chain), and round brackets that appear between ``<`` and ``>``
    are turned into curly braces. The final character is always copied
    unchanged.
    """
    normalized = expression.replace(' ', '_')
    # The ')_)' rule is listed twice on purpose: one pass misses overlapping hits.
    for needle, substitute in (
        ('>_(', '> ('),
        (')_(', ') ('),
        (')_)', ') )'),
        (')_)', ') )'),
        (' (', '('),
    ):
        normalized = normalized.replace(needle, substitute)

    brace_for = {'(': '{', ')': '}'}
    inside_angle = False
    pieces = []
    for ch in normalized[:-1]:
        if ch == '<':
            inside_angle = True
        if ch == '>':
            inside_angle = False
        pieces.append(brace_for.get(ch, ch) if inside_angle else ch)

    # Last character is appended as-is, whatever the bracket state.
    pieces.append(normalized[len(normalized) - 1])
    return ''.join(pieces)


def pos_tag(sentence):
    """Return the CoreNLP POS tag of every token in ``sentence`` as a flat list."""
    annotated = dependency_parser(sentence, properties={"outputFormat": "json", "annotators": "pos"})
    return [
        token['pos']
        for parsed_sentence in annotated['sentences']
        for token in parsed_sentence['tokens']
    ]


def insert_pos_tag(exp, pos, nltk_pos):
    """Insert real POS tags after each ``S P`` marker found in ``exp``.

    Every occurrence of the three characters ``'S P'`` (e.g. the gap between
    two ``POS`` placeholders) is rewritten so that the ``S`` is followed by
    ``' <pos[i]> <nltk_pos[i][1]>'``, where ``i`` counts the markers seen so
    far; all other characters are copied unchanged.

    Args:
        exp: Parse string containing the placeholders.
        pos: Sequence of tags (one per marker).
        nltk_pos: Sequence of ``(word, tag)`` pairs (one per marker).

    Returns:
        The rewritten string.

    Raises:
        IndexError: if ``exp`` holds more markers than ``pos``/``nltk_pos``
            have entries.
    """
    pieces = []
    count = 0
    for x, char in enumerate(exp):
        # Slicing never runs past the end, unlike exp[x+1]/exp[x+2], which
        # raised IndexError when an 'S' sat in the last two characters.
        if exp[x:x + 3] == 'S P':
            pieces.append('S ' + pos[count] + ' ' + nltk_pos[count][1])
            count += 1
        else:
            pieces.append(char)
    return ''.join(pieces)


def direction(exp):
    """Return the first slash (``'/'`` or ``'\\'``) found outside curly braces.

    A ``{`` starts a skipped region and the next ``}`` ends it (no nesting
    depth is tracked). Returns ``False`` when no such slash exists.
    """
    inside_braces = False
    for ch in exp:
        if ch == '}':
            inside_braces = False
        elif ch == '{':
            inside_braces = True
        elif not inside_braces and ch in ('/', '\\'):
            return ch
    return False


def is_type_raising(tree):
    """Tell whether ``tree`` looks like a type-raised node.

    The second underscore-separated field of ``str(tree)`` must contain a
    slash followed by a braced category holding the opposite slash, and the
    tree must have exactly one child.
    """
    category = str(tree).split('_')[1]

    # backslash ... {.../...}   or   slash ... {...\...}
    raising_shapes = (
        r'(.*?)\\(.*?){(.*?)/(.*?)}',
        r'(.*?)/(.*?){(.*?)\\(.*?)}',
    )
    looks_raised = any(re.search(shape, category) for shape in raising_shapes)

    child_count = sum(1 for _ in tree)
    return bool(child_count == 1 and looks_raised)

    
def map_wnpos_to_pennpos(pos):
    """Translate a WordNet POS letter (n/a/v/r) to a Penn tag; ``None`` otherwise."""
    penn_by_wordnet = {'n': 'NN', 'a': 'JJ', 'v': 'VB', 'r': 'RB'}
    return penn_by_wordnet.get(pos)
    
    
def find_word_in_swn(swn_score, word):
    """Return the first ``(word, synset, pos_score, neg_score)`` entry for ``word``.

    Returns ``None`` when no entry's first field equals ``word``.
    """
    matching = (
        (entry_word, synset, pos_score, neg_score)
        for (entry_word, synset, pos_score, neg_score) in swn_score
        if entry_word == word
    )
    return next(matching, None)
    
    
def polarity_with_score(pos, neg):
    """Pick the dominant polarity and scale its score by ten.

    Returns ``('P', round(pos*10))`` when positive dominates,
    ``('Ne', round(neg*10))`` when negative dominates, else ``('N', 0)``.
    """
    label, magnitude = 'N', 0
    if pos > neg:
        label, magnitude = 'P', round(pos * 10)
    elif pos < neg:
        label, magnitude = 'Ne', round(neg * 10)
    return (label, magnitude)
    
    
def adverb_type(word):
    """Classify an adverb: 'I' (intensifier), 'Ne' (negation) or 'N' (neither)."""
    # Intensifiers are checked first, exactly as before.
    for label, vocabulary in (('I', intensifier_adverb), ('Ne', negate_adverb)):
        if word in vocabulary:
            return label
    return 'N'
    
    
def pos_majority_voting(corenlp, nltk, babelfy):
    """Return the tag proposed most often by the three taggers.

    Ties are resolved in favour of the tag that was proposed first
    (CoreNLP, then NLTK, then Babelfy).
    """
    tally = {}
    for tag in (corenlp, nltk, babelfy):
        # First sighting registers the tag at 0; repeats bump the counter.
        tally[tag] = tally[tag] + 1 if tag in tally else 0

    # max() keeps the earliest-inserted entry among equal counts.
    winner, _ = max(tally.items(), key=lambda item: item[1])
    return winner


def chunk(tree):
    """Try to merge adjacent noun leaves of ``tree`` into one compound noun.

    Returns ``(True, expr)`` where ``expr`` is the parsed expression
    ``(<w1>+<w2>[+<w3>]_NN_Ne_0)`` when exactly two noun leaves are found among
    the direct children, or exactly three are collected in ``chunk3``;
    otherwise ``(False, None)``.

    A node counts as a noun leaf when the first underscore-separated field of
    its string form has ``'L'`` as third character and field 3 is ``'NN'``;
    field 6 is taken as the word (same field layout lambda_calculus reads).

    NOTE(review): the merged noun is labelled ``Ne`` with score 0 although the
    score is neutral - presumably ``N`` was intended; confirm.
    """
    # just collect the matches into arrays
    chunk = []
    chunk3 = []
    
    for subtree in tree:
        if type(subtree) == nltk.tree.Tree:
            
            # chunk temp array
            subtree_str_array = str(subtree).split('_')
            if subtree_str_array[0][2] == 'L':
                if subtree_str_array[3] == 'NN':
                    chunk.append(subtree_str_array[6])
                    # chunk3 is seeded with the first direct noun leaf only
                    if len(chunk3) == 0:
                        chunk3.append(subtree_str_array[6])
                    
            # While chunk3 holds just its seed, also scan this child's own
            # children (one level deeper) for noun leaves.
            if len(chunk3) == 1:
                for sub in subtree:
                    # chunk temp array
                    subtree_str_array3 = str(sub).split('_')
                    if subtree_str_array3[0][2] == 'L':
                        if subtree_str_array3[3] == 'NN':
                            chunk3.append(subtree_str_array3[6])
    
    # chunk noun phrase
    if len(chunk) == 2:
        chunk_str = '+'.join(chunk)
        return True, read_expr(r'(' + chunk_str + '_NN_Ne_0)')
    
    if len(chunk3) == 3:
        chunk_str = '+'.join(chunk3)
        return True, read_expr(r'(' + chunk_str + '_NN_Ne_0)')
    
    return False, None


def one_child(tree, swn_score):
    """Handle nodes that have exactly one ``Tree`` child.

    Returns ``(True, result, children)`` with the lambda-calculus result of the
    only child, or ``(False, None, children)`` otherwise. ``children`` is the
    list of direct children whose type is exactly ``nltk.tree.Tree``.
    """
    sub = [node for node in tree if type(node) == nltk.tree.Tree]
    if len(sub) != 1:
        return False, None, sub
    return True, lambda_calculus(sub[0], swn_score), sub


def type_raising(first, second, swn_score):    
    """Apply the type-raising rule to a pair of sibling subtrees.

    If ``first`` (checked first) or ``second`` is a type-raised node whose
    category's outermost slash is ``/``, its lambda term is wrapped as
    ``\\F x.F(x, <term>)`` and combined with the other sibling's term.

    Returns ``(True, simplified_expression)`` when the rule fired, otherwise
    ``(False, None)``.
    """
    if is_type_raising(first):
        if direction(str(first).split('_')[1]) == '/':
            # raised operand becomes the function, second the argument
            x = read_expr(r'\F x.F(x, ' + str(lambda_calculus(first, swn_score)) + ')')
            y = lambda_calculus(second, swn_score)
            return True, ApplicationExpression(x, y).simplify()
    
    if is_type_raising(second):
        # NOTE(review): this branch also tests for '/', same as the branch
        # above; a backward-raised second operand might be expected to use
        # '\\' - confirm.
        if direction(str(second).split('_')[1]) == '/':
            x = lambda_calculus(first, swn_score)
            y = read_expr(r'\F x.F(x, ' + str(lambda_calculus(second, swn_score)) + ')')
            return True, ApplicationExpression(x, y).simplify()
        
    return False, None
    

def rule_var(ccg, corenlp_pos, nltk_pos, word, swn_score):
    """Build the lambda term for a leaf from its tags and sentiment scores.

    Looks the word up in ``swn_score``, derives polarity and score, votes on
    the POS tag among CoreNLP, NLTK and the Babelfy synset, and delegates the
    construction of the term to ``rule_leaf``.
    """
    leaf_tags = ['PRP', 'FW', 'NN', 'LS']
    pos_score, neg_score, babelfy_pos = 0, 0, None

    swn_entry = find_word_in_swn(swn_score, word)
    if swn_entry:
        word, synset, pos_score, neg_score = swn_entry
        # Synset names look like "<lemma>.<pos>.<nn>"; the middle part is the POS letter.
        babelfy_pos = map_wnpos_to_pennpos(synset.split('.')[1])

    polarity, score = polarity_with_score(pos_score, neg_score)
    voted_pos = pos_majority_voting(corenlp_pos, nltk_pos, babelfy_pos)

    return rule_leaf(ccg, leaf_tags, voted_pos, word, polarity, score)


def rule_leaf(ccg, r_word, pos, word, polarity, score):
    """Return the logic expression that represents a single leaf.

    The shape depends on ``pos``: conjunctions become ``CC``; tags listed in
    ``r_word`` and adjectives become scored constants; verbs become a
    one-argument function (or identity for the copula category); adverbs carry
    their adverb type; ``of`` maps to itself; everything else is identity.
    """
    def scored(tag):
        # "<word>_<tag>_<polarity>_<score>"
        return word + '_' + tag + '_' + polarity + '_' + str(score)

    if pos == 'CC':
        return read_expr(r'CC')
    if pos in r_word:
        return read_expr(scored(pos))
    if 'JJ' in pos:
        return read_expr(scored('JJ'))
    if 'VB' in pos:
        if '{S[dcl]\\NP}/{S[adj]\\NP}' in ccg:
            return read_expr(r'\x.x')
        return read_expr(r'\X.' + scored('VB') + '(X)')
    if 'RB' in pos:
        # Adverbs come in three types. I: intensifier, Ne: negation, N: no effect on sentiment
        return read_expr(word + '_' + 'RB' + '_' + adverb_type(word))
    if word == 'of':
        return read_expr('of')
    return read_expr(r'\x.x')

    
def lambda_calculus(tree, swn_score):
    """Recursively translate a parse ``tree`` into a logic expression.

    Order of attempts: leaf rule, noun-phrase chunking, single-child
    pass-through, type raising, and finally ``deduction`` applied to the two
    first ``Tree`` children (the one whose category has more slash-separated
    parts is used as the function).

    Args:
        tree: Node whose string form uses '_' as field separator.
        swn_score: List of ``(word, synset, pos_score, neg_score)`` tuples.

    NOTE(review): a non-leaf node with no ``Tree`` children would raise
    IndexError at ``sub[0]`` - confirm this cannot occur.
    """
    tree_string = str(tree)
    
    # leaf
    if tree_string[2] == 'L':
        splitted = tree_string.split('_')
        ccg = splitted[1]
        corenlp_pos = splitted[3]
        nltk_pos = splitted[4]
        word = splitted[6]
        
        return rule_var(ccg, corenlp_pos, nltk_pos, word, swn_score)
    
    # chunk noun phrase
    is_true, res = chunk(tree)
    if is_true:
        return res
            
    # single child
    is_true, res, sub = one_child(tree, swn_score)
    if is_true:
        return res
                        
    # order of the lambda-calculus operands
    first = sub[0]
    second = sub[1]
    
    # type raising
    is_true, res = type_raising(first, second, swn_score)
    if is_true:
        return res    
    
    # ordering: the operand with more slash-separated category parts goes first
    length_1 = len(str(sub[0]).split('_')[1].replace('\\', '/').split('/'))
    length_2 = len(str(sub[1]).split('_')[1].replace('\\', '/').split('/'))
    if length_2 > length_1:
        first = sub[1]
        second = sub[0]
    
    # recursion
    return deduction(lambda_calculus(first, swn_score), lambda_calculus(second, swn_score))

In [28]:
def bool_var(str_a, str_b):
    """Probe two expression strings for adjective/adverb/noun/verb markers.

    Returns a 6-tuple of ``re.Match`` objects or ``None``:
    (adjective in a, adjective in b, adverb in a, adverb in b,
    noun in b, verb in b).
    """
    adjective_marker, adverb_marker = r'JJ.*', r'RB.*'
    return (
        re.search(adjective_marker, str_a),
        re.search(adjective_marker, str_b),
        re.search(adverb_marker, str_a),
        re.search(adverb_marker, str_b),
        re.search(r'\w*?\+?\w*?\+?\w*_NN_\w*', str_b),
        re.search(r'VB_\w*_\d*', str_b),
    )


def rule_and(str_a, str_b, is_noun_in_b, is_adj_in_a):
    """Conjunction rule.

    * When ``str_b`` is the bare ``CC`` marker, operands are swapped so ``CC``
      becomes the function.
    * When ``str_a`` is a ``CC(...)`` with a comma, contains an adjective, and
      ``str_b`` contains a noun, the adjectives in ``str_a`` are replaced by
      the noun (see ``replacer``) and applied through identity.

    Returns ``(fired, a, b)``; ``a`` and ``b`` are ``None`` when not fired.
    """
    if str_b == 'CC':
        return True, read_expr(str_b), read_expr(str_a)

    is_conjunction_pair = re.search(r'CC\(', str_a) and re.search(r',', str_a)
    if is_conjunction_pair and is_noun_in_b and is_adj_in_a:
        return True, read_expr(r'\x.x'), read_expr(replacer(str_a, str_b))

    return False, None, None


def rule_seq(str_a, str_b):
    """Sequence rule for two flat (bracket-free) terms.

    If neither string contains ``(`` and both mention one of the tags
    PRP/FW/NN/LS/JJ, they are joined as ``seq(a,b)`` and applied through
    identity. Returns ``(fired, a, b)``.
    """
    if '(' in str_a or '(' in str_b:
        return False, None, None

    sequence_tags = ['PRP', 'FW', 'NN', 'LS', 'JJ']
    tagged_a = any(tag in str_a for tag in sequence_tags)
    tagged_b = any(tag in str_b for tag in sequence_tags)
    if not (tagged_a and tagged_b):
        return False, None, None

    return True, read_expr(r'\x.x'), read_expr('seq(' + str_a + ',' + str_b + ')')


def _first_adverb_kind(expr_str):
    """Return the type suffix of the first ``RB_<type>`` token ('N' if none is found)."""
    found = re.findall(r'RB_\w*', expr_str)
    return found[0].split('_')[1] if found else 'N'


def _rewrite_third_field(expr_str, transform):
    """Apply ``transform`` to the third '_'-separated field of ``expr_str``, if present."""
    fields = expr_str.split('_')
    if len(fields) > 2:
        fields[2] = transform(fields[2])
    return '_'.join(fields)


def _double_first_score(expr_str):
    """Double the numeric score of the first JJ token, or of the first VB token if there is no JJ."""
    for tag in ('JJ', 'VB'):
        doubled, hits = re.subn(
            r'(' + tag + r'_[A-Za-z]+_)(\d+)',
            lambda m: m.group(1) + str(int(m.group(2)) * 2),
            expr_str,
            count=1,
        )
        if hits:
            return doubled
    return expr_str


def ini_fungsi_mas_ari_tolong_dipecah_lagi(str_a, str_b, is_adj_in_a, is_adj_in_b, is_adverb_in_a, is_adverb_in_b, is_noun_in_b, is_verb_in_b):
    """Adverb rules: how an adverb in ``str_a`` modifies the term ``str_b``.

    All rules require an adverb in ``str_a``; they are tried in this order:

    1. ``str_b`` has a noun and no verb: both operands become identity.
    2. ``str_b`` has an adjective or a verb:
       * adverb type ``N``  - ``str_b`` is kept unchanged;
       * adverb type ``Ne`` - the polarity field of ``str_b`` is flipped
         (``P`` <-> ``Ne``);
       * adverb type ``I``  - the score of the first adjective (or, failing
         that, verb) token is doubled, e.g.
         ``very_RB_I`` + ``excellent_JJ_P_10`` -> ``excellent_JJ_P_20``.
    3. ``str_b`` has an adverb: a negation in ``str_a`` turns an intensifier in
       ``str_b`` into a negation; every other combination keeps ``str_b``.

    Args:
        str_a, str_b: String forms of the two expressions being combined.
        is_adj_in_a: Unused; kept for signature compatibility.
        is_adj_in_b, is_adverb_in_a, is_adverb_in_b, is_noun_in_b,
        is_verb_in_b: Truthy/falsy flags as produced by ``bool_var``.

    Returns:
        ``(True, a, b)`` with the expressions to apply when a rule fired,
        otherwise ``(False, None, None)``.

    Changes from the previous implementation:
        * Intensifiers used to compare each *character* of ``str_b`` with the
          score string, so two-digit scores were never doubled and one-digit
          scores doubled every equal digit anywhere in the expression. Only
          the targeted token's score is doubled now.
        * An unrecognised adverb type is treated as neutral instead of
          returning ``(True, None, None)``.
        * Regex literals are raw strings, so no invalid-escape warnings.
    """
    if not is_adverb_in_a:
        return False, None, None

    # Rule 1: adverb next to a bare noun phrase contributes nothing.
    if is_noun_in_b and not is_verb_in_b:
        return True, read_expr(r'\x.x'), read_expr(r'\x.x')

    # Rule 2: adverb modifies an adjective or a verb.
    if is_adj_in_b or is_verb_in_b:
        kind = _first_adverb_kind(str_a)
        if kind == 'Ne':
            # negation only swaps the polarity label, the score is kept
            str_b = _rewrite_third_field(str_b, lambda p: {'P': 'Ne', 'Ne': 'P'}.get(p, p))
        elif kind == 'I':
            # intensifier scales the sentiment score by two
            str_b = _double_first_score(str_b)
        return True, read_expr(r'\x.x'), read_expr(str_b)

    # Rule 3: adverb modifies another adverb (rare).
    if is_adverb_in_b:
        if _first_adverb_kind(str_a) == 'Ne' and _first_adverb_kind(str_b) == 'I':
            # "not very": the intensifier inherits the negation type
            str_b = _rewrite_third_field(str_b, lambda _p: 'Ne')
        return True, read_expr(r'\x.x'), read_expr(str_b)

    return False, None, None


def deduction(a, b):
    """Combine two expressions by function application after rewrite rules.

    The adverb rule, the conjunction rule and the sequence rule are each
    evaluated against the *original* string forms of ``a`` and ``b``; every
    rule that fires overwrites ``a``/``b``, so the last one to fire wins.
    The operands before and after rewriting and the result are printed.

    Returns:
        ``ApplicationExpression(a, b).simplify()`` for the final operands.
    """
    str_a = str(a)
    str_b = str(b)
    print('old a ' + str_a)
    print('old b ' + str_b)
    
    # 'of' contributes nothing: both operands collapse to identity.
    # str_a/str_b are not refreshed, so the rules below still see the old text.
    if str_a == 'of':
        a = read_expr(r'\x.x')
        b = read_expr(r'\x.x')
    
    # flags used by the adverb / conjunction rules
    is_adj_in_a, is_adj_in_b, is_adverb_in_a, is_adverb_in_b, is_noun_in_b, is_verb_in_b = bool_var(str_a, str_b)
    
    # adverb rules
    is_true, a_temp, b_temp = ini_fungsi_mas_ari_tolong_dipecah_lagi(str_a, str_b, is_adj_in_a, is_adj_in_b, is_adverb_in_a, is_adverb_in_b, is_noun_in_b, is_verb_in_b)
    if is_true:
        a = a_temp
        b = b_temp
    
    # and
    is_true, a_temp, b_temp = rule_and(str_a, str_b, is_noun_in_b, is_adj_in_a)
    if is_true:
        a = a_temp
        b = b_temp
                
    # sequence
    is_true, a_temp, b_temp = rule_seq(str_a, str_b)
    if is_true:
        a = a_temp
        b = b_temp  
    
    str_a = str(a)
    str_b = str(b)
    print('new a ' + str_a)
    print('new b ' + str_b)
    # 'hasil' = result; the application is simplified once for the log line
    # and once more for the return value.
    print('hasil ' + str(ApplicationExpression(a, b).simplify()))
    print()
    return ApplicationExpression(a, b).simplify()


In [6]:
def replacer(a, b):
    """Substitute the adjective prefixes in ``a`` with a noun prefix from ``b``.

    Every ``<word>_JJ_`` prefix found in ``a`` is replaced by a
    ``<noun>_NN_`` prefix found in ``b`` (compound nouns joined with ``+`` are
    supported). The substitution restarts from ``a`` for each noun, so when
    ``b`` holds several nouns only the last one ends up in the result.

    Args:
        a: Expression string containing adjective tokens.
        b: Expression string containing noun tokens.

    Returns:
        The rewritten copy of ``a``, or ``''`` when ``b`` has no noun prefix
        or ``a`` has no adjective prefix.
    """
    # Raw strings: '\w' and '\+' are invalid escapes in ordinary literals.
    noun_prefixes = re.findall(r'(\w*?\+?\w*?\+?\w*_NN_)', b)
    # Loop-invariant: the adjectives of `a` do not depend on the noun.
    adjective_prefixes = re.findall(r'(\w*_JJ_)', a)

    res = ''
    for noun_prefix in noun_prefixes:
        if adjective_prefixes:
            res = a.replace(adjective_prefixes[0], noun_prefix)
        for adjective_prefix in adjective_prefixes[1:]:
            res = res.replace(adjective_prefix, noun_prefix)
    return res

In [29]:
def preprocessing(sentence):
    """Normalise a review sentence before parsing.

    Steps: expand the contractions ``'m`` -> ``am`` and ``n't`` -> ``not``,
    drop every character that is neither a word character nor whitespace, and
    collapse runs of spaces into one. Leading/trailing spaces are not stripped.

    Contractions attached to the previous word (``i'm``, ``didn't``) are first
    split off, so they expand to ``i am`` / ``did not`` instead of being glued
    into ``iam`` / ``didnot``. Input that is already tokenised (``i 'm``,
    ``did n't``) is handled exactly as before.

    Args:
        sentence: Raw sentence text.

    Returns:
        The cleaned sentence.
    """
    # Detach a contraction that directly follows a word character.
    spaced = re.sub(r"(?<=\w)('m|n't)\b", r" \1", sentence)
    expanded = spaced.replace("'m", "am").replace("n't", "not")
    without_punctuation = re.sub(r'[^\w\s]', '', expanded)
    return re.sub(' +', ' ', without_punctuation)

In [24]:
def glue_process(sent):
    """Run the whole pipeline on one sentence and return its lambda expression.

    The sentence is cleaned, CCG-parsed by the local service on port 5000,
    annotated with CoreNLP and NLTK POS tags, converted to an ``nltk.Tree``
    and finally reduced with ``lambda_calculus`` using SentiWordNet scores.
    """
    cleaned = preprocessing(sent)

    payload = {"sent": cleaned}
    ccg_tree = requests.post("http://localhost:5000/ccgParsing", data=payload).json()['tree']

    nltk_tags = nltk.pos_tag(nltk.word_tokenize(cleaned))
    corenlp_tags = pos_tag(payload['sent'])

    # placeholders -> real tags, then bracket normalisation for Tree.fromstring
    bracketed = parser(insert_pos_tag(ccg_tree, corenlp_tags, nltk_tags))
    tree = Tree.fromstring(bracketed)
    print("hit2222")

    return lambda_calculus(tree, get_score(cleaned))

In [9]:
import pandas as pd

def semua(collection):
    """Process every sentence in ``collection`` and export the results.

    For each sentence the lambda expression is computed with ``glue_process``
    and the noun tokens matching the aspect pattern are extracted. Successful
    rows are written to ``hasil_ccg.csv``; sentences that raised an exception
    are written to ``hasil_err_ccg.csv``.

    Args:
        collection: Iterable of sentence strings.

    Returns:
        DataFrame with columns ``sentence``, ``lambda``, ``raw_aspect``,
        ``aspect`` and ``sentiment`` (1 when the aspect polarity is ``P``,
        else 0).
    """
    columns = ['sentence', 'lambda', 'raw_aspect', 'aspect', 'sentiment']
    # NOTE(review): [P|Ne] is a character class (one of 'P', '|', 'N', 'e'),
    # so tokens labelled 'Ne' never match while neutral 'N' tokens do, and \d
    # accepts a single digit only. Presumably (?:P|Ne)_\d+ was intended;
    # left unchanged because it alters the exported results - confirm.
    pattern = r'(\w*?\+?\w*_NN_[P|Ne]_\d)'

    # Rows are gathered in lists and turned into frames once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    records = []
    failures = []
    for data in collection:
        try:
            hasil = glue_process(data)

            aspek = re.findall(pattern, str(hasil))
            aspect = []
            sentiment = []
            for asp in aspek:
                fields = asp.split('_')
                aspect.append(fields[0].replace('+', ' '))
                sentiment.append(1 if fields[2] == 'P' else 0)
        except Exception:
            # Best effort: remember the sentence and carry on. Unlike a bare
            # `except:`, Ctrl-C (KeyboardInterrupt) still stops the run.
            failures.append({'sentence': data})
            continue

        records.append({'sentence': data, 'lambda': hasil, 'raw_aspect': aspek, 'aspect': aspect, 'sentiment': sentiment})

    df = pd.DataFrame(records, columns=columns)
    df_err = pd.DataFrame(failures, columns=['sentence'])

    df.to_csv('hasil_ccg.csv')
    df_err.to_csv('hasil_err_ccg.csv')
    return df

In [10]:
# Input file: read from the working directory; the 'review' and 'target'
# columns are used below.
df = pd.read_csv("dataset.csv")
# Select the review text and its label column.
sentences = df['review']
# NOTE(review): `labels` is not used anywhere in the visible cells.
labels = df["target"]

# Run the pipeline on the first 300 reviews; also writes hasil_ccg.csv and
# hasil_err_ccg.csv as a side effect of semua().
semua(sentences[:300])

Unnamed: 0,sentence,lambda,raw_aspect,aspect,sentiment
0,i recently purchased the canon powershot g3 an...,"CC(satisfied_VB_P_10(purchase_NN_N_0),purchase...",[purchase_NN_N_0],[purchase],[0]
1,"the camera is very easy to use , in fact on a ...",easy_JJ_P_12(was_VB_N_0(asked_VB_N_0(take_VB_N...,"[picture_NN_N_0, week_NN_N_0, trip_NN_N_0, fac...","[picture, week, trip, fact, camera]","[0, 0, 0, 0, 0]"
2,"after i took their picture with their camera ,...","camera_NN_N_0(took_VB_N_0(picture_NN_N_0),i_FW...","[camera_NN_N_0, picture_NN_N_0, picture_NN_N_0]","[camera, picture, picture]","[0, 0, 0]"
3,"i just told them , press halfway , wait for th...","told_VB_N_0(them_PRP_N_0,wait_VB_N_0(box_NN_N_...","[box_NN_N_0, press_NN_N_0]","[box, press]","[0, 0]"
4,they fired away and the picture turned out qui...,"CC(have_VB_P_2(\x.x,\x.x,\X.turned_VB_N_0(X),p...",[picture_NN_N_0],[picture],[0]
5,a few of my work constituants owned the g2 and...,CC(recommended_VB_Ne_2(seq(picture+quality_NN_...,"[canon_NN_N_0, g2_NN_N_0]","[canon, g2]","[0, 0]"
6,i 'm easily enlarging pictures to 8 1/2 x 11 w...,"m_NN_N_0(CC(using_VB_N_0(seq(x_NN_N_0,x_NN_P_5...","[m_NN_N_0, x_NN_N_0, x_NN_P_5, setting_NN_N_0]","[m, x, x, setting]","[0, 0, 1, 0]"
7,"bottom line , well made camera , easy to use ,...",camera_NN_N_0(line_NN_P_6(use_VB_N_0(include_V...,"[camera_NN_N_0, line_NN_P_6, ability_NN_N_0, l...","[camera, line, ability, lense, flash, line, line]","[0, 1, 0, 0, 0, 1, 1]"
8,i 'd highly recommend this camera for anyone w...,d_FW_N_0(is_VB_N_0(looking_VB_N_0(CC(get_VB_N_...,"[quality_NN_N_0, flexibility_NN_P_4, anyone_NN...","[quality, flexibility, anyone, camera]","[0, 1, 0, 0]"
9,great job canon !,"seq(great_JJ_N_0,job+canon_NN_Ne_0)",[],[],[]


In [None]:
# glue_process('i recently purchased the canon powershot g3 and am extremely satisfied with the purchase')

In [None]:
# glue_process('the breakfast that the restaurant served daily was excellent')

In [None]:
# glue_process('i bought my canon g3 about a month ago and i have to say i am very satisfied')

In [22]:
# Show what preprocessing() produces for a few sample reviews.
for a in sentences[4:10]:
    print(preprocessing(a))

they fired away and the picture turned out quite nicely   as all of my pictures have thusfar   
a few of my work constituants owned the g2 and highly recommended the canon for picture quality  
i m easily enlarging pictures to 8 12 x 11 with no visable loss in picture quality and not even using the best possible setting as yet  super fine   
ensure you get a larger flash  128 or 256  some are selling with the larger flash  32mb will do in a pinch but you ll quickly want a larger flash card as with any of the 4mp cameras  
bottom line  well made camera  easy to use  very flexible and powerful features to include the ability to use external flash and lense  filters choices  
i d highly recommend this camera for anyone who is looking for excellent quality pictures and a combination of ease of use and the flexibility to get advanced with many options to adjust if you like  


In [11]:
# glue_process("i 'm easily enlarging pictures to 8 1/2 x 11 with no visable loss in picture quality and not even using the best possible setting as yet ( super fine )")

In [30]:
# Spot check: contraction expansion ("n't", "'m") and punctuation removal.
preprocessing("i did n't think i would find this quality and ease of use for under $ 1500 - i 'm thrilled with my purchase !")

'i did not think i would find this quality and ease of use for under 1500 i am thrilled with my purchase '

In [13]:
# glue_process("they fired away and the picture turned out quite nicely")