In [1]:
import sys, os
import re
import pandas as pd
import itertools, nltk, string 
from nltk.parse.corenlp import CoreNLPDependencyParser

# Dependency-parser client; double_propagation calls parser.raw_parse(...) on every clause.
# NOTE(review): built with no arguments, so it relies on NLTK's default CoreNLP
# server URL - presumably the same local server get_tregex posts to; confirm.
parser = CoreNLPDependencyParser()

def read_file(file):
    """Load an annotated review file into a two-column DataFrame.

    Only lines containing the ``##`` separator are used. For each such line
    the text after the first ``##`` becomes ``review`` and the text before the
    last ``##`` becomes ``target`` (empty string when nothing precedes it).
    Lines with no text after the separator are skipped.

    Parameters
    ----------
    file : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``['review', 'target']``, one row per kept line, with a
        0..n-1 index.
    """
    pattern_sentence = '(?<=##).+'
    pattern_aspect = '.+(?=##)'
    rows = []
    # Context manager: the handle is closed even if parsing raises.
    with open(file, 'r') as f:
        for line in f:
            if '##' not in line:
                continue
            sentences = re.findall(pattern_sentence, line)
            if not sentences:
                # Separator with no review text after it: nothing to record.
                continue
            aspects = re.findall(pattern_aspect, line)
            rows.append((sentences[0], aspects[0] if aspects else ''))
    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    return pd.DataFrame(rows, columns=['review', 'target'])
        
# Opinion word lists; filled in by read_lexicon().
positive_lexicon = []
negative_lexicon = []

def _read_lexicon_words(filename, encoding=None):
    """Return the words of one lexicon file under ``opinion-lexicon-English/``.

    Lines starting with ``;`` (the file header) and blank lines are skipped;
    every other line contributes one word with its line ending removed.
    ``encoding`` is passed straight to ``open``.
    """
    path = os.path.join(os.path.abspath('opinion-lexicon-English/'), filename)
    words = []
    with open(path, 'r', encoding=encoding) as file:
        for line in file:
            word = line.rstrip("\n\r")
            # Filter by content instead of "drop whatever follows the header",
            # so the first word survives when there is no blank separator line.
            if not word.strip() or word.lstrip().startswith(';'):
                continue
            words.append(word)
    return words

def read_lexicon():
    """Load the positive and negative word lists into the module globals.

    Rebinds ``positive_lexicon`` and ``negative_lexicon`` to lists of words
    read from ``positive-words.txt`` and ``negative-words.txt`` (the latter
    decoded as ISO-8859-1) in ``opinion-lexicon-English/`` relative to the
    current working directory. Returns nothing.
    """
    global positive_lexicon
    global negative_lexicon
    positive_lexicon = _read_lexicon_words('positive-words.txt')
    negative_lexicon = _read_lexicon_words('negative-words.txt', encoding="ISO-8859-1")
        
# Fill the module-level positive_lexicon / negative_lexicon lists.
read_lexicon()
# Load one review file into a DataFrame with 'review' and 'target' columns.
df = read_file('dataset/bing_liu/Nokia 6610.txt')

import os
# Point the corenlp package at a local CoreNLP install. setdefault keeps a
# CORENLP_HOME that is already set in the environment instead of overwriting it
# with this machine-specific path.
os.environ.setdefault("CORENLP_HOME", r'C:\stanford-corenlp-full-2018-10-05')

import corenlp 
# Client used by preprocess() for TokensRegex queries.
client = corenlp.CoreNLPClient()

def preprocess(sentences, chunk = False):
    """Lowercase sentences and optionally hyphenate noun-noun chunks.

    Parameters
    ----------
    sentences : iterable of str
        Raw review sentences.
    chunk : bool, default False
        When true, the module-level ``client`` is queried with a TokensRegex
        pattern for runs of two or three ``NN`` tokens in the first sentence
        of the response, and each matched phrase is rewritten in the output
        with its spaces replaced by ``-``.

    Returns
    -------
    list of str
        One lowercased (and possibly hyphenated) string per input sentence.
        Chunking is best effort: if the query or the response handling fails,
        the sentence keeps whatever replacements were already applied.
    """
    pattern = '[{pos:NN}][{pos:NN}][{pos:NN}] | [{pos:NN}][{pos:NN}]'
    res = []
    for sentence in sentences:
        text = sentence.lower()
        if chunk:
            try:
                matches = client.tokensregex(sentence, pattern)
                response = matches['sentences'][0]
                for index in range(response['length']):
                    # The match comes from the original-cased sentence but is
                    # substituted into the lowercased text, so lowercase it
                    # first or phrases with capitals would never be found.
                    phrase = response[str(index)]['text'].lower()
                    text = text.replace(phrase, phrase.replace(' ', '-'))
            except Exception:
                # Deliberate best effort: keep the text as it stands.
                pass
        res.append(text)
    return res

import requests
import re

In [2]:
import requests

def get_tregex(text, tregex):
    """Run a Tregex pattern against ``text`` on the local CoreNLP server.

    Parameters
    ----------
    text : str
        Sentence to parse and match.
    tregex : str
        Tregex pattern, sent as the ``pattern`` query parameter.

    Returns
    -------
    dict or list
        The first element of the response's ``sentences`` list, or ``[]``
        (after printing ``err 1`` / ``err 2``) when the request fails or the
        response cannot be decoded.
    """
    url = "http://localhost:9000/tregex"
    request_params = {"pattern": tregex}
    # Send explicit UTF-8 bytes so non-ASCII text does not depend on the HTTP
    # layer's default encoding for str bodies.
    payload = text.encode('utf-8') if isinstance(text, str) else text
    try:
        r = requests.post(url, data=payload, params=request_params, timeout = 120)
    except requests.RequestException:
        print('err 1', text)
        return []
    
    try:
        return r.json()['sentences'][0]
    except (ValueError, KeyError, IndexError, TypeError):
        # Non-JSON body (e.g. an HTTP 500 page) or an unexpected payload shape.
        print('err 2', r, text)
        return []

def sentence_from_tree(s):
    """Rebuild plain text from a bracketed parse-tree string.

    Removes ``\\r\\n`` sequences, then collects every run that follows a space,
    starts with an ASCII letter and ends just before the next ``)``, and
    joins those runs with single spaces.
    """
    flattened = s.replace('\r\n', '')
    leaf_words = re.compile(r'(?<= )[a-zA-Z].*?(?=\))').findall(flattened)
    return ' '.join(leaf_words)
        
def sentence_type(clauses):
    """Classify a sentence from the labels of its clauses.

    ``clauses`` is a sequence whose items carry a label at index 1. Items
    labelled ``'IC'`` and ``'DC'`` are counted; any other label is ignored.

    Returns ``'phrase'`` when there is no ``'IC'`` item, otherwise one of
    ``'simple_sentence'``, ``'compound_sentence'``, ``'complex_sentence'`` or
    ``'compound_complex_sentence'`` depending on whether there is one or
    several ``'IC'`` items and whether any ``'DC'`` item is present.
    """
    labels = [item[1] for item in clauses]
    independent = labels.count('IC')
    dependent = labels.count('DC')

    if independent == 0:
        return 'phrase'
    if dependent == 0:
        return 'simple_sentence' if independent == 1 else 'compound_sentence'
    return 'complex_sentence' if independent == 1 else 'compound_complex_sentence'
    
def get_clauses(sentence):
    """Split a sentence into clauses labelled 'IC' or 'DC'.

    Queries get_tregex twice: 'S < (NP $ VP)' for clause candidates and
    'SBAR < S' for subordinate clauses. Candidates are trimmed of the
    subordinate text they contain, subordinate clauses are recorded as 'DC',
    the remaining candidates as 'IC'.

    Returns a list of (text, label) tuples ordered by where the text is found
    in ``sentence``. When nothing is extracted the result is the single tuple
    (sentence, 'Phrase').

    NOTE(review): this calls get_tregex with two arguments; a later cell
    redefines get_tregex with a single parameter, so which definition is
    live depends on cell execution order.
    """
    temp = []
    clauses = []
    
    res_all_clauses = get_tregex(sentence, 'S < (NP $ VP)') 
    res_sbar_clause = get_tregex(sentence, 'SBAR < S')
    # Separate each clause candidate from the subordinate (SBAR) clauses.
    # Matches are looked up by the string keys '0', '1', ...; get_tregex
    # returns [] on failure, which makes both loops no-ops.
    for x in range(0, len(res_all_clauses)):
        s = sentence_from_tree(res_all_clauses[str(x)]['match'])
        ic = True    
        for y in range(0, len(res_sbar_clause)):
            sbar = sentence_from_tree(res_sbar_clause[str(y)]['match'])            
            if sbar in s and sbar != s:
                # Candidate strictly contains the subordinate text: cut it out.
                s = s.replace(sbar, '')
                # The subordinate clause is recorded here only when it is the
                # sole SBAR match.
                if(len(res_sbar_clause) == 1 and sbar != ''):
                    temp.append([sbar.strip(), 'DC'])
            elif s in sbar and sbar != '':
                # Candidate lies inside a subordinate clause: record the
                # subordinate clause and do not keep the candidate as 'IC'.
                ic = False
                temp.append( [sbar.strip(), 'DC'])
        if ic:
            temp.append( [s.strip(), 'IC'] )

    # Remove from each entry the text of every later entry, so nested or
    # repeated clause text is kept only once.
    len_clause = len(temp)
    for x in range(0, len_clause):
        for y in range(x + 1, len_clause):
            temp[x][0] = temp[x][0].replace(temp[y][0], '').strip()
        
        # Collapse a double space left behind by the removals (single pass).
        temp[x][0] = re.sub(r"  ", " ", temp[x][0])
        if(temp[x][0] != ''):
            clauses.append( tuple(temp[x]) )
    
    # Nothing extracted: return the whole sentence labelled 'Phrase'.
    if(len(clauses) == 0):
        clauses.append((sentence, 'Phrase'))
    
    # Order by position in the sentence; text that is not found verbatim sorts
    # with key 999.
    return sorted(clauses, key=lambda clause: 999 if sentence.find(clause[0]) == -1 else sentence.find(clause[0]))

In [4]:
import requests
import re

def get_tregex(text, tregex="S < (NP $ VP)"):
    """Run a Tregex pattern against ``text`` on the local CoreNLP server.

    Parameters
    ----------
    text : str
        Sentence to parse and match.
    tregex : str, default "S < (NP $ VP)"
        Tregex pattern, sent as the ``pattern`` query parameter. The default
        keeps one-argument calls working; the optional argument lets callers
        that pass an explicit pattern use this definition as well.

    Returns
    -------
    dict or list
        The first element of the response's ``sentences`` list, or ``[]``
        (after printing ``err 1`` / ``err 2``) when the request fails or the
        response cannot be decoded.
    """
    url = "http://localhost:9000/tregex"
    request_params = {"pattern": tregex}
    # Send explicit UTF-8 bytes so non-ASCII text does not depend on the HTTP
    # layer's default encoding for str bodies.
    payload = text.encode('utf-8') if isinstance(text, str) else text
    try:
        # Timeout so a stalled server cannot hang the notebook indefinitely.
        r = requests.post(url, data=payload, params=request_params, timeout=120)
    except requests.RequestException:
        print('err 1', text)
        return []
    
    try:
        return r.json()['sentences'][0]
    except (ValueError, KeyError, IndexError, TypeError):
        # Non-JSON body (e.g. an HTTP 500 page) or an unexpected payload shape.
        print('err 2', r, text)
        return []

def sentence_from_tree(s):
    """Rebuild plain text from a bracketed parse tree, dropping WH-phrases.

    After removing ``\\r\\n`` sequences, the text between each ``WHADVP`` or
    ``WHNP`` label and the next ``)`` is deleted from the tree string. The
    remaining runs that follow a space, start with an ASCII letter and end
    before the next ``)`` are joined with single spaces.
    """
    tree = s.replace('\r\n', '')
    # Both searches run on the tree as it is before any deletion.
    wh_spans = (re.findall(r'(?<=WHADVP).*?(?=\))', tree)
                + re.findall(r'(?<=WHNP).*?(?=\))', tree))
    for span in wh_spans:
        tree = tree.replace(span, '')
    return ' '.join(re.findall(r'(?<= )[a-zA-Z].*?(?=\))', tree))

def get_clauses(sentences):
    """Return the clause texts that get_tregex finds in ``sentences``.

    Each match (looked up by the string keys '0', '1', ...) is converted to
    text with sentence_from_tree. Every clause except the last then has the
    text of the clause that follows it removed. Returns a list of strings,
    empty when there are no matches.
    """
    matches = get_tregex(sentences)
    count = len(matches)
    texts = [sentence_from_tree(matches[str(i)]['match']) for i in range(count)]
    # Strip from each clause the text of its immediate successor.
    for i in range(count - 1):
        texts[i] = texts[i].replace(texts[i + 1], '')
    return texts

In [4]:
# POS tags accepted by the rule_* functions: NN for noun candidates, JJ for
# adjective candidates. Membership is an exact tag match against these lists.
NN = ['NN']
JJ = ['JJ']

def rule_1_1(parse, f, o_expanded):
    """Rule 1.1: find a noun directly related to a known opinion word.

    Scans ``parse.triples()`` in order, considering only relations listed in
    ``dep_DP``. In such a triple, if one word is in ``o_expanded`` (the first
    word is checked before the second) and the other word is tagged as in
    ``NN`` and is not in ``f``, that noun is the result.

    Returns ``(True, noun, opinion_word)`` for the first hit, otherwise
    ``(False, None, None)``.
    """
    for first, relation, second in list(parse.triples()):
        if relation not in dep_DP:
            continue
        # Once one side is recognised as the opinion word, only the other
        # side of this triple is considered as a candidate.
        if first[0] in o_expanded:
            opinion, candidate = first, second
        elif second[0] in o_expanded:
            opinion, candidate = second, first
        else:
            continue
        if candidate[1] in NN and candidate[0] not in f:
            return True, candidate[0], opinion[0]
    return False, None, None

def rule_1_2(parse, f, o_expanded):
    """Rule 1.2: find a noun linked to an opinion word through a shared word.

    For each triple whose relation is in ``dep_DP`` and which contains a word
    from ``o_expanded``, the other word of the triple is taken as the link.
    All triples are then scanned for one that pairs the link with a word
    other than the opinion word; if that word is tagged as in ``NN`` and is
    not in ``f`` it is the result.

    Returns ``(True, noun, opinion_word)`` for the first hit, otherwise
    ``(False, None, None)``.
    """
    for left, relation, right in parse.triples():
        if relation not in dep_DP:
            continue
        if left[0] in o_expanded:
            opinion, link = left, right[0]
        elif right[0] in o_expanded:
            opinion, link = right, left[0]
        else:
            continue
        if not link:
            continue
        for a, _, b in list(parse.triples()):
            # Pick the word on the far side of the link, if this triple has one.
            if a[0] == link and b[0] != opinion[0]:
                candidate = b
            elif b[0] == link and a[0] != opinion[0]:
                candidate = a
            else:
                continue
            if candidate[1] in NN and candidate[0] not in f:
                return True, candidate[0], opinion[0]
    return False, None, None

def rule_4_1(parse, f, o_expanded):
    """Rule 4.1: find a new adjective conjoined with a known opinion word.

    Scans ``parse.triples()`` in order, considering only relations listed in
    ``conj_DP``. If one word of the triple is in ``o_expanded`` (first word
    checked before the second) and the other is tagged as in ``JJ`` and is
    not yet in ``o_expanded``, that adjective is the result. ``f`` is accepted
    but not used.

    Returns ``(True, adjective)`` for the first hit, otherwise
    ``(False, None)``.
    """
    for first, relation, second in list(parse.triples()):
        if relation not in conj_DP:
            continue
        if first[0] in o_expanded:
            candidate = second
        elif second[0] in o_expanded:
            candidate = first
        else:
            continue
        if candidate[1] in JJ and candidate[0] not in o_expanded:
            return True, candidate[0]

    return False, None

def rule_4_2(parse, f, o_expanded):
    """Rule 4.2: find a new adjective linked to an opinion word via a shared word.

    For each triple whose relation is in ``dep_DP`` or ``conj_DP`` and which
    contains a word from ``o_expanded``, the other word is taken as the link.
    All triples are then scanned for one pairing the link with a word other
    than the opinion word; if that word is tagged as in ``JJ`` and is not in
    ``o_expanded`` it is the result. ``f`` is accepted but not used.

    Returns ``(True, adjective)`` for the first hit, otherwise
    ``(False, None)``.
    """
    for left, relation, right in parse.triples():
        if not (relation in dep_DP or relation in conj_DP):
            continue
        if left[0] in o_expanded:
            opinion, link = left, right[0]
        elif right[0] in o_expanded:
            opinion, link = right, left[0]
        else:
            continue
        if not link:
            continue
        for a, _, b in list(parse.triples()):
            if a[0] == link and b[0] != opinion[0]:
                candidate = b
            elif b[0] == link and a[0] != opinion[0]:
                candidate = a
            else:
                continue
            if candidate[1] in JJ and candidate[0] not in o_expanded:
                return True, candidate[0]

    return False, None

def rule_3_1(parse, f):
    """Rule 3.1: find a new noun conjoined with a known aspect.

    Scans ``parse.triples()`` in order, considering only relations listed in
    ``conj_DP``. If one word of the triple is in ``f`` (first word checked
    before the second) and the other is tagged as in ``NN`` and is not in
    ``f``, that noun is the result.

    Returns ``(True, noun, known_aspect)`` for the first hit, otherwise
    ``(False, None, None)``.
    """
    for first, relation, second in list(parse.triples()):
        if relation not in conj_DP:
            continue
        if first[0] in f:
            known, candidate = first, second
        elif second[0] in f:
            known, candidate = second, first
        else:
            continue
        if candidate[1] in NN and candidate[0] not in f:
            return True, candidate[0], known[0]

    return False, None, None

def rule_3_2(parse, f):
    """Rule 3.2: find a new noun linked to a known aspect via a shared word.

    For each triple whose relation is in ``dep_DP`` or ``conj_DP`` and which
    contains a word from ``f``, the other word is taken as the link. All
    triples are then scanned for one pairing the link with a word other than
    the known aspect; if that word is tagged as in ``NN`` and is not in ``f``
    it is the result.

    Returns ``(True, noun, known_aspect)`` for the first hit, otherwise
    ``(False, None, None)``.
    """
    for left, relation, right in parse.triples():
        if not (relation in dep_DP or relation in conj_DP):
            continue
        if left[0] in f:
            known, link = left, right[0]
        elif right[0] in f:
            known, link = right, left[0]
        else:
            continue
        if not link:
            continue
        for a, _, b in list(parse.triples()):
            if a[0] == link and b[0] != known[0]:
                candidate = b
            elif b[0] == link and a[0] != known[0]:
                candidate = a
            else:
                continue
            if candidate[1] in NN and candidate[0] not in f:
                return True, candidate[0], known[0]

    return False, None, None

def rule_2_1(parse, f, o_expanded):
    """Rule 2.1: find a new adjective directly related to a known aspect.

    Scans ``parse.triples()`` in order, considering only relations listed in
    ``dep_DP``. If one word of the triple is in ``f`` (first word checked
    before the second) and the other is tagged as in ``JJ`` and is not in
    ``o_expanded``, that adjective is the result.

    Returns ``(True, adjective)`` for the first hit, otherwise
    ``(False, None)``.
    """
    for first, relation, second in list(parse.triples()):
        if relation not in dep_DP:
            continue
        if first[0] in f:
            candidate = second
        elif second[0] in f:
            candidate = first
        else:
            continue
        if candidate[1] in JJ and candidate[0] not in o_expanded:
            return True, candidate[0]

    return False, None

def rule_2_2(parse, f, o_expanded):
    """Rule 2.2: find a new adjective linked to a known aspect via a shared word.

    For each triple whose relation is in ``dep_DP`` and which contains a word
    from ``f``, the other word is taken as the link. All triples are then
    scanned for one pairing the link with a word other than the known aspect;
    if that word is tagged as in ``JJ`` and is not in ``o_expanded`` it is the
    result.

    Returns ``(True, adjective)`` for the first hit, otherwise
    ``(False, None)``.
    """
    for left, relation, right in parse.triples():
        if relation not in dep_DP:
            continue
        if left[0] in f:
            known, link = left, right[0]
        elif right[0] in f:
            known, link = right, left[0]
        else:
            continue
        if not link:
            continue
        for a, _, b in list(parse.triples()):
            if a[0] == link and b[0] != known[0]:
                candidate = b
            elif b[0] == link and a[0] != known[0]:
                candidate = a
            else:
                continue
            if candidate[1] in JJ and candidate[0] not in o_expanded:
                return True, candidate[0]

    return False, None

def clause(other_clause, temp, parse, f):
    """Collect fallback noun candidates from a clause's dependency parse.

    Only active when ``other_clause`` is truthy and ``temp`` is empty. In that
    case the first word of every triple from ``parse.triples()`` is collected
    if its tag is exactly ``'NN'`` and the word is not in ``f`` (duplicates
    are kept, in triple order).

    Returns ``(True, words)`` when at least one word was collected, otherwise
    ``(False, None)``.
    """
    if not (other_clause and len(temp) == 0):
        return False, None

    nouns = [head[0] for head, _, _ in list(parse.triples())
             if head[1] == 'NN' and head[0] not in f]
    if nouns:
        return True, nouns
    return False, None

In [5]:
import pandas as pd
# NOTE(review): candidate_aspect and new_opinion are not referenced by any
# other code visible in this notebook.
candidate_aspect = []
new_opinion = []
# Seed opinion lexicon: positive words followed by negative words. Built when
# this cell runs, so read_lexicon() must have been called beforehand.
op_set = positive_lexicon + negative_lexicon

def double_propagation(O, reviews, targets, output_file, save_to_file, using_clause):
    """Iteratively extract aspect words and new opinion words from reviews.

    Every pass parses each review (split into clauses on the first pass when
    ``using_clause`` is true) with the module-level ``parser`` and applies the
    ``rule_*`` functions. Passes repeat until one of them adds neither an
    aspect nor an opinion word.

    Parameters
    ----------
    O : iterable of str
        Seed opinion words. The input is copied, not modified.
    reviews : iterable of str
        Preprocessed review sentences; iterated once per pass, so it must be
        re-iterable (e.g. a list).
    targets : indexable
        Gold annotations aligned with ``reviews``; ``targets[i]`` is expected
        to be a ``', '``-separated string of ``aspect[polarity]`` items.
        Entries that cannot be read are treated as empty.
    output_file : str
        CSV path written when ``save_to_file`` is true.
    save_to_file : bool
        Whether to write sentences, gold aspects and predicted
        ``aspect!opinion`` pairs to ``output_file`` once propagation stops.
    using_clause : bool
        Whether to split sentences with ``get_clauses`` on the first pass and
        apply the ``clause`` fallback rule.

    Returns
    -------
    tuple(list, list)
        ``(aspects, opinions)``: every extracted aspect word (duplicates
        possible) and the seed opinion words followed by the newly found ones.
    """
    # Copy so the caller's seed lexicon is not grown as a side effect.
    o_expanded = list(O)
    f = []
    is_stop = False
    flag_cycle = 0

    t_a_p = []      # per sentence: '|'-joined predicted aspects
    pair_a_o = []   # per sentence: '|'-joined 'aspect!opinion' pairs
    a_p = []        # per sentence: '|'-joined gold aspects
    r_p = []        # sentences, in input order

    while not is_stop:
        index = 0
        stop_a = len(f)
        stop_o = len(o_expanded)

        for sentence in reviews:
            clauses = []
            if using_clause and flag_cycle == 0:
                found = get_clauses(sentence)
                if len(found) == 0:
                    clauses.append(sentence)
                else:
                    # Clauses are processed from last to first.
                    for x in range(len(found) - 1, -1, -1):
                        clauses.append(found[x])
            else:
                clauses.append(sentence)

            temp = []
            other_clause = False
            clause_op_word = ''
            pair = []

            for r in clauses:
                # get_clauses may yield plain strings or (text, label) tuples;
                # the parser and the word scan below both need the text.
                text = r[0] if isinstance(r, (tuple, list)) else r
                temp2 = []

                try:
                    parse = next(parser.raw_parse(text))
                except Exception:
                    # Best effort: skip clauses the parser cannot handle.
                    continue

                # Aspects already predicted for this sentence in earlier passes.
                aspect_sentence = []
                if flag_cycle:
                    aspect_sentence = t_a_p[index].split('|')

                # Rule 1.1
                is_true, t_a, o = rule_1_1(parse, aspect_sentence, o_expanded)
                if is_true:
                    f.append(t_a)
                    temp.append(t_a)
                    temp2.append(t_a)
                    pair.append(t_a + '!' + o)

                # Rule 1.2
                is_true, t_a, o = rule_1_2(parse, aspect_sentence, o_expanded)
                if is_true:
                    f.append(t_a)
                    temp.append(t_a)
                    temp2.append(t_a)
                    pair.append(t_a + '!' + o)

                # Rule 4.1
                is_true, t_o = rule_4_1(parse, f, o_expanded)
                if is_true:
                    o_expanded.append(t_o)

                # Rule 4.2
                is_true, t_o = rule_4_2(parse, f, o_expanded)
                if is_true:
                    o_expanded.append(t_o)

                # Rule 3.1
                is_true, t_a, o = rule_3_1(parse, aspect_sentence)
                if is_true:
                    f.append(t_a)
                    temp.append(t_a)
                    temp2.append(t_a)
                    pair.append(t_a + '!' + o)

                # Rule 3.2
                is_true, t_a, o = rule_3_2(parse, aspect_sentence)
                if is_true:
                    f.append(t_a)
                    temp.append(t_a)
                    temp2.append(t_a)
                    pair.append(t_a + '!' + o)

                # Rule 2.1
                is_true, t_o = rule_2_1(parse, f, o_expanded)
                if is_true:
                    o_expanded.append(t_o)

                # Rule 2.2
                is_true, t_o = rule_2_2(parse, f, o_expanded)
                if is_true:
                    o_expanded.append(t_o)

                if using_clause and flag_cycle == 0:
                    # Remember the first opinion word appearing in this clause.
                    flag_o = False
                    for word in text.split(' '):
                        if word in o_expanded:
                            clause_op_word = word
                            flag_o = True
                            break

                    # Clause fallback: when an earlier clause carried an
                    # opinion word but no aspect, take the first NN of this one.
                    is_true, t_a = clause(other_clause, temp, parse, aspect_sentence)
                    if is_true:
                        first_noun = t_a[0]
                        f.append(first_noun)
                        temp.append(first_noun)
                        pair.append(first_noun + '!' + clause_op_word)

                    if flag_o and len(temp2) == 0:
                        other_clause = True

            if flag_cycle == 0:
                r_p.append(sentence)
                a_array = []

                try:
                    gold_items = targets[index].split(', ')
                except Exception:
                    # Missing or non-string annotation: no gold aspects.
                    gold_items = []
                for x in gold_items:
                    splitted = x.split('[')
                    if len(splitted) > 1:
                        # Keep the aspect and the first character after '['.
                        a_array.append(splitted[0] + '!' + splitted[1][0])
                    else:
                        a_array.append(splitted[0])

                a_p.append('|'.join(a_array))
                t_a_p.append('|'.join(temp))
                pair_a_o.append('|'.join(pair))
            else:
                if len(temp) != 0:
                    t_a_p[index] += '|' + '|'.join(temp)
                    pair_a_o[index] += '|' + '|'.join(pair)
            index += 1

        flag_cycle = 1

        # Converged: this pass added nothing new.
        if len(f) == stop_a and len(o_expanded) == stop_o:
            if save_to_file:
                out = pd.DataFrame(r_p)
                out['aspect'] = a_p
                out['prediction'] = pair_a_o
                out.to_csv(output_file)
            is_stop = True

    return f, o_expanded

In [6]:
import pandas as pd
import re
import math
import numpy as np

def run(filename, save_file, path):
    """Run double propagation on one review file and save the predictions.

    Parameters
    ----------
    filename : str
        Review file passed to ``read_file``.
    save_file : str
        Name of the CSV file to write.
    path : str
        Prefix concatenated in front of ``save_file`` (typically a directory
        ending with a separator). Its directory part is created if missing.

    Returns
    -------
    tuple(list, list)
        The ``(aspects, opinions)`` pair returned by ``double_propagation``.
    """
    df = read_file(filename)

    output_file = path + save_file
    # Create the target directory up front so the CSV write at the end of the
    # long extraction run cannot fail on a missing folder.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    dp_aspect, opinion_expand = double_propagation(op_set, preprocess(df['review'], True),
                                                   df['target'], output_file, True, True)
    # Return the results instead of discarding them.
    return dp_aspect, opinion_expand

In [7]:
import pandas as pd

# Dataset files to process (names without the '.txt' extension).
# NOTE(review): every entry is commented out, so the loop below does nothing
# as written; the recorded output under this cell comes from a run in which
# 'Nokia 6610' was enabled.
filename = [
#             'Apex AD2600 Progressive-scan DVD player',
#             'Canon G3',
#             'Creative Labs Nomad Jukebox Zen Xtra 40GB',
#             'Nikon coolpix 4300',
#             'Nokia 6610',
#     'test data'
]

# Dependency relation labels read as globals by the rule_* functions at call
# time, so they must be defined before run() is invoked.
conj_DP = ['conj']
dep_DP = ['amod', 'prep', 'nsubj', 'csubj', 'xsubj', 'dobj', 'iobj']

# Process each file; predictions are written as '<name>-new-.csv' with the
# 'hasil_clause/' prefix.
for file in filename:
    print(file)
    run('dataset/bing_liu/' + file + '.txt', file + '-new-' + '.csv', 'hasil_clause/')

# dep_DP = ['amod', 'prep', 'nsubj', 'csubj', 'xsubj', 'dobj', 'iobj',
#           'advmod', 'dep', 'cop', 'mark', 'nsubjpass', 'pobj', 'acomp', 'xcomp', 'csubjpass', 'poss']
# for file in filename:
#     print(file)
#     run('dataset/bing_liu/' + file + '.txt', file + '.csv', 'hasil_clause_plus/')

Nokia 6610
err 2 <Response [500]> i am a business-user who heavily depend on mobile service . 
err 2 <Response [500]> i am a business-user who heavily depend on mobile service . 
err 2 <Response [500]> i have had the phone for 1 week , the signal-quality has been great in the detroit-area ( suburbs ) and in my recent road-trip between detroit and northern kentucky ( cincinnati ) i experienced perfect signal and reception along i-75 , far superior to at &#38; t 's which does not work along several long stretches on that same route . 
err 2 <Response [500]> i have had the phone for 1 week , the signal-quality has been great in the detroit-area ( suburbs ) and in my recent road-trip between detroit and northern kentucky ( cincinnati ) i experienced perfect signal and reception along i-75 , far superior to at &#38; t 's which does not work along several long stretches on that same route . 


In [14]:
# Manual spot-check of get_clauses. The recorded output below is a list of
# (text, label) tuples, i.e. it was produced by the get_clauses definition
# that labels clauses 'IC'/'DC', not the one that returns plain strings.
get_clauses("the zoom goes up and down in magnification , which is nice , but there is always a display of the zoom-size about one-quarter of the way down from the top of the screen that interferes with viewing at most settings . ")

[('the zoom goes up and down in magnification', 'IC'),
 ('there is always a display of the zoom-size about one-quarter of the way down from the top of the screen',
  'IC')]

In [17]:
# Manual spot-check of get_clauses; the recorded output below shows one 'IC'
# clause followed by two 'DC' clauses.
get_clauses("the panel seems like it will come off very easily as other people have said . ")

[('the panel seems', 'IC'),
 ('like it will come off very easily', 'DC'),
 ('as other people have said', 'DC')]

In [27]:
# Manual spot-check of get_clauses; the recorded output below shows three
# 'IC' clauses and a trailing 'DC' clause.
get_clauses("it does have a lense-cap , but it wo n't let you take pics with it on which is real good .  ")

[('it does have a lense-cap', 'IC'),
 ("it wo n't let", 'IC'),
 ('you take pics with it on', 'IC'),
 ('which is real good', 'DC')]

In [26]:
# Manual spot-check of get_clauses on an unpunctuated run-on sentence; in the
# recorded output below the second clause is labelled 'DC'.
get_clauses("the player is working it was great")

[('the player is working', 'IC'), ('it was great', 'DC')]