Read dataset

In [1]:
import sys, os
import re
import pandas as pd
import itertools, nltk, string 
nltk.download('wordnet')
#from transforms import flatten_deeptree

rx_dict = {
    'title': re.compile(r'\[t\](?P<title>.*)'),
    'review': re.compile(r'(?P<aspect>.*)##(?P<review>.*)')
}

def parse_data(file, data, reviews=[], aspects=[]):
    line = file.readline();
    if(line):
        match_title = rx_dict['title'].search(line);
        if match_title:
            data['title'].append(match_title.group('title'))
            data['domain'].append('canon g3')
            if(len(reviews) > 0 or len(aspects) > 0):
                data['review'].append("".join(reviews))
                data['aspect'].append(", ".join(aspects))
                reviews = []
        
        match_review = rx_dict['review'].search(line)
        if match_review:
            review_text = match_review.group('review');
            aspect_text = match_review.group('aspect');
            
            if(review_text):
                reviews.append(review_text);
            
            if(aspect_text):
                aspects.append(aspect_text);
                
        parse_data(file, data, reviews, aspects)
    else:
        if(len(reviews) > 0 or len(aspects) > 0):
            data['review'].append("".join(reviews))
            data['aspect'].append(", ".join(aspects))
    
data = {
    'title': [],
    'review': [],
    'aspect': [],
    'domain': []
}
    
def read_file():
    with open(os.path.join(os.path.abspath('dataset/bing_liu/') , 'Nikon coolpix 4300.txt'), 'r') as file:
        parse_data(file, data);
        
positive_lexicon = []
negative_lexicon = []

def read_lexicon():
    global positive_lexicon;
    global negative_lexicon;
    
    with open(os.path.join(os.path.abspath('opinion-lexicon-English/') , 'positive-words.txt'), 'r') as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
         
        positive_lexicon = file.readlines()
    
    with open(os.path.join(os.path.abspath('opinion-lexicon-English/') , 'negative-words.txt'), 'r', encoding = "ISO-8859-1") as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
        
        negative_lexicon = file.readlines()
        
    positive_lexicon = list(map(lambda word: word.rstrip("\n\r"), positive_lexicon))
    negative_lexicon = list(map(lambda word: word.rstrip("\n\r"), negative_lexicon))
    
        
read_file()
read_lexicon()

[nltk_data] Downloading package wordnet to C:\Users\Farza
[nltk_data]     Nurifan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import os
os.environ["CORENLP_HOME"] = r'C:\stanford-corenlp-full-2018-10-05'

import corenlp 
client = corenlp.CoreNLPClient()

def chunk_check(text, word):
    try:
        pattern = '{tag:/NN.*/} <compound {word:'+ word +';tag:/NN.*/}'
        matches = client.semgrex(text, pattern)
        res = matches['sentences']
        if len(res) == 1:
            if res[0]['length'] == 0:
                return word
        return res[0]['0']['text'] + ' ' + word
    except:
        return word

Loading Stanford CoreNLP

In [3]:
from nltk.parse.corenlp import CoreNLPDependencyParser

parser = CoreNLPDependencyParser()
parse = next(parser.raw_parse("my name is khan"))

In [4]:
def extract_candidate_chunks(text, grammar=r'''NP: {<NN.*><JJ>?<IN>?<PRP.*>?<NN.*>} 
                                            ...AP: {<JJ.*><.*>?<VB.*>+}'''):
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents=[]
    for sent in nltk.sent_tokenize(text):
        tagged_sents.append(nltk.pos_tag(nltk.word_tokenize(sent)))
    
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda chunk: chunk[2] != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]


In [5]:
import pandas as pd

df = pd.read_csv("dataset.csv")
df['review'][596]

'rather heavy for point and shoot but a great camera for semi pros . '

In [32]:
import requests
import re

def get_tregex(text):
    url = "http://localhost:9000/tregex"
    request_params = {"pattern": "S < (NP $ VP)"}
    r = requests.post(url, data=text, params=request_params)
    print(r)
    try:
        return r.json()['sentences'][0]
    except:
        return []

def sentence_from_tree(s):
    p_wh = r'(?<=WHADVP).*?(?=\))'
    p_wh2 = r'(?<=WHNP).*?(?=\))'
    pattern = r'(?<= )[a-zA-Z].*?(?=\))'
    replaced = s.replace('\r\n', '')
    wh = re.findall(p_wh, replaced)
    wh2 = re.findall(p_wh2, replaced)
    for x in wh:
        replaced = replaced.replace(x, '')
    for x in wh2:
        replaced = replaced.replace(x, '')
    res = ' '.join(re.findall(pattern, replaced))
    return res

def get_clauses(sentences):
    clauses = []
    res = get_tregex(sentences)
    length = len(res)
    for x in range(0, length):
        clauses.append(sentence_from_tree(res[str(x)]['match']))
    if length > 1:
        for x in range(0, length - 1):
            clauses[x] = clauses[x].replace(clauses[x+1], '')
    return clauses

## Double Propagation

### Rule 1.1 if a word A, whose POS is NN, is depended by an opinion word O through Dep, where Dep is one of the dependency relations amod, prep, nsubj, csubj, xsubj, dobj, and iobj, then A is an aspect.

### Rule 1.2 if an opinion word O and a word A, whose POS is NN, depend on a third word H through dependency relations Depi and Depj respectively, where Depi and Dep j are one of the relations amod, prep, nsubj, csubj, xsubj, dobj, and iobj, then A is an aspect.

### Rule 2.1 if a word O, whose POS is JJ (adjective), directly depends on an aspect A through dependency relation Dep, where Dep is one of the dependency rela- tions amod, prep, nsubj, csubj, xsubj, dobj, and iobj, then O is an opinion word.

### Rule 2.2 if a word O, whose POS is JJ, and an aspect A, directly depend on a third word H through relations Depi and Depj respectively, where Depi and Depj are one of the relations amod, prep, nsubj, csubj, xsubj, dobj, and iobj, then O is an opinion word.

### Rule 3.1 if a word Aj, whose POS is NN, directly depends on an aspect Ai through conj, then A j is an aspect

### Rule 3.2 if a word Aj, whose POS is NN, and an aspect Ai, directly depend on a third word H through the dependency relations De pi and De p j , where De pi and Depj are one of the relations in amod, prep, nsubj, csubj, xsubj, dobj, and conj, then A j is an aspect


### Rule 4.1 if a word Oj, whose POS is JJ, directly depends on an opinion word Oi through conj, then O j is an opinion word. 

### Rule 4.2 if a word Oj, whose POS is JJ, and an opinion word Oi, directly depend on a third word H through the dependance relations De pi and De p j , where Dep1 and Dep2 are one of the relations in amod, prep, nsubj, csubj, xsubj, dobj, and conj, then O j is an opinion word

In [7]:
# adding new dependencies for DP 'nmod' and'advmod'
dep_DP = ['amod', 'prep', 'nsubj', 'csubj', 'xsubj', 'dobj', 'iobj', 'advmod']
conj_DP = ['conj']

In [54]:
from pycorenlp import StanfordCoreNLP
import json
from nltk import Tree
nlp = StanfordCoreNLP('http://localhost:9000')
dependency_parser = nlp.annotate
def chunking(text):
    chunking_noun = extract_candidate_chunks(text, r'NP: {<NN.*|JJ.*><.*>?<NN>}') 
    chunking_adj = extract_candidate_chunks(text, r'AP: {<JJ.*|RB.*><TO>?<VB.*>}') 
  #  nlpParser.parser_annotators = 'tokenize,ssplit,pos,lemma,depparse'
    
    output = nlp.annotate(text, properties={
      'annotators': 'tokenize,ssplit,pos,depparse,parse',
      'outputFormat': 'json'
      })
    
    pp = Tree.fromstring(output['sentences'][0]['parse'])
   
    new_chunking = []
    for c in chunking_noun:
        for i in pp.subtrees(filter=lambda x: x.label() == 'NP'):
            lls = i.leaves()
            s = len(lls)
            match = 0
            for xx in lls:
                if xx in c:
                    match += 1

            if(match >= (s/2)):
                new_chunking.append(c)
        
                break

    for c in chunking_adj:
        #chunk_a.append(c)
        for i in pp.subtrees(filter=lambda x: x.label() == 'ADJP'):
            lls = i.leaves()
            s = len(lls)
            match = 0
            for xx in lls:
                if xx in c:
                    match += 1

            if(match >= (s/2)):
                new_chunking.append(c)
        
                break
        #new_chunking.append(c)

    for chunk in new_chunking:
        text = text.replace(chunk, '-'.join(chunk.split(' ')))
    
    return text

In [55]:
import pandas as pd
candidate_aspect = []
new_opinion = []
op_set = positive_lexicon + negative_lexicon

def double_propagation(O: op_set, reviews, using_chunking=True, using_objective_detection=False, save_to_file=False):
    o_expanded = O
    f = []
    is_stop = False
    flag_cycle = 0
    
    t_a_p = []
    a_p = []
    r_p = []
    
    while (not is_stop):
        f_i = []
        o_i = []
    
        index = 0
        for sent in reviews:
            sentences = []
            clauses = get_clauses(sent)
            length = len(clauses)
            if length == 0:
                sentences.append(sent)
            else:
                for x in range(length - 1, -1, -1):
                    sentences.append(clauses[x])
            
            other_clause = False
            for r in sentences:
                
                temp = []
                
                if using_chunking:
                    r = chunking(r)
                else:
                    r = r
                
                try:
                    parse = next(parser.raw_parse(r))
                except:
                    continue

                flag_o = False
                for word in r:
                    if word in o_expanded:
                        flag_o = True
                        break
                    
                #Rule 1.1
                for (w1, dep, w2) in list(parse.triples()):
                    if(dep in dep_DP):
                        # Rule 1.1
                        if(w1[0] in o_expanded):
                            if w2[1] == 'NN' and w2[0] not in f:
                                f_i.append(w2[0])
                                temp.append(w2[0])
                        elif(w2[0] in o_expanded):          
                            if w1[1] == 'NN' and w1[0] not in f:
                                f_i.append(w1[0])
                                temp.append(w1[0])



                # Rule 1.2
                for (w1, dep, w2) in parse.triples():
                    if(dep in dep_DP):
                        H = ''
                        O = ''
                        if w1[0] in o_expanded:
                            H = w2[0]
                            O = w1
                        elif w2[0] in o_expanded:
                            H = w1[0]
                            O = w2

                        if H:
                            for (w1, dep, w2) in list(parse.triples()):
                                if w1[0] == H and w2[0] != O[0]:
                                    if w2[1] == 'NN' and w2[0] not in f:
                                        f_i.append(w2[0])
                                        temp.append(w2[0])

                                elif w2[0] == H  and w1[0] != O[0]:
                                    if w1[1] == 'NN' and w1[0] not in f:
                                        f_i.append(w1[0])
                                        temp.append(w1[0])


                # Rule 4.1
                for (w1, dep, w2) in list(parse.triples()):
                    if(dep in conj_DP):
                        if w1[0] in o_expanded:
                            if w2[1] == 'JJ' and w2[0] not in o_expanded:
                                o_i.append(w2[0])

                        elif w2[0] in o_expanded:
                            if w1[1] == 'JJ' and w1[0] not in o_expanded:
                                o_i.append(w1[0])


                # Rule 4.2
                for (w1, dep, w2) in parse.triples():
                    if(dep in dep_DP or dep in conj_DP):
                        H = ''
                        O = ''
                        if w1[0] in o_expanded:
                            H = w2[0]
                            O = w1
                        elif w2[0] in o_expanded:
                            H = w1[0]
                            O = w2

                        if H:
                            for (w1, dep, w2) in list(parse.triples()):
                                if w1[0] == H and w2[0] != O[0]:
                                    if w2[1] == 'JJ' and w2[0] not in o_expanded:                  
                                        o_i.append(w2[0])
                                elif w2[0] == H  and w1[0] != O[0]:
                                    if w1[1] == 'JJ' and w1[0] not in o_expanded:
                                        o_i.append(w1[0])
                                        
                if other_clause and len(temp) == 0:
                    for (w1, dep, w2) in list(parse.triples()):
                        if w1[1] == 'NN':
                            
                    
                if flag_o and len(temp) == 0:
                    other_clause = True

                
            if flag_cycle == 0:
                r_p.append(r)
                a_array = []
                tes = []
                try:
                    tes = df['aspect'][index].split(', ')
                except:
                    tes = []
                for x in tes:
                    a_array.append(x.split('[')[0])
            
                a_p.append('|'.join(a_array))
                t_a_p.append('|'.join(temp))
            else:
                if len(temp) != 0:
                    t_a_p[index] += '|' + '|'.join(temp)
            index += 1
            

        #calculate target and opinion expanded
        f = f + f_i 
        o_expanded = o_expanded + o_i

        
        #reread review, and run rule 3.1, 3.2, 2.1, and 2.2
        index = 0
        f_ii = []
        o_ii = []
        
        for sent in reviews:
            sentences = []
            clauses = get_clauses(sent)
            if len(clauses) == 0:
                sentences.append(sent)
            else:
                sentences = clauses
            
            for r in sentences:
                temp = []
                
                if using_chunking:
                    r = chunking(r)
                else:
                    r = r

                try:
                    parse = next(parser.raw_parse(r))
                except:
                    continue
                
                #Rule 3.1
                for (w1, dep, w2) in list(parse.triples()):
                    if(dep in conj_DP):
                        if(w1[0] in f_i): 
                            if w2[1] == 'NN' and w2[0] not in f:
                                f_ii.append(w2[0])
                                temp.append(w2[0])
                        elif(w2[0] in f_i):          
                            if w1[1] == 'NN' and w1[0] not in f:
                                f_ii.append(w1[0])
                                temp.append(w1[0])


                # Rule 3.2
                for (w1, dep, w2) in parse.triples():
                    if(dep in dep_DP or dep in conj_DP):
                        H = ''
                        O = ''
                        if w1[0] in f_i:
                            H = w2[0]
                            O = w1
                        elif w2[0] in f_i:
                            H = w1[0]
                            O = w2

                        if H:
                            for (w1, dep, w2) in list(parse.triples()):
                                if w1[0] == H and w2[0] != O[0]:
                                    if w2[1] == 'NN' and w2[0] not in f:
                                        f_ii.append(w2[0])  
                                        temp.append(w2[0])
                                elif w2[0] == H  and w1[0] != O[0]:
                                    if w1[1] == 'NN' and w1[0] not in f:
                                        f_ii.append(w1[0])
                                        temp.append(w1[0])


                # Rule 2.1
                for (w1, dep, w2) in list(parse.triples()):
                    if(dep in dep_DP):
                        if w1[0] in f_i:
                            if w2[1] == 'JJ' and w2[0] not in o_expanded:
                                o_ii.append(w2[0])

                        elif w2[0] in f_i:
                            if w1[1] == 'JJ' and w1[0] not in o_expanded:
                                o_ii.append(w1[0])


                # Rule 2.2
                for (w1, dep, w2) in parse.triples():
                    if(dep in dep_DP):
                        H = ''
                        O = ''
                        if w1[0] in f_i:
                            H = w2[0]
                            O = w1
                        elif w2[0] in f_i:
                            H = w1[0]
                            O = w2

                        if H:
                            for (w1, dep, w2) in list(parse.triples()):
                                if w1[0] == H and w2[0] != O[0]:
                                    if w2[1] == 'JJ' and w2[0] not in o_expanded:                  
                                        o_ii.append(w2[0])
                                elif w2[0] == H  and w1[0] != O[0]:
                                    if w1[1] == 'JJ' and w1[0] not in o_expanded:
                                        o_ii.append(w1[0])
                if len(temp) != 0:
                    t_a_p[index] += '|' + '|'.join(temp)

            index += 1
            
        f_i = f_i + f_ii
        o_i = o_i + o_ii
        f = f + f_ii
        o_expanded = o_expanded + o_ii     
        
        flag_cycle = 1
        
        if(len(f_i) == 0 and len(o_i) == 0):
            if save_to_file == True:
                out = pd.DataFrame(r_p)
                out['aspect'] = a_p
                out['prediction'] = t_a_p
                out.to_csv('hasil_dp.csv')
            is_stop = True
        
    return f, o_expanded

In [56]:
dp_aspect, opinion_expand = double_propagation(op_set, df['review'][34:35], True, False, True)

<Response [200]>
[(('satisfied', 'JJ'), 'nsubj', ('i', 'FW')), (('satisfied', 'JJ'), 'cop', ('am', 'VBP')), (('satisfied', 'JJ'), 'advmod', ('very', 'RB'))] i am very satisfied
[(('say', 'VB'), 'nsubj', ('i', 'LS')), (('say', 'VB'), 'aux', ('have', 'VBP')), (('say', 'VB'), 'mark', ('to', 'TO'))] i have to say 
[(('bought', 'VBD'), 'nsubj', ('i', 'LS')), (('bought', 'VBD'), 'dobj', ('canon-g3', 'NN')), (('canon-g3', 'NN'), 'nmod:poss', ('my', 'PRP$')), (('bought', 'VBD'), 'advmod', ('ago', 'RB')), (('ago', 'RB'), 'nmod:npmod', ('month', 'NN')), (('month', 'NN'), 'advmod', ('about', 'IN')), (('month', 'NN'), 'det', ('a', 'DT'))] i bought my canon-g3 about a month ago
<Response [200]>


In [34]:
get_clauses('i bought my canon g3 about a month ago and i have to say i am very satisfied . ')

<Response [200]>


['i bought my canon g3 about a month ago',
 'i have to say ',
 'i am very satisfied']

In [42]:
dp_aspect

[]

In [13]:
def calculate_precision_recall(aspect, target):
    tp = 0
    """      
    
    for a in aspect: 
        if a in target:
             tp += 1
   
    """
    for t in target: 
        for a in aspect:
            zz = t.split(' ')
            vv =0
            for z in zz:
                if z in a:
                    vv+=1
            if len(zz) == vv:
                tp+=1
                break
    
    P = (tp * 1.0) / (len(aspect) * 1.0)
    R = (tp * 1.0) / (len(target) * 1.0)
    
    f1 = 2.0 * P * R / (P+R)
    
    return P, R, f1

In [14]:
import re
import math
import numpy as np
target = []

for t in df['aspect'][0:597]:
    if t is not np.nan:
        for s in t.split(', '):
            for x in s.split(','):
                jj = re.sub(r'\[[+|-]\d\]', '',x)
                jjs = re.sub(r'\[\w\]', '',jj)
                if(jjs):
                    target.append(jjs);
dp_aspect =  list(map(lambda aspect: ' '.join(aspect.split('-')), dp_aspect))               
candidate_aspect = list(map(lambda aspect: ' '.join(aspect.split('-')), candidate_aspect))


# candidate_aspect = list(map(lambda aspect: aspect), candidate_aspect))

In [17]:
calculate_precision_recall(list(dict.fromkeys(candidate_aspect)), list(dict.fromkeys(target)))

ZeroDivisionError: float division by zero