Read dataset

In [17]:
import sys, os
import re
import pandas as pd
import itertools, nltk, string 
#from transforms import flatten_deeptree

rx_dict = {
    'title': re.compile(r'\[t\](?P<title>.*)'),
    'review': re.compile(r'(?P<aspect>.*)##(?P<review>.*)')
}

def parse_data(file, data, reviews=[], aspects=[]):
    line = file.readline();
    if(line):
        match_title = rx_dict['title'].search(line);
        if match_title:
            data['title'].append(match_title.group('title'))
            data['domain'].append('canon g3')
            if(len(reviews) > 0 or len(aspects) > 0):
                data['review'].append("".join(reviews))
                data['aspect'].append(", ".join(aspects))
                reviews = []                
        
        match_review = rx_dict['review'].search(line)
        if match_review:
            review_text = match_review.group('review');
            aspect_text = match_review.group('aspect');
            
            if(review_text):
                reviews.append(review_text);
            
            if(aspect_text):
                aspects.append(aspect_text);
                
        parse_data(file, data, reviews, aspects)
    else:
        if(len(reviews) > 0 or len(aspects) > 0):
            data['review'].append("".join(reviews))
            data['aspect'].append(", ".join(aspects))
    
data = {
    'title': [],
    'review': [],
    'aspect': [],
    'domain': []
}
    
def read_file():
    with open(os.path.join(os.path.abspath('dataset/bing_liu/') , 'Apex AD2600 Progressive-scan DVD player.txt'), 'r') as file:
        parse_data(file, data);
        
positive_lexicon = []
negative_lexicon = []

def read_lexicon():
    global positive_lexicon;
    global negative_lexicon;
    
    with open(os.path.join(os.path.abspath('opinion-lexicon-English/') , 'positive-words.txt'), 'r') as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
         
        positive_lexicon = file.readlines()
    
    with open(os.path.join(os.path.abspath('opinion-lexicon-English/') , 'negative-words.txt'), 'r', encoding = "ISO-8859-1") as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
        
        negative_lexicon = file.readlines()
        
    positive_lexicon = list(map(lambda word: word.rstrip("\n\r"), positive_lexicon))
    negative_lexicon = list(map(lambda word: word.rstrip("\n\r"), negative_lexicon))
    
        
read_file()
read_lexicon()

Print dataset with pandas

In [18]:
xdata = pd.DataFrame(data)
xdata.head()

Unnamed: 0,aspect,domain,review,title
0,"i/p button[+2], dvd player[+2], dvd player[+1]...",canon g3,"repost from january 13 , 2004 with a better fi...",troubleshooting ad-2500 and ad-2600 no pictur...
1,"i/p button[+2], dvd player[+2], dvd player[+1]...",canon g3,i 've owned 6 or 7 dvd players since 1998 . th...,incredibe price / performance .
2,"i/p button[+2], dvd player[+2], dvd player[+1]...",canon g3,many of our disney movies do n't play on this ...,doesnt play new disney movies .
3,"i/p button[+2], dvd player[+2], dvd player[+1]...",canon g3,player has a problem with dual-layer dvd 's su...,poor quality - problem with dual-layer dvd 's .
4,"i/p button[+2], dvd player[+2], dvd player[+1]...",canon g3,"for the first few weeks , this player was ever...",has destroyed several of my dvds and cds .


Loading Stanford CoreNLP

In [19]:
from nltk.parse.corenlp import CoreNLPDependencyParser

parser = CoreNLPDependencyParser()
parse = next(parser.raw_parse("my name is khan"))

In [20]:
list(parse.triples())

[(('khan', 'JJ'), 'nsubj', ('name', 'NN')),
 (('name', 'NN'), 'nmod:poss', ('my', 'PRP$')),
 (('khan', 'JJ'), 'cop', ('is', 'VBZ'))]

In [21]:
print(parse.to_conll(4))

my	PRP$	2	nmod:poss
name	NN	4	nsubj
is	VBZ	4	cop
khan	JJ	0	ROOT



In [22]:
xdata.iloc[0]['review']

"repost from january 13 , 2004 with a better fit title . does your apex dvd player only play dvd audio without video ? or does it play audio and video but scrolling in black and white ? before you try to return the player or waste hours calling apex tech support , or run the player over with your car , try these simple troubleshooting ideas first . no picture : hopefully you still have the remote control . if you tossed it out the window , you need to fetch it . using the remote control , press the i/p button located on the bottom right corner of the remote . the i/p button switches the tv display between interlace and progressive . if this doesnt bring back the picture , try pressing this button without playing a dvd . if you dont get video back , now you can run the player over with your car ! picture scrolling in b/w : you need the remote control for this so you better get it from your dog before he burries it in the backyard . press the p/n button located on the bottom right corner

In [23]:
def extract_candidate_chunks(text, grammar=r'''NP: {<NN.*><JJ.*>?<PRP.*>?<NN>+} 
                                            ...AP: {<JJ.*><.*>?<VB.*>+}'''):
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents=[]
    for sent in nltk.sent_tokenize(text):
        tagged_sents.append(nltk.pos_tag(nltk.word_tokenize(sent)))
    
    #print(tagged_sents)
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda chunk: chunk[2] != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]

print(extract_candidate_chunks(xdata.iloc[0]['review'], r'''NP: {<NN.*><JJ.*>?<PRP.*>?<NN>+}'''))
print(extract_candidate_chunks(xdata.iloc[0]['review'], r'''AP: {<JJ.*><.*>?<VB.*>+}'''))

for c in extract_candidate_chunks(xdata.iloc[0]['review'], r'''NP: {<NN.*><JJ.*>?<PRP.*>?<NN>+}'''):
        parse = next(parser.raw_parse(c))
        print(list(parse.triples()))

['apex dvd player', 'dvd audio', 'apex tech support', 'tv display', 'output signal', 'apex dvd player', 'apex dvd player', 'wwhhhrrr sound']
['i/p button located', 'i/p button switches', 'better get', 'p/n button located', 'p/n button switches', 'able to get', 'other problems go', 'apex ad-2500 seemed', 'i love']
[(('player', 'NN'), 'compound', ('apex', 'NN')), (('player', 'NN'), 'compound', ('dvd', 'NN'))]
[(('audio', 'NN'), 'compound', ('dvd', 'NN'))]
[(('support', 'NN'), 'compound', ('apex', 'NN')), (('support', 'NN'), 'compound', ('tech', 'NN'))]
[(('display', 'NN'), 'compound', ('tv', 'NN'))]
[(('signal', 'NN'), 'compound', ('output', 'NN'))]
[(('player', 'NN'), 'compound', ('apex', 'NN')), (('player', 'NN'), 'compound', ('dvd', 'NN'))]
[(('player', 'NN'), 'compound', ('apex', 'NN')), (('player', 'NN'), 'compound', ('dvd', 'NN'))]
[(('sound', 'NN'), 'compound', ('wwhhhrrr', 'NN'))]


## Double Propagation

### Rule 1.1 if a word A, whose POS is NN, is depended by an opinion word O through Dep, where Dep is one of the dependency relations amod, prep, nsubj, csubj, xsubj, dobj, and iobj, then A is an aspect.

### Rule 1.2 if an opinion word O and a word A, whose POS is NN, depend on a third word H through dependency relations Depi and Depj respectively, where Depi and Dep j are one of the relations amod, prep, nsubj, csubj, xsubj, dobj, and iobj, then A is an aspect.

### Rule 3.1 if a word Aj, whose POS is NN, directly depends on an aspect Ai through conj, then A j is an aspect

### Rule 3.2 if a word Aj, whose POS is NN, and an aspect Ai, directly depend on a third word H through the dependency relations De pi and De p j , where De pi and Depj are one of the relations in amod, prep, nsubj, csubj, xsubj, dobj, and conj, then A j is an aspect


### Rule 4.1 if a word Oj, whose POS is JJ, directly depends on an opinion word Oi through conj, then O j is an opinion word. 

### Rule 4.2 if a word Oj, whose POS is JJ, and an opinion word Oi, directly depend on a third word H through the dependance relations De pi and De p j , where Dep1 and Dep2 are one of the relations in amod, prep, nsubj, csubj, xsubj, dobj, and conj, then O j is an opinion word

In [24]:
# adding new dependencies for DP 'nmod' and'advmod'
dep_DP = ['amod', 'prep', 'nsubj', 'csubj', 'xsubj', 'dobj', 'iobj', 'advmod']
conj_DP = ['conj']

In [25]:
candidate_aspect = []
new_opinion = []
op_set = positive_lexicon + negative_lexicon


for text in xdata['review']:
    chunking_noun = extract_candidate_chunks(text, r'NP: {<NN.*><JJ.*>?<PRP.*>?<NN>+}') 
    chunking_adj = extract_candidate_chunks(text, r'AP: {<JJ.*><.*>?<VB.*>+}') 
   
    new_chunking = []
    for c in chunking_noun:
        parse = next(parser.raw_parse(c))
        for (w1, dep, w2) in list(parse.triples()):
            if(dep == 'compound'):
                new_chunking.append(c)
    
    for c in chunking_adj:
        new_chunking.append(c)
    
    for chunk in new_chunking:
        text = text.replace(chunk, '-'.join(chunk.split(' ')))

    for r in nltk.sent_tokenize(text):
        parse = next(parser.raw_parse(r))
        #Rule 1.1
        for (w1, dep, w2) in list(parse.triples()):
            if(dep in dep_DP):
                # Rule 1.1
                if(w1[0] in op_set): 
                    if w2[1] == 'NN':
                        candidate_aspect.append(w2[0])
                elif(w2[0] in op_set):          
                    if w1[1] == 'NN':
                        candidate_aspect.append(w1[0])


        # Rule 1.2
        for (w1, dep, w2) in parse.triples():
            if(dep in dep_DP):
                H = ''
                O = ''
                if w1[0] in op_set:
                    H = w2[0]
                    O = w1
                elif w2[0] in op_set:
                    H = w1[0]
                    O = w2

                if H:
                    for (w1, dep, w2) in list(parse.triples()):
                        if w1[0] == H and w2[0] != O[0]:
                            if w2[1] == 'NN':                  
                                candidate_aspect.append(w2[0])
                        elif w2[0] == H  and w1[0] != O[0]:
                            if w1[1] == 'NN':
                                candidate_aspect.append(w1[0])


        # Rule 4.1
        for (w1, dep, w2) in list(parse.triples()):
            if(dep in conj_DP):
                if w1[0] in op_set:
                    if w2[1] == 'JJ':
                        new_opinion.append((w2, w1))
                        op_set.append(w2[0])

                elif w2[0] in op_set:
                    if w1[1] == 'JJ':
                        new_opinion.append((w1, w2))
                        op_set.append(w1[0])


        # Rule 4.2
        for (w1, dep, w2) in parse.triples():
            if(dep in dep_DP or dep in conj_DP):
                H = ''
                O = ''
                if w1[0] in op_set:
                    H = w2[0]
                    O = w1
                elif w2[0] in op_set:
                    H = w1[0]
                    O = w2

                if H:
                    for (w1, dep, w2) in list(parse.triples()):
                        if w1[0] == H and w2[0] != O[0]:
                            if w2[1] == 'JJ':                  
                                new_opinion.append((w2, w1))
                                op_set.append(w2[0])
                        elif w2[0] == H  and w1[0] != O[0]:
                            if w1[1] == 'JJ':
                                new_opinion.append((w2, w1))
                                op_set.append(w1[0])


    for r in nltk.sent_tokenize(text):
        parse = next(parser.raw_parse(r))
        #Rule 3.1
        for (w1, dep, w2) in list(parse.triples()):
            if(dep in conj_DP):
                if(w1[0] in candidate_aspect): 
                    if w2[1] == 'NN':
                        candidate_aspect.append(w2[0])
                elif(w2[0] in candidate_aspect):          
                    if w1[1] == 'NN':
                        candidate_aspect.append(w1[0])


        # Rule 3.2
        for (w1, dep, w2) in parse.triples():
            if(dep in dep_DP or dep in conj_DP):
                H = ''
                O = ''
                if w1[0] in candidate_aspect:
                    H = w2[0]
                    O = w1
                elif w2[0] in candidate_aspect:
                    H = w1[0]
                    O = w2

                if H:
                    for (w1, dep, w2) in list(parse.triples()):
                        if w1[0] == H and w2[0] != O[0]:
                            if w2[1] == 'NN':                  
                                candidate_aspect.append(w2[0])
                        elif w2[0] == H  and w1[0] != O[0]:
                            if w1[1] == 'NN':
                                candidate_aspect.append(w1[0])


        # Rule 4.1
        for (w1, dep, w2) in list(parse.triples()):
            if(dep in conj_DP):
                if w1[0] in op_set:
                    if w2[1] == 'JJ':
                        new_opinion.append((w2, w1))
                        op_set.append(w2[0])

                elif w2[0] in op_set:
                    if w1[1] == 'JJ':
                        new_opinion.append((w1, w2))
                        op_set.append(w1[0])


        # Rule 4.2
        for (w1, dep, w2) in parse.triples():
            if(dep in dep_DP or dep in conj_DP):
                H = ''
                O = ''
                if w1[0] in op_set:
                    H = w2[0]
                    O = w1
                elif w2[0] in op_set:
                    H = w1[0]
                    O = w2

                if H:
                    for (w1, dep, w2) in list(parse.triples()):
                        if w1[0] == H and w2[0] != O[0]:
                            if w2[1] == 'JJ':                  
                                new_opinion.append((w2, w1))
                                op_set.append(w2[0])
                        elif w2[0] == H  and w1[0] != O[0]:
                            if w1[1] == 'JJ':
                                new_opinion.append((w2, w1))
                                op_set.append(w1[0])

In [26]:
def calculate_precision_recall(aspect, target):
    tp = 0
    for a in aspect: 
        if a in target:
            tp += 1
            
    P = (tp * 1.0) / (len(aspect) * 1.0)
    R = (tp * 1.0) / (len(target) * 1.0)
    
    f1 = 2.0 * P * R / (P+R)
    
    return P, R, f1

In [27]:
import re

target = []
for t in xdata['aspect']:
    for s in t.split(', '):
        for x in s.split(','):
            jj = re.sub(r'\[[+|-]\d\]', '',x)
            jjs = re.sub(r'\[\w\]', '',jj)
            if(jjs):
                target.append(jjs);
candidate_aspect = list(map(lambda aspect: ' '.join(aspect.split('-')), candidate_aspect))

In [28]:
calculate_precision_recall(list(dict.fromkeys(candidate_aspect)), list(dict.fromkeys(target)))

(0.16723549488054607, 0.4224137931034483, 0.23960880195599024)

In [29]:
calculate_precision_recall(candidate_aspect, target)

(0.36657303370786515, 0.011832441744491794, 0.02292490118577075)