In [1]:
import sys, os
import re
import pandas as pd
import itertools, nltk, string 
from nltk.parse.corenlp import CoreNLPDependencyParser

# Shared dependency parser used by double_propagation() and
# pruning_based_on_clause() through `parser.raw_parse(...)`.
# NOTE(review): presumably this needs a running CoreNLP server reachable at
# NLTK's default URL — confirm before running the notebook.
parser = CoreNLPDependencyParser()

def read_file(file, encoding=None):
    """Load an annotated review file into a DataFrame.

    Only lines containing the ``##`` separator are kept. For such a line the
    text after the first ``##`` becomes the ``review`` column, and the text
    before the last ``##`` becomes the ``target`` column (empty string when
    nothing precedes the separator).

    Args:
        file: Path of the review file.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default, as before.

    Returns:
        pandas.DataFrame with string columns ``review`` and ``target``, one
        row per annotated line, in file order.
    """
    sentence_re = re.compile(r'(?<=##).+')
    aspect_re = re.compile(r'.+(?=##)')

    records = []
    # `with` guarantees the handle is closed even if a line fails to decode.
    with open(file, 'r', encoding=encoding) as handle:
        for line in handle:
            if '##' not in line:
                continue
            sentence_match = sentence_re.search(line)
            if sentence_match is None:
                # "aspect##" with no sentence text: nothing to analyse, and
                # indexing the empty findall() result used to raise here.
                continue
            aspect_match = aspect_re.search(line)
            records.append({
                'review': sentence_match.group(0),
                'target': aspect_match.group(0) if aspect_match else '',
            })

    # Build the frame once; DataFrame.append in a loop was quadratic and no
    # longer exists in pandas >= 2.0.
    return pd.DataFrame(records, columns=['review', 'target'])
        
# Module-level opinion word lists. They start empty and are rebound by
# read_lexicon() through its `global` declarations.
positive_lexicon = []
negative_lexicon = []

def _load_lexicon_words(path, encoding="ISO-8859-1"):
    """Return the words of one lexicon file, skipping ';' comments and blank lines."""
    words = []
    with open(path, 'r', encoding=encoding) as file:
        for line in file:
            word = line.strip()
            if not word or word.startswith(';'):
                continue
            words.append(word)
    return words


def read_lexicon(lexicon_dir='opinion-lexicon-English/'):
    """Load the positive and negative opinion lexicons.

    Reads ``positive-words.txt`` and ``negative-words.txt`` from
    ``lexicon_dir`` and rebinds the module-level ``positive_lexicon`` and
    ``negative_lexicon`` lists.

    Compared with the previous implementation:
      * header lines (starting with ';') and blank lines are filtered
        explicitly, so the first real word is no longer dropped when the
        header is not followed by a blank line, and no empty-string entries
        end up in the lexicon;
      * both files are decoded as ISO-8859-1 (previously only the negative
        file was, the positive one used the platform default).

    Args:
        lexicon_dir: Directory holding the two word lists. Defaults to the
            original hard-coded relative folder.

    Returns:
        Tuple ``(positive_lexicon, negative_lexicon)`` of word lists.
    """
    global positive_lexicon
    global negative_lexicon

    base = os.path.abspath(lexicon_dir)
    positive_lexicon = _load_lexicon_words(os.path.join(base, 'positive-words.txt'))
    negative_lexicon = _load_lexicon_words(os.path.join(base, 'negative-words.txt'))
    return positive_lexicon, negative_lexicon
        
# Fill positive_lexicon / negative_lexicon from the lexicon files on disk.
read_lexicon()
# Load the Nokia 6610 review file; read_file() returns a DataFrame with
# 'review' and 'target' columns.
df = read_file('dataset/bing_liu/Nokia 6610.txt')

import os
# Use the local CoreNLP install only as a fallback: an already-configured
# CORENLP_HOME (other machine, other CoreNLP version) is no longer overwritten
# by this machine-specific absolute path.
os.environ.setdefault("CORENLP_HOME", r'C:\stanford-corenlp-full-2018-10-05')

import corenlp 
# Client used by preprocess() for TokensRegex chunk matching.
client = corenlp.CoreNLPClient()

def preprocess(sentences, chunk = False, verbose = True):
    """Lowercase sentences and optionally hyphenate adjective+noun chunks.

    Args:
        sentences: Iterable of sentence strings.
        chunk: When true, every phrase matched by the TokensRegex pattern
            ``[{pos:JJ}][{pos:NN}]`` is rewritten inside the sentence with its
            spaces replaced by hyphens (e.g. "great phone" -> "great-phone").
        verbose: When true (default, matching the previous behaviour) each
            matched phrase is printed.

    Returns:
        List of processed strings, one per input sentence, in order.

    Notes:
        * Matching runs on the original sentence but replacement happens in
          the lowercased text, so the matched phrase is lowercased before
          replacing; previously capitalised phrases were never hyphenated.
        * Matches from every sentence in the response are applied, not only
          the first one.
        * Chunking stays best-effort: if the CoreNLP call or the response
          handling fails, the lowercased text (with whatever replacements
          already happened) is kept.
    """
#   pattern = '[{pos:NN}][{pos:NN}][{pos:NN}] | [{pos:NN}][{pos:NN}]'
    pattern = '[{pos:JJ}][{pos:NN}]'
    res = []
    for sentence in sentences:
        text = sentence.lower()
        if chunk:
            try:
                matches = client.tokensregex(sentence, pattern)
                for response in matches['sentences']:
                    for index in range(response['length']):
                        replacer = response[str(index)]['text']
                        if verbose:
                            print(replacer)
                        phrase = replacer.lower()
                        text = text.replace(phrase, '-'.join(phrase.split(' ')))
            except Exception:
                # Deliberate best-effort: keep the text as it is. Unlike a
                # bare `except:`, KeyboardInterrupt/SystemExit still propagate.
                pass
        res.append(text)
    return res

import requests
import re

In [2]:
# POS tags the rule_* functions accept for a candidate aspect word
# (checked as `w[1] in NN`).
NN = ['NN', 'NNS', 'NNP', 'NNPS']
# POS tags the rule_* functions accept for a candidate opinion word
# (checked as `w[1] in JJ`).
JJ = ['JJ', 'JJR', 'JJS']

def rule_1_1(parse, f, o_expanded):
    """Rule 1.1: find a new aspect directly related to a known opinion word.

    Walks the triples of ``parse`` whose relation is listed in the global
    ``dep_DP``. When one side's word is in ``o_expanded``, the other side is
    accepted if its tag is in ``NN`` and its word is not already in ``f``.

    Args:
        parse: Object exposing ``triples()`` yielding ``(w1, dep, w2)`` where
            ``w1``/``w2`` are indexable as ``[0]`` = word, ``[1]`` = tag.
        f: Aspect words to exclude.
        o_expanded: Known opinion words.

    Returns:
        ``(True, aspect_word, opinion_word)`` for the first hit, otherwise
        ``(False, None, None)``.
    """
    for first, relation, second in list(parse.triples()):
        if relation not in dep_DP:
            continue
        # Decide which side carries the opinion word; the first side wins.
        if first[0] in o_expanded:
            opinion, candidate = first, second
        elif second[0] in o_expanded:
            opinion, candidate = second, first
        else:
            continue
        if candidate[1] in NN and candidate[0] not in f:
            return True, candidate[0], opinion[0]
    return False, None, None

def rule_1_2(parse, f, o_expanded):
    """Rule 1.2: find a new aspect linked to an opinion word through a shared word.

    For each triple whose relation is in the global ``dep_DP`` and where one
    side's word is in ``o_expanded``, the word on the other side becomes the
    pivot. All triples are then rescanned (any relation): a word attached to
    the pivot, different from the opinion word, is returned if its tag is in
    ``NN`` and it is not in ``f``.

    Args:
        parse: Object exposing ``triples()`` yielding ``(w1, dep, w2)``.
        f: Aspect words to exclude.
        o_expanded: Known opinion words.

    Returns:
        ``(True, aspect_word, opinion_word)`` for the first hit, otherwise
        ``(False, None, None)``.
    """
    for first, relation, second in parse.triples():
        if relation not in dep_DP:
            continue
        if first[0] in o_expanded:
            opinion, pivot = first, second[0]
        elif second[0] in o_expanded:
            opinion, pivot = second, first[0]
        else:
            continue
        if not pivot:
            continue

        for left, _, right in list(parse.triples()):
            # Pick the word on the far side of the pivot, if any.
            if left[0] == pivot and right[0] != opinion[0]:
                candidate = right
            elif right[0] == pivot and left[0] != opinion[0]:
                candidate = left
            else:
                continue
            if candidate[1] in NN and candidate[0] not in f:
                return True, candidate[0], opinion[0]
    return False, None, None

def rule_4_1(parse, f, o_expanded):
    """Rule 4.1: find a new opinion word conjoined with a known opinion word.

    Looks at triples whose relation is in the global ``conj_DP``. When one
    side's word is in ``o_expanded``, the other side is returned if its tag
    is in ``JJ`` and its word is not already in ``o_expanded``.

    Args:
        parse: Object exposing ``triples()`` yielding ``(w1, dep, w2)``.
        f: Unused; kept so all rule functions share a call shape.
        o_expanded: Known opinion words.

    Returns:
        ``(True, opinion_word)`` for the first hit, otherwise ``(False, None)``.
    """
    for first, relation, second in list(parse.triples()):
        if relation not in conj_DP:
            continue
        if first[0] in o_expanded:
            candidate = second
        elif second[0] in o_expanded:
            candidate = first
        else:
            continue
        if candidate[1] in JJ and candidate[0] not in o_expanded:
            return True, candidate[0]
    return False, None

def rule_4_2(parse, f, o_expanded):
    """Rule 4.2: find a new opinion word sharing a word with a known opinion word.

    For each triple whose relation is in the global ``dep_DP`` or
    ``conj_DP`` and where one side's word is in ``o_expanded``, the word on
    the other side becomes the pivot. All triples are rescanned (any
    relation): a word attached to the pivot, different from the known
    opinion word, is returned if its tag is in ``JJ`` and it is not already
    in ``o_expanded``.

    Args:
        parse: Object exposing ``triples()`` yielding ``(w1, dep, w2)``.
        f: Unused; kept so all rule functions share a call shape.
        o_expanded: Known opinion words.

    Returns:
        ``(True, opinion_word)`` for the first hit, otherwise ``(False, None)``.
    """
    for first, relation, second in parse.triples():
        if relation not in dep_DP and relation not in conj_DP:
            continue
        if first[0] in o_expanded:
            known, pivot = first, second[0]
        elif second[0] in o_expanded:
            known, pivot = second, first[0]
        else:
            continue
        if not pivot:
            continue

        for left, _, right in list(parse.triples()):
            if left[0] == pivot and right[0] != known[0]:
                candidate = right
            elif right[0] == pivot and left[0] != known[0]:
                candidate = left
            else:
                continue
            if candidate[1] in JJ and candidate[0] not in o_expanded:
                return True, candidate[0]

    return False, None

def rule_3_1(parse, f):
    """Rule 3.1: find a new aspect conjoined with a known aspect.

    Looks at triples whose relation is in the global ``conj_DP``. When one
    side's word is in ``f``, the other side is returned if its tag is in
    ``NN`` and its word is not already in ``f``.

    Args:
        parse: Object exposing ``triples()`` yielding ``(w1, dep, w2)``.
        f: Known aspect words.

    Returns:
        ``(True, new_aspect, known_aspect)`` for the first hit, otherwise
        ``(False, None, None)``.
    """
    for first, relation, second in list(parse.triples()):
        if relation not in conj_DP:
            continue
        if first[0] in f:
            known, candidate = first, second
        elif second[0] in f:
            known, candidate = second, first
        else:
            continue
        if candidate[1] in NN and candidate[0] not in f:
            return True, candidate[0], known[0]

    return False, None, None

def rule_3_2(parse, f):
    """Rule 3.2: find a new aspect sharing a word with a known aspect.

    For each triple whose relation is in the global ``dep_DP`` or
    ``conj_DP`` and where one side's word is in ``f``, the word on the other
    side becomes the pivot. All triples are rescanned (any relation): a word
    attached to the pivot, different from the known aspect, is returned if
    its tag is in ``NN`` and it is not already in ``f``.

    Args:
        parse: Object exposing ``triples()`` yielding ``(w1, dep, w2)``.
        f: Known aspect words.

    Returns:
        ``(True, new_aspect, known_aspect)`` for the first hit, otherwise
        ``(False, None, None)``.
    """
    for first, relation, second in parse.triples():
        if relation not in dep_DP and relation not in conj_DP:
            continue
        if first[0] in f:
            known, pivot = first, second[0]
        elif second[0] in f:
            known, pivot = second, first[0]
        else:
            continue
        if not pivot:
            continue

        for left, _, right in list(parse.triples()):
            if left[0] == pivot and right[0] != known[0]:
                candidate = right
            elif right[0] == pivot and left[0] != known[0]:
                candidate = left
            else:
                continue
            if candidate[1] in NN and candidate[0] not in f:
                return True, candidate[0], known[0]

    return False, None, None

def rule_2_1(parse, f, o_expanded):
    """Rule 2.1: find a new opinion word directly related to a known aspect.

    Looks at triples whose relation is in the global ``dep_DP``. When one
    side's word is in ``f``, the other side is returned if its tag is in
    ``JJ`` and its word is not already in ``o_expanded``.

    Args:
        parse: Object exposing ``triples()`` yielding ``(w1, dep, w2)``.
        f: Known aspect words.
        o_expanded: Known opinion words.

    Returns:
        ``(True, opinion_word)`` for the first hit, otherwise ``(False, None)``.
    """
    for first, relation, second in list(parse.triples()):
        if relation not in dep_DP:
            continue
        if first[0] in f:
            candidate = second
        elif second[0] in f:
            candidate = first
        else:
            continue
        if candidate[1] in JJ and candidate[0] not in o_expanded:
            return True, candidate[0]

    return False, None

def rule_2_2(parse, f, o_expanded):
    """Rule 2.2: find a new opinion word linked to a known aspect through a shared word.

    For each triple whose relation is in the global ``dep_DP`` and where one
    side's word is in ``f``, the word on the other side becomes the pivot.
    All triples are rescanned (any relation): a word attached to the pivot,
    different from the known aspect, is returned if its tag is in ``JJ`` and
    it is not already in ``o_expanded``.

    Args:
        parse: Object exposing ``triples()`` yielding ``(w1, dep, w2)``.
        f: Known aspect words.
        o_expanded: Known opinion words.

    Returns:
        ``(True, opinion_word)`` for the first hit, otherwise ``(False, None)``.
    """
    for first, relation, second in parse.triples():
        if relation not in dep_DP:
            continue
        if first[0] in f:
            aspect, pivot = first, second[0]
        elif second[0] in f:
            aspect, pivot = second, first[0]
        else:
            continue
        if not pivot:
            continue

        for left, _, right in list(parse.triples()):
            if left[0] == pivot and right[0] != aspect[0]:
                candidate = right
            elif right[0] == pivot and left[0] != aspect[0]:
                candidate = left
            else:
                continue
            if candidate[1] in JJ and candidate[0] not in o_expanded:
                return True, candidate[0]

    return False, None

In [3]:
import pandas as pd
# NOTE(review): candidate_aspect and new_opinion are not referenced by any
# other visible cell — possibly leftovers; confirm before removing.
candidate_aspect = []
new_opinion = []
# Seed opinion lexicon: positive words followed by negative words. The two
# lists are only populated once read_lexicon() has run.
op_set = positive_lexicon + negative_lexicon

def _parse_or_none(sentence):
    """Return the first dependency graph for `sentence`, or None when parsing fails."""
    try:
        return next(parser.raw_parse(sentence))
    except Exception:
        return None


def _format_gold_targets(targets, index):
    """Turn the annotation at `targets[index]` into 'aspect!sign|aspect!sign'.

    The annotation is split on ', '; each item is split on '[' and reduced to
    the text before the bracket plus '!' and the first character after it.
    A missing or non-string annotation yields ''.
    """
    try:
        items = targets[index].split(', ')
    except Exception:
        return ''
    formatted = []
    for item in items:
        pieces = item.split('[')
        if len(pieces) > 1:
            # [:1] instead of [0] so a dangling "aspect[" cannot raise.
            formatted.append(pieces[0] + '!' + pieces[1][:1])
        else:
            formatted.append(pieces[0])
    return '|'.join(formatted)


def double_propagation(O, reviews, targets, output_file, save_to_file = False):
    """Expand aspect and opinion word sets by double propagation.

    Starting from the seed opinion words ``O``, the sentences are scanned
    repeatedly. In every cycle a first pass applies rules 1.1/1.2 (new
    aspects from opinion words) and 4.1/4.2 (new opinion words from opinion
    words); a second pass applies rules 3.1/3.2 (new aspects from aspects)
    and 2.1/2.2 (new opinion words from aspects). The loop stops after the
    first cycle that finds nothing new.

    Args:
        O: List of seed opinion words. It is not mutated.
        reviews: Iterable of sentence strings.
        targets: Gold annotations, indexable by sentence position; each entry
            is expected to look like ``'aspect[+2], other[-1]'``.
        output_file: CSV path used when ``save_to_file`` is true.
        save_to_file: When true, write one row per successfully parsed
            sentence (sentence, formatted gold ``aspect``, ``prediction`` as
            ``'aspect!source'`` items joined by ``'|'``) once the loop stops.

    Returns:
        Tuple ``(f, o_expanded)``: the list of extracted aspect words (may
        contain duplicates) and the expanded opinion word list.

    Changes from the previous implementation:
        * ``O`` is no longer annotated with the ``op_set`` list, which was
          evaluated at definition time and is not a type.
        * Sentences, gold targets and predictions are keyed by sentence
          position. Previously a parse failure skipped ``index += 1``, so
          every later sentence was paired with the wrong ``targets`` entry.
        * Each sentence is parsed once and the graph reused, instead of being
          re-parsed twice in every cycle.
        * Per-sentence results are kept as lists and joined on output, so
          predictions no longer start with a stray ``'|'``.
    """
    sentences = list(reviews)
    parses = [_parse_or_none(sentence) for sentence in sentences]

    o_expanded = O
    f = []

    # Per-sentence accumulators, indexed like `sentences`.
    aspects_found = [[] for _ in sentences]
    pairs_found = [[] for _ in sentences]

    first_cycle = True
    while True:
        # ---- Pass 1: rules 1.x (aspects) and 4.x (opinion words) ----
        f_i = []
        o_i = []
        for index, parse in enumerate(parses):
            if parse is None:
                continue
            # Snapshot taken before this pass adds anything for the sentence.
            # NOTE(review): as before, the first cycle always starts from an
            # empty per-sentence aspect list (in both passes).
            aspect_sentence = [] if first_cycle else list(aspects_found[index])

            for rule in (rule_1_1, rule_1_2):
                is_true, t_a, o = rule(parse, aspect_sentence, o_expanded)
                if is_true:
                    f_i.append(t_a)
                    aspects_found[index].append(t_a)
                    pairs_found[index].append(t_a + '!' + o)

            for rule in (rule_4_1, rule_4_2):
                is_true, t_o = rule(parse, f, o_expanded)
                if is_true:
                    o_i.append(t_o)

        # New lists are built (not extended in place) so `O` stays untouched.
        f = f + f_i
        o_expanded = o_expanded + o_i

        # ---- Pass 2: rules 3.x (aspects) and 2.x (opinion words) ----
        f_ii = []
        o_ii = []
        for index, parse in enumerate(parses):
            if parse is None:
                continue
            aspect_sentence = [] if first_cycle else list(aspects_found[index])

            for rule in (rule_3_1, rule_3_2):
                is_true, t_a, o = rule(parse, aspect_sentence)
                if is_true:
                    f_ii.append(t_a)
                    aspects_found[index].append(t_a)
                    pairs_found[index].append(t_a + '!' + o)

            for rule in (rule_2_1, rule_2_2):
                is_true, t_o = rule(parse, f, o_expanded)
                if is_true:
                    o_ii.append(t_o)

        f = f + f_ii
        o_expanded = o_expanded + o_ii
        first_cycle = False

        # Fixed point: neither pass produced anything new in this cycle.
        if not (f_i or f_ii or o_i or o_ii):
            if save_to_file:
                kept = [i for i, parse in enumerate(parses) if parse is not None]
                out = pd.DataFrame([sentences[i] for i in kept])
                out['aspect'] = [_format_gold_targets(targets, i) for i in kept]
                out['prediction'] = ['|'.join(pairs_found[i]) for i in kept]
                out.to_csv(output_file)
            break

    return f, o_expanded

In [4]:
def calculate_frequency(aspects):
    """Count how many times each aspect occurs.

    Args:
        aspects: Iterable of hashable aspect values.

    Returns:
        dict mapping each distinct aspect to its number of occurrences, with
        keys in order of first appearance.
    """
    counts = {}
    for item in aspects:
        counts[item] = counts.get(item, 0) + 1
    return counts

# Relations under which two aspects found in the same parse are both kept.
k_DP = ['conj', 'compound']
def pruning_based_on_clause(aspect_frequency, reviews, predictions):
    """Collect the less frequent of two known aspects related in one parse.

    For every review whose prediction entry is a string, the review is parsed
    and each triple whose two words are both keys of ``aspect_frequency`` and
    whose relation is not in ``k_DP`` is inspected: the word with the strictly
    lower frequency is collected, provided it is one of the review's
    ``'|'``-separated predictions. Ties collect nothing.

    Args:
        aspect_frequency: Mapping aspect word -> count.
        reviews: Iterable of review sentences.
        predictions: Indexable by review position; non-string entries make
            the review be skipped without parsing it.

    Returns:
        List of aspect words to prune (may contain duplicates).
    """
    pruned = []
    for position, review in enumerate(reviews):
        raw_prediction = predictions[position]
        if not isinstance(raw_prediction, str):
            continue
        predicted = raw_prediction.split('|')

        graph = next(parser.raw_parse(review))
        for left, relation, right in list(graph.triples()):
            word_a, word_b = left[0], right[0]
            if word_a not in aspect_frequency or word_b not in aspect_frequency:
                continue
            if relation in k_DP:
                continue
            if aspect_frequency[word_a] > aspect_frequency[word_b]:
                weaker = word_b
            elif aspect_frequency[word_a] < aspect_frequency[word_b]:
                weaker = word_a
            else:
                continue
            if weaker in predicted:
                pruned.append(weaker)

    return pruned
def pruning_based_other_products_and_dealers(aspect_frequency, reviews,predictions, window=3):
    """Collect predicted aspects that follow a product-comparison or dealer phrase.

    For every review whose prediction entry is a string, the review is first
    checked (substring test) for product indicators and then for dealer
    indicators. For each group present, the review is tokenised and scanned
    for two adjacent tokens forming an indicator; after such a pair, up to
    ``window`` token positions are probed and any token that is both a key of
    ``aspect_frequency`` and one of the review's ``'|'``-separated
    predictions is collected.

    NOTE(review): probing starts one position past the token that directly
    follows the indicator, so that token is never examined — confirm this is
    intended.

    Args:
        aspect_frequency: Mapping (or container) of known aspect words.
        reviews: Iterable of review sentences.
        predictions: Indexable by review position; non-string entries make
            the review be skipped.
        window: Number of token positions probed after an indicator.

    Returns:
        List of aspect words to prune (may contain duplicates); for each
        review, product-indicator hits come before dealer-indicator hits.
    """
    ProductINDI = ["compare to", "compare with", "better than", "worse than"]
    DealerINDI  = ["shop with", "buy from"]

    def collect_after_indicators(review, predicted, indicators):
        # One scan of the token stream for a single indicator group.
        hits = []
        tokens = nltk.word_tokenize(review)
        cursor = 0
        while cursor < len(tokens) - 1:
            if tokens[cursor] + " " + tokens[cursor + 1] in indicators:
                cursor += 2
                for offset in range(window):
                    probe = cursor + offset + 1
                    if probe < len(tokens) and tokens[probe] in aspect_frequency:
                        if tokens[probe] in predicted:
                            hits.append(tokens[probe])
            else:
                cursor += 1
        return hits

    pruning = []
    for position, review in enumerate(reviews):
        raw_prediction = predictions[position]
        if not isinstance(raw_prediction, str):
            continue
        predicted = raw_prediction.split('|')

        for indicators in (ProductINDI, DealerINDI):
            if any(indication in review for indication in indicators):
                pruning.extend(collect_after_indicators(review, predicted, indicators))

    return pruning

In [5]:
import pandas as pd
import re
import math
import numpy as np

def run(filename, save_file, chunking, path, save_to_file = False):
    """Run double propagation on one review file.

    Args:
        filename: Path of the annotated review file passed to ``read_file``.
        save_file: Output CSV file name.
        chunking: Forwarded to ``preprocess`` as its ``chunk`` flag.
        path: Output directory prefix; the CSV path is ``path + save_file``.
        save_to_file: Forwarded to ``double_propagation``. Defaults to
            ``False``, the value that used to be hard-coded, which made
            ``save_file`` and ``path`` impossible to use.

    Returns:
        Tuple ``(dp_aspect, opinion_expand)`` as returned by
        ``double_propagation``. Previously the result was computed and then
        discarded.
    """
    df = read_file(filename)
    sentences = preprocess(df['review'], chunking)

    dp_aspect, opinion_expand = double_propagation(op_set, sentences,
                                                   df['target'], path + save_file,
                                                   save_to_file)
    return dp_aspect, opinion_expand

In [6]:
import pandas as pd

# Review files to process; each name is expanded to
# 'dataset/bing_liu/<name>.txt' in the loop below. Only the uncommented
# entries are run.
filename = [
#             'Apex AD2600 Progressive-scan DVD player',
#             'Canon G3',
#             'Creative Labs Nomad Jukebox Zen Xtra 40GB',
#             'Nikon coolpix 4300',
            'Nokia 6610'
]

# Dependency relation names read as globals by the rule_* functions.
# They are defined in this late cell, so this cell must run before any
# rule_* function is called (run() below is the first caller).
conj_DP = ['conj']
dep_DP = ['amod', 'prep', 'nsubj', 'csubj', 'xsubj', 'dobj', 'iobj']

# Process every selected file with chunking enabled; the lines printed after
# each file name come from preprocess().
for file in filename:
    print(file)
#     run('dataset/bing_liu/' + file + '.txt', file + '.csv', False, 'hasil/')
    run('dataset/bing_liu/' + file + '.txt', file + '.csv', True, 'hasil_chunk/')


# dep_DP = ['amod', 'prep', 'nsubj', 'csubj', 'xsubj', 'dobj', 'iobj',
#           'advmod', 'dep', 'cop', 'mark', 'nsubjpass', 'pobj', 'acomp', 'xcomp', 'csubjpass', 'poss']
# for file in filename:
#     print(file)
#     run('dataset/bing_liu/' + file + '.txt', file + '.csv', False, 'hasil_plus/')
#     run('dataset/bing_liu/' + file + '.txt', file + '.csv', True, 'hasil_chunk_plus/')

Nokia 6610
mobile service
great phone
double check
recent road
northern kentucky
perfect signal
same route
only feature
previous nokia
old phone
t-mobile service
horrible customer
brief synopsis
personal information
good sound
little memory
full screen
excellent hearing
other person
amazing battery
exceptional service
annoying habit
same interaction
good csr
clear line
expensive sanyo
prepaid telephone
apparent indestructibility
other color
personal touch
nifty phone
brisk jog
flip phone
frequent signal
great care
teeny phone
big mechanism
favorite phone
great battery
perfect size
nice color
major minus
good server
nice phone
broken headphone
much help
t-mobile store
unchangeable email
online tech
same problem
half hour
sound quality
excellent fm
poor visibility
superb reception
big deal
useful feature
other end
decent reception
normal radio
beneficial feature
only problem
small glitch
full bar
international phone
good chance
rare occasion
t-mobile store
european phone
european company