Read dataset

In [2]:
import sys, os
import re
import pandas as pd
import itertools, nltk, string 
nltk.download('wordnet')
#from transforms import flatten_deeptree

rx_dict = {
    'title': re.compile(r'\[t\](?P<title>.*)'),
    'review': re.compile(r'(?P<aspect>.*)##(?P<review>.*)')
}

def parse_data(file, data, reviews=[], aspects=[]):
    line = file.readline();
    if(line):
        match_title = rx_dict['title'].search(line);
        if match_title:
            data['title'].append(match_title.group('title'))
            data['domain'].append('canon g3')
            if(len(reviews) > 0 or len(aspects) > 0):
                data['review'].append("".join(reviews))
                data['aspect'].append(", ".join(aspects))
                reviews = []
        
        match_review = rx_dict['review'].search(line)
        if match_review:
            review_text = match_review.group('review');
            aspect_text = match_review.group('aspect');
            
            if(review_text):
                reviews.append(review_text);
            
            if(aspect_text):
                aspects.append(aspect_text);
                
        parse_data(file, data, reviews, aspects)
    else:
        if(len(reviews) > 0 or len(aspects) > 0):
            data['review'].append("".join(reviews))
            data['aspect'].append(", ".join(aspects))
    
data = {
    'title': [],
    'review': [],
    'aspect': [],
    'domain': []
}
    
def read_file():
    with open(os.path.join(os.path.abspath('dataset/bing_liu/') , 'Canon_G3.txt'), 'r') as file:
        parse_data(file, data);
        
positive_lexicon = []
negative_lexicon = []

def read_lexicon():
    global positive_lexicon;
    global negative_lexicon;
    
    with open(os.path.join(os.path.abspath('opinion-lexicon-English/') , 'positive-words.txt'), 'r') as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
         
        positive_lexicon = file.readlines()
    
    with open(os.path.join(os.path.abspath('opinion-lexicon-English/') , 'negative-words.txt'), 'r', encoding = "ISO-8859-1") as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
        
        negative_lexicon = file.readlines()
        
    positive_lexicon = list(map(lambda word: word.rstrip("\n\r"), positive_lexicon))
    negative_lexicon = list(map(lambda word: word.rstrip("\n\r"), negative_lexicon))
    
        
read_file()
read_lexicon()

[nltk_data] Downloading package wordnet to C:\Users\Farza
[nltk_data]     Nurifan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Print dataset with pandas

In [3]:
xdata = pd.DataFrame(data)
len(xdata)

45

In [4]:
import os
os.environ["CORENLP_HOME"] = r'C:\stanford-corenlp-full-2018-10-05'

import corenlp 
client = corenlp.CoreNLPClient()

def chunk_check(text, word):
    try:
        pattern = '{tag:/NN.*/} <compound {word:'+ word +';tag:/NN.*/}'
        matches = client.semgrex(text, pattern)
        res = matches['sentences']
        if len(res) == 1:
            if res[0]['length'] == 0:
                return word
        return res[0]['0']['text'] + ' ' + word
    except:
        return word

Loading Stanford CoreNLP

In [5]:
from pycorenlp import StanfordCoreNLP
import json

nlp = StanfordCoreNLP('http://localhost:9000')
dependency_parser = nlp.annotate

def entity_check(sentence):
    res = sentence
    result = dependency_parser(sentence, properties={"outputFormat": "json", "annotators": "openie"})
    if len(result['sentences'][0]['openie']) != 0:
        s = result['sentences'][0]['openie'][0]['subject']
        o = result['sentences'][0]['openie'][0]['object']
        res = res.replace(s, '!'.join(s.split(' ')))
        res = res.replace(o, '!'.join(o.split(' ')))
    return res

entity_check('i recently purchased the canon powershot g3 and am extremely satisfied with the purchase .')

'i recently purchased the canon!powershot!g3 and am extremely satisfied with the purchase .'

In [6]:
from nltk.parse.corenlp import CoreNLPDependencyParser

parser = CoreNLPDependencyParser()
parse = next(parser.raw_parse("my name is khan"))

In [7]:
list(parse.triples())

[(('khan', 'JJ'), 'nsubj', ('name', 'NN')),
 (('name', 'NN'), 'nmod:poss', ('my', 'PRP$')),
 (('khan', 'JJ'), 'cop', ('is', 'VBZ'))]

In [8]:
print(parse.to_conll(4))

my	PRP$	2	nmod:poss
name	NN	4	nsubj
is	VBZ	4	cop
khan	JJ	0	ROOT



In [9]:
xdata.iloc[0]['review']

"i recently purchased the canon powershot g3 and am extremely satisfied with the purchase . the camera is very easy to use , in fact on a recent trip this past week i was asked to take a picture of a vacationing elderly group . after i took their picture with their camera , they offered to take a picture of us . i just told them , press halfway , wait for the box to turn green and press the rest of the way . they fired away and the picture turned out quite nicely . ( as all of my pictures have thusfar ) . a few of my work constituants owned the g2 and highly recommended the canon for picture quality . i 'm easily enlarging pictures to 8 1/2 x 11 with no visable loss in picture quality and not even using the best possible setting as yet ( super fine ) . ensure you get a larger flash , 128 or 256 , some are selling with the larger flash , 32mb will do in a pinch but you 'll quickly want a larger flash card as with any of the 4mp cameras . bottom line , well made camera , easy to use , ve

In [10]:
def extract_candidate_chunks(text, grammar=r'''NP: {<NN.*><JJ>?<IN>?<PRP.*>?<NN.*>} 
                                            ...AP: {<JJ.*><.*>?<VB.*>+}'''):
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents=[]
    for sent in nltk.sent_tokenize(text):
        tagged_sents.append(nltk.pos_tag(nltk.word_tokenize(sent)))
    
    #print(tagged_sents)
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda chunk: chunk[2] != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]


In [58]:
import re
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import wordnet as wn
# from IPython.display import clear_outputfrom 
from nltk.stem import PorterStemmer
porter = PorterStemmer()
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def preprocessing(semua_kalimat):
    i=0
    kalimat_semua = []
    panjang = len(semua_kalimat) - 1
    for sentence in semua_kalimat:
        kalimat = []
        for word in nltk.word_tokenize(sentence):
            kata = wordnet_lemmatizer.lemmatize(word)
            if re.match(r'^[0-9]+$', kata) != None:
                kata = 'Num'
            kalimat.append(kata)
        kalimat_semua.append(' '.join(kalimat))
#         print(sentence)        
#         clear_output(wait=True)
#         print((i/panjang)*100,"%")
        i+=1
    return kalimat_semua

# input file
df = pd.read_csv("dataset.csv")
# preprocess
semkal = preprocessing(df['review'])
labels = df["target"]

In [59]:
# settings tf-idf
tfidf = TfidfVectorizer(sublinear_tf=False, analyzer='word', ngram_range=(1,2))

# tf-idf
features = tfidf.fit_transform(semkal).toarray()
features_name = tfidf.vocabulary_
print(len(features_name))

33503


In [60]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.30, random_state=42)

''' The important part '''
# ''' SVM classifier ''' 
# model
model_svm = LinearSVC()
# training
model_svm.fit(X_train, y_train)
# predict / testing
pred = model_svm.predict(X_test)

from sklearn.metrics import precision_score
from sklearn.metrics import classification_report

print(precision_score(y_test, pred, average='micro'))
print(classification_report(y_test, pred))

0.7364864864864865
              precision    recall  f1-score   support

           0       0.75      0.81      0.78       670
           1       0.72      0.64      0.68       514

   micro avg       0.74      0.74      0.74      1184
   macro avg       0.73      0.73      0.73      1184
weighted avg       0.74      0.74      0.73      1184



In [61]:
print(model_svm.predict([features[0]])[0])

0


In [62]:
semkal[0]

'i recently purchased the canon powershot g3 and am extremely satisfied with the purchase .'

In [63]:
len(df['review'])

3944

In [64]:
df['review'][596]

'rather heavy for point and shoot but a great camera for semi pros . '

## Double Propagation

### Rule 1.1 if a word A, whose POS is NN, is depended by an opinion word O through Dep, where Dep is one of the dependency relations amod, prep, nsubj, csubj, xsubj, dobj, and iobj, then A is an aspect.

### Rule 1.2 if an opinion word O and a word A, whose POS is NN, depend on a third word H through dependency relations Depi and Depj respectively, where Depi and Dep j are one of the relations amod, prep, nsubj, csubj, xsubj, dobj, and iobj, then A is an aspect.

### Rule 2.1 if a word O, whose POS is JJ (adjective), directly depends on an aspect A through dependency relation Dep, where Dep is one of the dependency rela- tions amod, prep, nsubj, csubj, xsubj, dobj, and iobj, then O is an opinion word.

### Rule 2.2 if a word O, whose POS is JJ, and an aspect A, directly depend on a third word H through relations Depi and Depj respectively, where Depi and Depj are one of the relations amod, prep, nsubj, csubj, xsubj, dobj, and iobj, then O is an opinion word.

### Rule 3.1 if a word Aj, whose POS is NN, directly depends on an aspect Ai through conj, then A j is an aspect

### Rule 3.2 if a word Aj, whose POS is NN, and an aspect Ai, directly depend on a third word H through the dependency relations De pi and De p j , where De pi and Depj are one of the relations in amod, prep, nsubj, csubj, xsubj, dobj, and conj, then A j is an aspect


### Rule 4.1 if a word Oj, whose POS is JJ, directly depends on an opinion word Oi through conj, then O j is an opinion word. 

### Rule 4.2 if a word Oj, whose POS is JJ, and an opinion word Oi, directly depend on a third word H through the dependance relations De pi and De p j , where Dep1 and Dep2 are one of the relations in amod, prep, nsubj, csubj, xsubj, dobj, and conj, then O j is an opinion word

In [65]:
# adding new dependencies for DP 'nmod' and'advmod'
dep_DP = ['amod', 'prep', 'nsubj', 'csubj', 'xsubj', 'dobj', 'iobj', 'advmod']
conj_DP = ['conj']

In [66]:

output = nlp.annotate('Pusheen and Smitha walked along the beach.', properties={
  'annotators': 'tokenize,ssplit,pos,depparse,parse',
  'outputFormat': 'json'
  })

In [67]:
from nltk import Tree
t = Tree.fromstring(output['sentences'][0]['parse'])

In [68]:
def chunking(text):
    chunking_noun = extract_candidate_chunks(text, r'NP: {<NN.*|JJ.*><.*>?<NN>}') 
    chunking_adj = extract_candidate_chunks(text, r'AP: {<JJ.*|RB.*><TO>?<VB.*>}') 
    #print(text)
  #  nlpParser.parser_annotators = 'tokenize,ssplit,pos,lemma,depparse'
    
    output = nlp.annotate(text, properties={
      'annotators': 'tokenize,ssplit,pos,depparse,parse',
      'outputFormat': 'json'
      })
    
    pp = Tree.fromstring(output['sentences'][0]['parse'])
   
    new_chunking = []
    for c in chunking_noun:
        for i in pp.subtrees(filter=lambda x: x.label() == 'NP'):
            lls = i.leaves()
            s = len(lls)
            match = 0
            for xx in lls:
                if xx in c:
                    match += 1

            if(match >= (s/2)):
                new_chunking.append(c)
        
                break

    for c in chunking_adj:
        #chunk_a.append(c)
        for i in pp.subtrees(filter=lambda x: x.label() == 'ADJP'):
            lls = i.leaves()
            s = len(lls)
            match = 0
            for xx in lls:
                if xx in c:
                    match += 1

            if(match >= (s/2)):
                new_chunking.append(c)
        
                break
        #new_chunking.append(c)

    for chunk in new_chunking:
        text = text.replace(chunk, '-'.join(chunk.split(' ')))
    
    return text

In [220]:
import pandas as pd
candidate_aspect = []
new_opinion = []
op_set = positive_lexicon + negative_lexicon

def double_propagation(O: op_set, reviews, using_chunking=True, using_objective_detection=False):
    o_expanded = O
    f = []
    is_stop = False
    flag_cycle = 0
    
    t_a_p = []
    a_p = []
    r_p = []
    
    while (not is_stop):
        f_i = []
        o_i = []
    
        index = 0
        for r in reviews:
            temp = []
            
            if using_objective_detection:
                if model_svm.predict([features[index]])[0] == 0:
                    index += 1
                    continue
            
            sentences = nltk.sent_tokenize(r)

            for sent in sentences:
                if using_chunking:
                    r = chunking(sent)
                else:
                    r = sent
                    
                parse = next(parser.raw_parse(r))

                #Rule 1.1
                for (w1, dep, w2) in list(parse.triples()):
                    if(dep in dep_DP):
                        # Rule 1.1
                        if(w1[0] in o_expanded):
                            if w2[1] == 'NN' and w2[0] not in f:
                                f_i.append(w2[0])
                                temp.append(w2[0])
            #                         candidate_aspect.append(chunk_check(r, w2[0]))
                        elif(w2[0] in o_expanded):          
                            if w1[1] == 'NN' and w1[0] not in f:
                                f_i.append(w1[0])
                                temp.append(w1[0])

            #                         candidate_aspect.append(chunk_check(r, w1[0]))


                # Rule 1.2
                for (w1, dep, w2) in parse.triples():
                    if(dep in dep_DP):
                        H = ''
                        O = ''
                        if w1[0] in o_expanded:
                            H = w2[0]
                            O = w1
                        elif w2[0] in o_expanded:
                            H = w1[0]
                            O = w2

                        if H:
                            for (w1, dep, w2) in list(parse.triples()):
                                if w1[0] == H and w2[0] != O[0]:
                                    if w2[1] == 'NN' and w2[0] not in f:
                                        f_i.append(w2[0])
                                        temp.append(w2[0])

            #                                 candidate_aspect.append(chunk_check(r, w2[0]))
                                elif w2[0] == H  and w1[0] != O[0]:
                                    if w1[1] == 'NN' and w1[0] not in f:
                                        f_i.append(w1[0])
                                        temp.append(w1[0])

            #                                 candidate_aspect.append(chunk_check(r, w1[0]))


                # Rule 4.1
                for (w1, dep, w2) in list(parse.triples()):
                    if(dep in conj_DP):
                        if w1[0] in o_expanded:
                            if w2[1] == 'JJ' and w2[0] not in o_expanded:
                                o_i.append(w2[0])

                        elif w2[0] in o_expanded:
                            if w1[1] == 'JJ' and w1[0] not in o_expanded:
                                o_i.append(w1[0])


                # Rule 4.2
                for (w1, dep, w2) in parse.triples():
                    if(dep in dep_DP or dep in conj_DP):
                        H = ''
                        O = ''
                        if w1[0] in o_expanded:
                            H = w2[0]
                            O = w1
                        elif w2[0] in o_expanded:
                            H = w1[0]
                            O = w2

                        if H:
                            for (w1, dep, w2) in list(parse.triples()):
                                if w1[0] == H and w2[0] != O[0]:
                                    if w2[1] == 'JJ' and w2[0] not in o_expanded:                  
                                        o_i.append(w2[0])
                                elif w2[0] == H  and w1[0] != O[0]:
                                    if w1[1] == 'JJ' and w1[0] not in o_expanded:
                                        o_i.append(w1[0])



            if flag_cycle == 0:
                r_p.append(r)
                a_array = []
                tes = []
                try:
                    tes = df['aspect'][index].split(', ')
                except:
                    tes = []
                for x in tes:
                    a_array.append(x.split('[')[0])
            
                a_p.append('|'.join(a_array))
                t_a_p.append('|'.join(temp))
            else:
                if len(temp) != 0:
                    t_a_p[index] += '|' + '|'.join(temp)
            index += 1
            

        #calculate target and opinion expanded
        f = f + f_i 
        o_expanded = o_expanded + o_i

        
        #reread review, and run rule 3.1, 3.2, 2.1, and 2.2
        index = 0
        f_ii = []
        o_ii = []
        for r in reviews:
            temp = []
            
            if using_objective_detection:
                if model_svm.predict([features[index]])[0] == 0:
                    index += 1
                    continue

            sentences = nltk.sent_tokenize(r)
            for sent in sentences:
                if using_chunking:
                    r = chunking(sent)
                else:
                    r = sent

                parse = next(parser.raw_parse(r))
                #Rule 3.1
                for (w1, dep, w2) in list(parse.triples()):
                    if(dep in conj_DP):
                        if(w1[0] in f_i): 
                            if w2[1] == 'NN' and w2[0] not in f:
                                f_ii.append(w2[0])
                                temp.append(w2[0])
            #                         candidate_aspect.append(chunk_check(r, w2[0]))
                        elif(w2[0] in f_i):          
                            if w1[1] == 'NN' and w1[0] not in f:
                                f_ii.append(w1[0])
                                temp.append(w1[0])
            #                         candidate_aspect.append(chunk_check(r, w1[0]))


                # Rule 3.2
                for (w1, dep, w2) in parse.triples():
                    if(dep in dep_DP or dep in conj_DP):
                        H = ''
                        O = ''
                        if w1[0] in f_i:
                            H = w2[0]
                            O = w1
                        elif w2[0] in f_i:
                            H = w1[0]
                            O = w2

                        if H:
                            for (w1, dep, w2) in list(parse.triples()):
                                if w1[0] == H and w2[0] != O[0]:
                                    if w2[1] == 'NN' and w2[0] not in f:
                                        f_ii.append(w2[0])  
                                        temp.append(w2[0])
            #                                 candidate_aspect.append(chunk_check(r, w2[0]))
                                elif w2[0] == H  and w1[0] != O[0]:
                                    if w1[1] == 'NN' and w1[0] not in f:
                                        f_ii.append(w1[0])
                                        temp.append(w1[0])
            #                                 candidate_aspect.append(chunk_check(r, w1[0]))


                # Rule 2.1
                for (w1, dep, w2) in list(parse.triples()):
                    if(dep in dep_DP):
                        if w1[0] in f_i:
                            if w2[1] == 'JJ' and w2[0] not in o_expanded:
                                o_ii.append(w2[0])

                        elif w2[0] in f_i:
                            if w1[1] == 'JJ' and w1[0] not in o_expanded:
                                o_ii.append(w1[0])


                # Rule 2.2
                for (w1, dep, w2) in parse.triples():
                    if(dep in dep_DP):
                        H = ''
                        O = ''
                        if w1[0] in f_i:
                            H = w2[0]
                            O = w1
                        elif w2[0] in f_i:
                            H = w1[0]
                            O = w2

                        if H:
                            for (w1, dep, w2) in list(parse.triples()):
                                if w1[0] == H and w2[0] != O[0]:
                                    if w2[1] == 'JJ' and w2[0] not in o_expanded:                  
                                        o_ii.append(w2[0])
                                elif w2[0] == H  and w1[0] != O[0]:
                                    if w1[1] == 'JJ' and w1[0] not in o_expanded:
                                        o_ii.append(w1[0])
            if len(temp) != 0:
                t_a_p[index] += '|' + '|'.join(temp)

            index += 1
            
        f_i = f_i + f_ii
        o_i = o_i + o_ii
        f = f + f_ii
        o_expanded = o_expanded + o_ii     
        
        flag_cycle = 1
        
        if(len(f_i) == 0 and len(o_i) == 0):
            out = pd.DataFrame(r_p)
            out['aspect'] = a_p
            out['prediction'] = t_a_p
            out.to_csv('hasil_dp.csv')
            is_stop = True
        
    return f, o_expanded

In [221]:
def calculate_frequency(aspects):
    aspect_frequency = {}
    
    for aspect in aspects:
        if(aspect in aspect_frequency):
            aspect_frequency[aspect] += 1
        else:
            aspect_frequency[aspect] = 1
            
    return aspect_frequency

In [222]:
def pruning_based_on_clause(aspect_frequency, reviews):
    pruning = []
    for review in reviews:
        parse = next(parser.raw_parse(review))
        for (w1, dep, w2) in list(parse.triples()):
            if(w1[0] in aspect_frequency and w2[0] in aspect_frequency):
                if(dep not in conj_DP):
                    if(aspect_frequency[w1[0]] > aspect_frequency[w2[0]]):
                        pruning.append(w2[0])
                    elif(aspect_frequency[w1[0]] < aspect_frequency[w2[0]]):
                        pruning.append(w1[0])
                
    return pruning

In [223]:
def pruning_based_other_products_and_dealers(aspect_frequency, reviews, window=3):
    pruning = []
    ProductINDI = ["compare to", "compare with", "better than", "worse than"]
    DealerINDI  = ["shop with", "buy from"]
    for review in reviews:
        if any(indication in review for indication in ProductINDI):
            tokens = nltk.word_tokenize(review)
            index = 0
            while index < len(tokens) - 1:
                if tokens[index] + " " + tokens[index + 1] in ProductINDI:
                    index += 2
                    for x in range(window):
                        next_window = index + x + 1;
                        if next_window < len(tokens) and tokens[next_window] in aspect_frequency:
                            pruning.append(tokens[next_window])
                else :
                    index += 1
                    
        if any(indication in review for indication in DealerINDI):
            tokens = nltk.word_tokenize(review)
            index = 0
            while index < len(tokens) - 1:
                if tokens[index] + " " + tokens[index + 1] in DealerINDI:
                    index += 2
                    for x in range(window):
                        next_window = index + x + 1;
                        if next_window < len(tokens) and tokens[next_window] in aspect_frequency:
                            pruning.append(tokens[next_window])
                else :
                    index += 1 
                        
    return pruning

In [224]:
def identify_target_phrase_global_pruning(review):
    
    pass

In [225]:
dp_aspect, opinion_expand = double_propagation(op_set, df['review'][0:597], False)

In [163]:
ap = calculate_frequency(dp_aspect)

In [164]:
pruning_clause = pruning_based_on_clause(ap,df['review'][0:597] );
print(pruning_clause)

['g3', 'powershot', 'trip', 'trip', 'picture', 'picture', 'card', 'use', 'use', 'job', 'worth', 'cent', 'flagship', 'powershot', 'flagship', 'series', 'powershot', 'megapixel', 'control', 'control', 'kind', 'type', 'cf', 'use', 'picture', 'awe', 'buy', 'g3', 'set', 'use', 'flaw', 'anyone', 'anyone', 'photo', 'programming', 'screen', '4mp', 'right', 'market', '4x', 'screen', 'move', 'focus', 'manual', 'focus', 'plastic', 'line', 'scoying', 'scoying', 'screen', '14x', 'software', 'computer', 'cap', 'worth', 'bargain', 'auto', 'right', 'photo', 'difference', 'range', 'powershot', 'auto', 'buy', 'battery', 'type', 'battery', 'type', 'g3', 'mb', 'cf', 'lever', 'lever', 'colorimetry', 'battery', 'company', 'g3', 'auto', 'shutter', 'slr', 'shutter', 'length', 'program', 'photo', 'manual', 'selection', 'shutter', 'access', 'love', 'cf', 'use', 'auto', 'shoot', 'night', 'drawback', 'market', 'picture', 'color', 'cover', 'tigt', 'work', 'wish', 'work', 'issue', 'seller', 'love', '4x', 'price', '

In [165]:
prun_dealer_product = pruning_based_other_products_and_dealers(ap, df['review'][0:597])
prun_dealer_product

['use', 'anything', 'camera']

In [166]:
def calculate_precision_recall(aspect, target):
    tp = 0
    """      
    
    for a in aspect: 
        if a in target:
             tp += 1
   
    """
    for t in target: 
        for a in aspect:
            zz = t.split(' ')
            vv =0
            for z in zz:
                if z in a:
                    vv+=1
            if len(zz) == vv:
                tp+=1
                break
    
    P = (tp * 1.0) / (len(aspect) * 1.0)
    R = (tp * 1.0) / (len(target) * 1.0)
#     print(P, len(aspect))
#     print(R, len(target))
    
    f1 = 2.0 * P * R / (P+R)
    
    return P, R, f1

In [167]:
import re
import math
import numpy as np
target = []

for t in df['aspect'][0:597]:
    if t is not np.nan:
        for s in t.split(', '):
            for x in s.split(','):
                jj = re.sub(r'\[[+|-]\d\]', '',x)
                jjs = re.sub(r'\[\w\]', '',jj)
                if(jjs):
                    target.append(jjs);
dp_aspect =  list(map(lambda aspect: ' '.join(aspect.split('-')), dp_aspect))               
candidate_aspect = list(map(lambda aspect: ' '.join(aspect.split('-')), candidate_aspect))


# candidate_aspect = list(map(lambda aspect: aspect), candidate_aspect))

In [168]:
#calculate_precision_recall(list(dict.fromkeys(candidate_aspect)), list(dict.fromkeys(target)))

In [169]:
calculate_precision_recall(list(dict.fromkeys(dp_aspect)), list(dict.fromkeys(target)))

(0.13846153846153847, 0.5094339622641509, 0.21774193548387097)

In [170]:
sorted_aspect_frequency = sorted(ap.items(), key=lambda kv: kv[1])

In [171]:
a_freq_more_1 = []
for x in sorted_aspect_frequency:
    if(x[1] > 1):
        a_freq_more_1.append(x[0])

In [172]:
calculate_precision_recall(a_freq_more_1, list(dict.fromkeys(target)))

(0.232, 0.27358490566037735, 0.2510822510822511)

In [173]:
print(df['review'][0:597])

0      i recently purchased the canon powershot g3 an...
1      the camera is very easy to use , in fact on a ...
2      after i took their picture with their camera ,...
3      i just told them , press halfway , wait for th...
4      they fired away and the picture turned out qui...
5      a few of my work constituants owned the g2 and...
6      i 'm easily enlarging pictures to 8 1/2 x 11 w...
7      ensure you get a larger flash , 128 or 256 , s...
8      bottom line , well made camera , easy to use ,...
9      i 'd highly recommend this camera for anyone w...
10                                    great job canon ! 
11                                                yep . 
12     this is my first digital camera , and what a '...
13     i am a software engineer and am very keen into...
14     just a little overview , powershot g3 is the f...
15     whether you are a novice or an expert , its ea...
16     + you can have different kind of lens if you w...
17     as its 4mp , you might n

In [174]:
opo = []

In [175]:
opo[1] = 'aka'

IndexError: list assignment index out of range

In [None]:
df['aspect'][0].split(', ')

In [282]:
import math

hasil = pd.read_csv("hasil_dp.csv", na_values=" kakaka")
target = hasil['prediction']
aspek = hasil['aspect']

true = 0
false = 0
count = 0
for x in range(0,596):
    tipe_aspek = type(aspek[x]) is str
    tipe_target = type(target[x]) is str

#     if tipe_aspek == tipe_target and tipe_aspek == True:
#         aspek_split = aspek[x].split('|')
#         for t in target[x].split('|'):
#             if t != '':
#                 if t in aspek_split:
#                     true += 1
#                     continue
    
    if target[x] == aspek[x]:
        true += 1
    elif tipe_aspek == tipe_target and tipe_aspek == False:
        true += 1
    else:
        false += 1
    count+=1
        
print(true, false, true+false, true/(true+false), count)

212 384 596 0.35570469798657717 596
