In [3]:
import nltk
from nltk.corpus import sentiwordnet as swn
from pycorenlp import StanfordCoreNLP
from pywsd.lesk import adapted_lesk
from pywsd.lesk import simple_lesk
from pywsd.lesk import cosine_lesk
from nltk.corpus import wordnet as wn
import requests
import sys, os
import numpy as np
import pandas as pd

Warming up PyWSD (takes ~10 secs)... took 3.465684652328491 secs.


In [4]:
nlp = StanfordCoreNLP('http://localhost:9000')
dependency_parser = nlp.annotate

In [5]:
breakdown = swn.senti_synset('high.n.01')
print(breakdown)

<high.n.01: PosScore=0.125 NegScore=0.0>


In [6]:
wn.synsets('love')

[Synset('love.n.01'),
 Synset('love.n.02'),
 Synset('beloved.n.01'),
 Synset('love.n.04'),
 Synset('love.n.05'),
 Synset('sexual_love.n.02'),
 Synset('love.v.01'),
 Synset('love.v.02'),
 Synset('love.v.03'),
 Synset('sleep_together.v.01')]

In [7]:
positive_lexicon = []
negative_lexicon = []

def read_lexicon():
    global positive_lexicon;
    global negative_lexicon;
    
    with open(os.path.join(os.path.abspath('../opinion-lexicon-English/') , 'positive-words.txt'), 'r') as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
         
        positive_lexicon = file.readlines()
    
    with open(os.path.join(os.path.abspath('../opinion-lexicon-English/') , 'negative-words.txt'), 'r', encoding = "ISO-8859-1") as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
        
        negative_lexicon = file.readlines()
        
    positive_lexicon = list(map(lambda word: word.rstrip("\n\r"), positive_lexicon))
    negative_lexicon = list(map(lambda word: word.rstrip("\n\r"), negative_lexicon))
    
        
read_lexicon()
op_set = positive_lexicon + negative_lexicon

In [8]:
def pos_tag(sentence):
    result = dependency_parser(sentence, properties={"outputFormat": "json", "annotators": "pos"})['sentences'][0]['tokens']
    res = []
    for pos in result:
        res.append((pos['word'], pos['pos']))
    return res
    
# def pos_tag(sentence):
#     '''url = "http://localhost:9000"
#     request_params = {"annotators": "pos"}
#     r = requests.post(url, data=sentence, params=request_params, timeout=120)
#     try:
#         results = r.json()['sentences'][0]['tokens']
#         res = []
#         for pos in results:
#             res.append((pos['word'], pos['pos']))
#         return res
#     except Exception as e:
#         print(e)
#         return []
#     '''
#     return nltk.pos_tag(nltk.word_tokenize(sentence))

In [9]:
negation = [
    "afraid",
    "can't",
    "cannot",
    "deny",
    "mean",
    "negate",
    "negation",
    "negative",
    "neither",
    "never",
    "no",
    "non",
    "none",
    "nor",
    "not",
    "nothing",
    "refusal",
    "refuse",
    "reject",
    "rejection",
#     "don't",
#     "shouldn't",
#     "wouldn't",
#     "didn't",
#     "though",
#     "although",
#     "wasn't",
#     "isn't",
#     "but"
]

from nltk.corpus import sentiwordnet as swn
import string

def predict_lexicon(opinions, sent):
    endscore = 0
    for opinion in opinions:
        if opinion in positive_lexicon:
            endscore += 1
        elif opinion in negative_lexicon:
            endscore -= 1

        words = sent.split(' ')
        word_around = []
        for x in range(0, len(words)):
            if words[x] in string.punctuation:
                continue
            try:
                if (words[x+1] == opinion) or (words[x+2] == opinion) or (words[x+3] == opinion) or (words[x+4] == opinion):
                    word_around.append(words[x])
                elif (words[x-1] == opinion):
                    word_around.append(words[x])
            except:
                pass
        for neg in negation:
            if neg in word_around:
                endscore *= (-1)
                break

    if endscore > 0:
        polarity = 'positive'
    else:
        polarity = 'negative'
           
    return polarity

def predict_sentiwordnet(opinions, sent):
    endscore = 0
    for opinion in opinions:
#         if opinion in positive_lexicon:
#             endscore += 1
#         elif opinion in negative_lexicon:
#             endscore -= 1
#         break
        try:
            score = swn.senti_synset(opinion + '.a.1')
            if score.pos_score() > score.neg_score():
                endscore += score.pos_score()
            else:
                endscore += score.neg_score() * (-1)

            words = sent.split(' ')
            word_around = []
            for x in range(0, len(words)):
                if words[x] in string.punctuation:
                    continue
                try:
                    if (words[x+1] == opinion) or (words[x+2] == opinion) or (words[x+3] == opinion) or (words[x+4] == opinion):
                        word_around.append(words[x])
                    elif (words[x-1] == opinion):
                        word_around.append(words[x])
                except:
                    pass
            for neg in negation:
                if neg in word_around:
                    endscore *= (-1)
                    break
        except:
            if opinion in op_set:
                print(opinion)
            pass
    if endscore > 0:
        polarity = 'positive'
    else:
        polarity = 'negative'
           
    return polarity, endscore

def predict_sentiwordnet_lesk(opinions, sent):
    endscore = 0
    for opinion in opinions:
#         if opinion in positive_lexicon:
#             endscore += 1
#         elif opinion in negative_lexicon:
#             endscore -= 1
#         break
        try:
            tagged_sentence = pos_tag(sent)
            op_with_tag = ('','')
            for word, tag in tagged_sentence:
                if opinion == word:
                    op_with_tag = (word, tag)
                    break
                    
            pos = ''
            if 'NN' in op_with_tag[1]:
                pos = 'n'
            elif 'JJ' in op_with_tag[1]:
                pos = 'a'
            elif 'VB' in op_with_tag[1]:
                pos = 'v'
            elif 'RB' in op_with_tag[1]:
                pos = 'r'
            else:
                pos = 'a'
            
            score = swn.senti_synset(simple_lesk(sent, opinion, pos).name())
            if score.pos_score() > score.neg_score():
                endscore += score.pos_score()
            else:
                endscore += score.neg_score() * (-1)
                
            words = sent.split(' ')
            word_around = []
            for x in range(0, len(words)):
                if words[x] in string.punctuation:
                    continue
                try:
                    if (words[x+1] == opinion) or (words[x+2] == opinion) or (words[x+3] == opinion) or (words[x+4] == opinion):
                        word_around.append(words[x])
                    elif (words[x-1] == opinion):
                        word_around.append(words[x])
                except:
                    pass
            
            for neg in negation:
                if neg in word_around:
                    endscore *= (-1)
                    break
           
        except Exception as e:
            print(e)
            pass
            
    if endscore > 0:
        polarity = 'positive'
    else:
        polarity = 'negative'
           
    return polarity, endscore

In [10]:
def calculate(category, tipe):
    df = pd.read_csv('Results/Sentiwordnet/hasil_'+ tipe + category +'.csv')
    tp, tn, fp, fn = 0, 0, 0, 0
    p = 'positive'
    n = 'negative'
    for index in range(0, len(df)):
        label = df['label'][index]
        predict = df['predict'][index]
        
        if label == p and predict == p:
            tp += 1
        elif label == n and predict == n:
            tn += 1
        elif label == n and predict == p:
            fp += 1
        elif label == p and predict == n:
            fn += 1
    
    precision = round(tp / (tp + fp), 2)
    recall = round(tp / (tp + fn), 2)
    f1 = round(2 * ((precision * recall) / (precision + recall)), 2)
    accuracy = round((tp+tn)/(tp+tn+fp+fn), 2)
    print(tp, '\t', tn, '\t', fp, '\t', fn, '\t', '\t', precision, '\t', recall, '\t', f1, '\t', '\t', accuracy)
    return tp, tn, fp, fn, precision, recall, f1, accuracy

In [11]:
def preprocessing(sentence):
    return sentence.lower()

def calculate_f1(p, r):
    return round(2*((p*r)/(p+r)),2)

In [12]:
import numpy as np

def print_hasil(tipe):
    print('tp', '\t', 'tn', '\t', 'fp', '\t', 'fn', '\t', '\t', 'prec', '\t', 'rec', '\t', 'f1', '\t', '\t', 'accuracy')
    precision = []
    recall = []
    f1 = []
    cat = ['AMBIENCE', 'FOOD', 'SERVICE', 'PRICES']
    for c in cat:
        a, b, c, d, e, f, g, h = calculate(c, tipe)
        precision.append(e)
        recall.append(f)
        f1.append(g)
#         sf = sf.append({'TP':a, 'TN':b, 'FP':c, 'FN':d, 'precision':e, 'recall':f, 'f1':g, 'accuracy': h}, ignore_index=True)
#         sf.to_excel("Results/Calculation/"+ tipe +".xlsx")
    print('', '\t', '', '\t', '', '\t', '', '\t', '\t', round(np.mean(precision),2), '\t', round(np.mean(recall),2), '\t', calculate_f1(round(np.mean(precision),2), round(np.mean(recall),2)), '\t', '\t')

In [13]:
import pandas as pd
def run_lex(category):
    df = pd.read_csv('Results/'+ category +'.csv')
    sf = pd.DataFrame(columns = ['id','review', 'opinion', 'label', 'predict'])
    for index in range(0, len(df)):
        opinion = []
        if type(df['opinion'][index]) != float:
            opinion = df['opinion'][index].split('|')
            
        prediction = predict_lexicon(opinion, preprocessing(df['review'][index]))
        sf = sf.append({'id': df['id'][index], 
            'review': df['review'][index],
            'opinion': df['opinion'][index],
            'label': df['label'][index],
            'predict': prediction
           }, ignore_index=True)
    
    sf.to_csv("Results/Sentiwordnet/hasil_lex_"+ category +".csv")
    sf.to_excel("Results/Sentiwordnet/hasil_lex_"+ category +".xlsx")

In [14]:
import pandas as pd
def run(category):
    df = pd.read_csv('Results/'+ category +'.csv')
    sf = pd.DataFrame(columns = ['id','review', 'opinion', 'label', 'predict','score'])
    for index in range(0, len(df)):
        opinion = []
        if type(df['opinion'][index]) != float:
            opinion = df['opinion'][index].split('|')
            
        prediction, score = predict_sentiwordnet(opinion, preprocessing(df['review'][index]))
        sf = sf.append({'id': df['id'][index], 
            'review': df['review'][index],
            'opinion': df['opinion'][index],
            'label': df['label'][index],
            'predict': prediction,
            'score': score
           }, ignore_index=True)
    
    sf.to_csv("Results/Sentiwordnet/hasil_"+ category +".csv")
    sf.to_excel("Results/Sentiwordnet/hasil_"+ category +".xlsx")

In [15]:
import pandas as pd
def run_lesk(category):
    df = pd.read_csv('Results/'+ category +'.csv')
    sf = pd.DataFrame(columns = ['id','review', 'opinion', 'label', 'predict','score'])
    for index in range(0, len(df)):
        opinion = []
        if type(df['opinion'][index]) != float:
            opinion = df['opinion'][index].split('|')
            
        prediction, score = predict_sentiwordnet_lesk(opinion, preprocessing(df['review'][index]))
        sf = sf.append({'id': df['id'][index], 
            'review': df['review'][index],
            'opinion': df['opinion'][index],
            'label': df['label'][index],
            'predict': prediction,
            'score': score
           }, ignore_index=True)
    
    sf.to_csv("Results/Sentiwordnet/hasil_lesk_"+ category +".csv")
    sf.to_excel("Results/Sentiwordnet/hasil_lesk_"+ category +".xlsx")

In [16]:
run_lex('FOOD')
run_lex('AMBIENCE')
run_lex('SERVICE')
run_lex('PRICES')

In [17]:
run('FOOD')
run('AMBIENCE')
run('SERVICE')
run('PRICES')

delight
delight
bargain
recommend
enjoy
consistently
gem
fun
recommend
kills
enjoy
attraction
hang
fun
ghetto
warmer
love
love
recommend
intimidate
enjoy
trouble
exceeded


In [18]:
# run_lesk('FOOD')
# run_lesk('AMBIENCE')
# run_lesk('SERVICE')
# run_lesk('PRICES')

In [19]:
print_hasil('lex_') # lexicon

tp 	 tn 	 fp 	 fn 	 	 prec 	 rec 	 f1 	 	 accuracy
58 	 9 	 3 	 21 	 	 0.95 	 0.73 	 0.83 	 	 0.74
59 	 23 	 2 	 22 	 	 0.97 	 0.73 	 0.83 	 	 0.77
48 	 33 	 6 	 15 	 	 0.89 	 0.76 	 0.82 	 	 0.79
26 	 11 	 3 	 11 	 	 0.9 	 0.7 	 0.79 	 	 0.73
 	  	  	  	 	 0.93 	 0.73 	 0.82 	 	


In [20]:
print_hasil('') # sentiwordnet

tp 	 tn 	 fp 	 fn 	 	 prec 	 rec 	 f1 	 	 accuracy
42 	 10 	 2 	 37 	 	 0.95 	 0.53 	 0.68 	 	 0.57
42 	 22 	 3 	 39 	 	 0.93 	 0.52 	 0.67 	 	 0.6
32 	 31 	 8 	 31 	 	 0.8 	 0.51 	 0.62 	 	 0.62
19 	 11 	 3 	 18 	 	 0.86 	 0.51 	 0.64 	 	 0.59
 	  	  	  	 	 0.88 	 0.52 	 0.65 	 	


In [20]:
# print_hasil('lesk_') # sentiwordnet + wsd

In [21]:
'long' in op_set

False