In [1]:
import nltk
from nltk.corpus import sentiwordnet as swn
from pycorenlp import StanfordCoreNLP
from pywsd.lesk import simple_lesk
from pywsd.lesk import cosine_lesk
from nltk.corpus import wordnet as wn
import requests
import sys, os
import numpy as np
import pandas as pd

Warming up PyWSD (takes ~10 secs)... took 4.85966157913208 secs.


In [2]:
nlp = StanfordCoreNLP('http://localhost:9000')
dependency_parser = nlp.annotate

In [3]:
breakdown = swn.senti_synset('good.n.01')
print(breakdown.pos_score())

0.5


In [4]:
wn.synsets('love')

[Synset('love.n.01'),
 Synset('love.n.02'),
 Synset('beloved.n.01'),
 Synset('love.n.04'),
 Synset('love.n.05'),
 Synset('sexual_love.n.02'),
 Synset('love.v.01'),
 Synset('love.v.02'),
 Synset('love.v.03'),
 Synset('sleep_together.v.01')]

In [5]:
positive_lexicon = []
negative_lexicon = []

def read_lexicon():
    global positive_lexicon;
    global negative_lexicon;
    
    with open(os.path.join(os.path.abspath('opinion-lexicon-English/') , 'positive-words.txt'), 'r') as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
         
        positive_lexicon = file.readlines()
    
    with open(os.path.join(os.path.abspath('opinion-lexicon-English/') , 'negative-words.txt'), 'r', encoding = "ISO-8859-1") as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
        
        negative_lexicon = file.readlines()
        
    positive_lexicon = list(map(lambda word: word.rstrip("\n\r"), positive_lexicon))
    negative_lexicon = list(map(lambda word: word.rstrip("\n\r"), negative_lexicon))
    
        
read_lexicon()
op_set = positive_lexicon + negative_lexicon

In [6]:
import sys, os
import re
import pandas as pd
import itertools, nltk, string 

def read_file(file):
    f = open(file, 'r')

    pattern_title = '\[t\]'
    pattern_sentence = '(?<=##).+'
    pattern_aspect = '.+(?=##)'

    review = []
    for a in f:
        if re.search('##', a):
            sentence = re.findall(pattern_sentence, a)[0]
            aspect = re.findall(pattern_aspect, a)
            if len(aspect) > 0:
                aspect = aspect[0]
            else:
                aspect = ''
            review.append((sentence, aspect))

    df = pd.DataFrame(columns=['review','target'])
    for r in review:
        df = df.append({'review': r[0], 'target': r[1]}, ignore_index=True)
        
    return df

In [7]:
def pos_tag(sentence):
    result = dependency_parser(sentence, properties={"outputFormat": "json", "annotators": "pos"})['sentences'][0]['tokens']
    res = []
    for pos in result:
        res.append((pos['word'], pos['pos']))
    return res

In [8]:
def get_score(sentence, key, tagged_sentence):
    flag = True
    
    tagged = ('','')
    for p in tagged_sentence:
        if p[0] == key:
            tagged = p
    if tagged == ('', ''):
        flag = False

    ambiguous = tagged[0]
    tag = tagged[1]
    pos = ''

    if 'NN' in tag:
        pos = 'n'
    elif 'NNS' in tag:
        pos = 'nns'
    elif 'VB' in tag:
        pos = 'v'
    elif 'VBG' in tag:
        pos = 'v'
    elif 'JJ' in tag:
        pos = 'a'
    elif 'RB' in tag:
        pos = 'r'
    else:
        flag = False

    endscore = 0
    if flag:
        answer = cosine_lesk(sentence, ambiguous, pos)
        if answer:
            score = swn.senti_synset(answer.name())

            if score.pos_score() > score.neg_score():
                endscore = score.pos_score()
            else:
                endscore = score.neg_score() * (-1)
    return endscore

In [9]:
def get_sentence_score(sentence):
    scores = []
    tagged_sentence = pos_tag(sentence)
    for word in sentence.split(' '):
        if word in op_set:
            score = get_score(sentence, word, tagged_sentence)
            scores.append(score)
    if len(scores):
        return np.mean(scores)
    return 0

In [17]:
def main_program(review, target, filename):
    scores = []
    labels = []
    index = 0
    for sentence in review:
        aspects = target[index].split(',')
        label = 0
        if aspects[0] != '':
            for aspect in aspects:
                try:
                    tanda = aspect.split('[')[1][0]
                    angka = aspect.split('[')[1][1]
                    if tanda == '+':
                        label += int(angka)
                    else:
                        label -= int(angka)
                except:
                    print(aspect)
        
        score = get_sentence_score(sentence)
        
        if label > 0:
            labels.append(1)
        elif label < 0:
            labels.append(2)
        else:            
            labels.append(0)
            
        if score > 0:
            scores.append(1)
        elif score < 0:
            scores.append(2)
        else:            
            scores.append(0)
            
        index += 1
    
    data = { 'review':review, 'label': labels, 'prediction': scores}
    out = pd.DataFrame(data)
    out.to_csv(filename)
    
    return labels, scores

In [18]:
df = read_file('dataset/bing_liu/Nikon coolpix 4300.txt')

In [19]:
label, pred = main_program(df['review'], df['target'], 'sentiwordnet.csv')

In [20]:
y_actu = pd.Series(label, name='Actual')
y_pred = pd.Series(pred, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)

In [21]:
df_confusion

Predicted,0,1,2
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,112,54,21
1,34,75,20
2,14,6,10


In [22]:
from pandas_ml import ConfusionMatrix
cm = ConfusionMatrix(label, pred)

In [23]:
cm.print_stats()

Confusion Matrix:

Predicted    0    1   2  __all__
Actual                          
0          112   54  21      187
1           34   75  20      129
2           14    6  10       30
__all__    160  135  51      346


Overall Statistics:

Accuracy: 0.569364161849711
95% CI: (0.5153378774354668, 0.6221930440346766)
No Information Rate: ToDo
P-Value [Acc > NIR]: 4.250370715909964e-05
Kappa: 0.27236030542970446
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                       0         1          2
Population                                  346       346        346
P: Condition positive                       187       129         30
N: Condition negative                       159       217        316
Test outcome positive                       160       135         51
Test outcome negative                       186       211        295
TP: True Positive                           112        75         10
TN: True Negative                           111       

  num = df[df > 1].dropna(axis=[0, 1], thresh=1).applymap(lambda n: choose(n, 2)).sum().sum() - np.float64(nis2 * njs2) / n2
