In [1]:
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import sentiwordnet as swn
import math
from math import exp, expm1, log, log10
import numpy as np
import turtle
import pandas as pd
from nltk.wsd import lesk
from pycorenlp import StanfordCoreNLP
from pywsd.lesk import simple_lesk
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
import sys, os


nlp = StanfordCoreNLP('http://localhost:9000')
dependency_parser = nlp.annotate
positive_lexicon = []
negative_lexicon = []

def read_lexicon():
    global positive_lexicon;
    global negative_lexicon;
    
    with open(os.path.join(os.path.abspath('opinion-lexicon-English/') , 'positive-words.txt'), 'r') as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
         
        positive_lexicon = file.readlines()
    
    with open(os.path.join(os.path.abspath('opinion-lexicon-English/') , 'negative-words.txt'), 'r', encoding = "ISO-8859-1") as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
        
        negative_lexicon = file.readlines()
        
    positive_lexicon = list(map(lambda word: word.rstrip("\n\r"), positive_lexicon))
    negative_lexicon = list(map(lambda word: word.rstrip("\n\r"), negative_lexicon))
    
        
read_lexicon()
op_set = positive_lexicon + negative_lexicon

negation = [
    "afraid",
    "can't",
    "cannot",
    "deny",
    "mean",
    "negate",
    "negation",
    "negative",
    "neither",
    "never",
    "no",
    "non",
    "none",
    "nor",
    "not",
    "nothing",
    "refusal",
    "refuse",
    "reject",
    "rejection"
]

def analyse_file(key, lines):    
    radii = get_TDOC(lines, key)    
    return radii

def get_TDOC(lines, key):
    freq = {'Init': 0}              #Number of times context term occurs with key
    freq.clear()
    prohib = [''] #stopWords
    for line in lines:
        words = line.split(" ")
        if key in words:
            for context in words:
                flag=0
                for i in prohib:
                    if i == context:
                        flag=1
                        break
                if flag==0 and context!=key and context in op_set:
                    freq.setdefault(context, 0)
                    freq[context] = freq.get(context) + 1
                                           
    N = 0                           #Total Number of terms in Document
    for line in lines:
        words = line.split(" ")
        N += len(words)

    Nci = {'Init': 0}               #Total terms that occur with context term
    Nci.clear()
    for context in freq.keys():
        for line in lines:
            words = line.split(" ")
            if context in words:
                Nci.setdefault(context, 0)
                Nci[context] += len(words)

    radii = {'Init': 0}             #Get Radius of context term with TDOC formula
    radii.clear()
    
    df = pd.DataFrame(columns=['c', 'm', 'N', 'Nc', 'f', 'N/Nc', 'log(N/Nc)', 'fxlog(N/Nc)', '/4'])
    max_value = 0
    for term in freq.keys():
        radii[term] = (freq[term]*(log(N/Nci[term])))
        
        if radii[term] > max_value:
            max_value = radii[term]
        
    for term in freq.keys():
        radii[term] = radii[term]/max_value
        
        df = df.append({'c': term,
                'm': key,
                'N': N,
                'Nc': Nci[term],
                'f': freq[term],
                'N/Nc': "{0:.2f}".format(N/Nci[term]),
                'log(N/Nc)': "{0:.2f}".format(log(N/Nci[term])),
                'fxlog(N/Nc)': "{0:.2f}".format(freq[term]*(log(N/Nci[term]))),
                'normalisasi': "{0:.2f}".format((freq[term]*(log(N/Nci[term])))/max_value)
               }, ignore_index=True)
    
#     df.to_excel("tdoc2.xlsx")
    return radii                    #Returns entire set of context terms related to key

def pos_tag(sentence):
    result = dependency_parser(sentence, properties={"outputFormat": "json", "annotators": "pos"})['sentences'][0]['tokens']
    res = []
    for pos in result:
        res.append((pos['word'], pos['pos']))
    return res

def get_theta(key, sentences):
    scores = []
    for sentence in sentences:
        flag = True
        
        pp_tagged = pos_tag(sentence)
        tagged = ('','')
        for p in pp_tagged:
            if p[0] == key:
                tagged = p
        if tagged == ('', ''):
            flag = False
        
        ambiguous = tagged[0]
        tag = tagged[1]
        pos = ''

        if 'NN' in tag:
            pos = 'n'
        elif 'NNS' in tag:
            pos = 'nns'
        elif 'VB' in tag:
            pos = 'v'
        elif 'VBG' in tag:
            pos = 'v'
        elif 'JJ' in tag:
            pos = 'a'
        elif 'RB' in tag:
            pos = 'r'
        else:
            flag = False

        if flag:
            answer = simple_lesk(sentence, ambiguous, pos)
            if answer:
                score = swn.senti_synset(answer.name())
                endscore = 0
                
                if score.pos_score() > score.neg_score():
                    endscore = score.pos_score()
                else:
                    endscore = score.neg_score() * (-1)
                
                words = sentence.split(' ')
                word_around = []
                for x in range(0, len(words)):
                    try:
                        if (words[x+1] == key) or (words[x+2] == key) or (words[x+3]== key):
                            word_around.append(words[x])
                        elif (words[x-1] == key) or (words[x-2] == key) or (words[x-3]== key):
                            word_around.append(words[x])
                    except:
                        pass
                    
                for neg in negation:
                    if neg in word_around:
                        endscore *= (-1)
                        break
                        
                scores.append(endscore)
            else:
                scores.append(0)
        else:
            scores.append(0)
            
    final_score = np.average(scores)
    return np.pi * final_score

def prior_sentiment(radii, key, all_sentences):
    theta = {'Init': 0}
    theta.clear()
    for word in radii.keys():
        sentences = []
        for sentence in all_sentences:
            words = sentence.split(' ')
            if (word in words) and (key in words):
                sentences.append(sentence)
                
        filter = get_theta(word, sentences)            #if function returns 0 word does not exist in lexicon
        theta[word] = filter
        
    return theta

def senti(key, lines):
    radii = analyse_file(key, lines)
    theta = prior_sentiment(radii, key, lines)
    return theta

Warming up PyWSD (takes ~10 secs)... took 4.472042560577393 secs.


In [2]:
import sys, os
import re
import pandas as pd
import itertools, nltk, string 

def read_file(file):
    f = open(file, 'r')

    pattern_title = '\[t\]'
    pattern_sentence = '(?<=##).+'
    pattern_aspect = '.+(?=##)'

    review = []
    for a in f:
        if re.search('##', a):
            sentence = re.findall(pattern_sentence, a)[0]
            aspect = re.findall(pattern_aspect, a)
            if len(aspect) > 0:
                aspect = aspect[0]
            else:
                aspect = ''
            review.append((sentence, aspect))

    df = pd.DataFrame(columns=['review','target'])
    for r in review:
        df = df.append({'review': r[0], 'target': r[1]}, ignore_index=True)
        
    return df

In [116]:
def main_program(review, target, theta, filename):
    predicts = []
    labels = []
    index = 0
    
    senti = {}
    
    for sentence in review:
        aspects = target[index].split(',')
        label = 0
        score = 0  
        if aspects[0] != '':
            for aspect in aspects:
                feature = aspect.split('[')[0]
                tanda = aspect.split('[')[1][0]
                angka = aspect.split('[')[1][1]
                
            
                if tanda == '+':
                    label += int(angka)
                else:
                    label -= int(angka)

                                  
                for word in sentence.split(' '):
                    if word in op_set:
                        try:
                            score += theta[feature][word]
                        except:
                            score = score
                            
            if label >= 0:
                labels.append(1)
            elif label < 0:
                labels.append(2)
#             else:            
#                 labels.append(0)

            if score >= 0:
                predicts.append(1)
            elif score < 0:
                predicts.append(2)
#             else:
#                 predicts.append(0)
                        

        else:
            labels.append(0)
            predicts.append(0)
            
        index += 1
        
    data = {'label': labels, 'prediction': predicts}
    out = pd.DataFrame(data)
    out.to_csv(filename)
    
    return labels, predicts

In [117]:
# all_theta = {}
# all_target = []
# for target in df['target']:
#     aspects = target.split(', ')
#     if aspects[0] != '':
#         for aspect in aspects:
#             feature = aspect.split('[')[0]
#             if feature not in all_target:
#                 all_theta[feature] = senti(feature, df['review'])
#                 all_target.append(feature)

In [118]:
df = read_file('dataset/bing_liu/Nikon coolpix 4300.txt')

In [119]:
label, pred = main_program(df['review'], df['target'], all_theta, 'sentiwordnet.csv')

In [120]:
y_actu = pd.Series(label, name='Actual')
y_pred = pd.Series(pred, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)
df_confusion

Predicted,0,1,2
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,186,0,0
1,0,116,14
2,0,25,5


In [121]:
from pandas_ml import ConfusionMatrix
cm = ConfusionMatrix(label, pred)
cm.print_stats()

Confusion Matrix:

Predicted    0    1   2  __all__
Actual                          
0          186    0   0      186
1            0  116  14      130
2            0   25   5       30
__all__    186  141  19      346


Overall Statistics:

Accuracy: 0.8872832369942196
95% CI: (0.8491447202686204, 0.9186086931747108)
No Information Rate: ToDo
P-Value [Acc > NIR]: 9.938995662017918e-45
Kappa: 0.7962247055270312
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                       0          1          2
Population                                  346        346        346
P: Condition positive                       186        130         30
N: Condition negative                       160        216        316
Test outcome positive                       186        141         19
Test outcome negative                       160        205        327
TP: True Positive                           186        116          5
TN: True Negative                           160

  num = df[df > 1].dropna(axis=[0, 1], thresh=1).applymap(lambda n: choose(n, 2)).sum().sum() - np.float64(nis2 * njs2) / n2
  return(np.float64(self.TPR) / self.FPR)


In [62]:
type(senti('love', ['i love you', 'you love me']))

dict