In [54]:
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
import matplotlib.pyplot as plt
from nltk.corpus import sentiwordnet as swn
import math
from math import exp, expm1, log, log10
import numpy as np
import turtle
import pandas as pd
from nltk.wsd import lesk
from pycorenlp import StanfordCoreNLP
from pywsd.lesk import adapted_lesk
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
import sys, os


# Client for a Stanford CoreNLP server; the code expects one to be reachable at localhost:9000.
nlp = StanfordCoreNLP('http://localhost:9000')
# Alias for the client's annotate method; pos_tag() calls it to obtain POS tags.
dependency_parser = nlp.annotate
# Opinion word lists; both start empty and are filled in by read_lexicon().
positive_lexicon = []
negative_lexicon = []

def _load_lexicon_words(path):
    """Return the word entries of one opinion-lexicon file, in file order.

    Leading lines that contain ";" are treated as the comment header and
    skipped. Everything after the header is kept, minus blank lines, with
    trailing newline characters removed.
    """
    words = []
    in_header = True
    # ISO-8859-1 maps every byte to a character, so decoding never fails
    # regardless of the platform's default encoding.
    with open(path, 'r', encoding="ISO-8859-1") as file:
        for line in file:
            if in_header and ";" in line:
                continue
            # The first line without ";" ends the header. It is examined like
            # any other line, so a word placed directly after the header is kept.
            in_header = False
            word = line.rstrip("\n\r")
            if word:
                words.append(word)
    return words


def read_lexicon():
    """Load the positive and negative opinion word lists into the module globals.

    Reads 'positive-words.txt' and 'negative-words.txt' from the
    'opinion-lexicon-English/' directory (resolved against the current working
    directory) and rebinds `positive_lexicon` and `negative_lexicon` to the
    resulting lists of words. Returns nothing.

    Raises:
        OSError: if either lexicon file cannot be opened.
    """
    global positive_lexicon
    global negative_lexicon

    lexicon_dir = os.path.abspath('opinion-lexicon-English/')
    positive_lexicon = _load_lexicon_words(os.path.join(lexicon_dir, 'positive-words.txt'))
    negative_lexicon = _load_lexicon_words(os.path.join(lexicon_dir, 'negative-words.txt'))
    
        
# Fill the two lexicon globals from disk, then concatenate them into one collection of opinion words.
read_lexicon()
# NOTE: despite its name this is a list, so `word in op_set` is a linear scan.
op_set = positive_lexicon + negative_lexicon

# Negation cue words. get_theta() looks for these in a sentence and, when one is
# found, flips the sign of the sentiment score it computed for that sentence.
negation = [
    "afraid",
    "can't",
    "cannot",
    "deny",
    "mean",
    "negate",
    "negation",
    "negative",
    "neither",
    "never",
    "no",
    "non",
    "none",
    "nor",
    "not",
    "nothing",
    "refusal",
    "refuse",
    "reject",
    "rejection"
]

def analyse_file(key, lines):
    """Return the context-term radii of `key` over `lines`.

    Thin adapter around get_TDOC() that only swaps the argument order.
    """
    return get_TDOC(lines, key)

def get_TDOC(lines, key):
    """Compute a normalised radius for every term that co-occurs with `key`.

    For each context term c found in a line that also contains `key`:

        radius(c) = f(c) * log(N / N_c)

    where f(c) is how many times c appears in lines containing `key`, N is the
    total number of tokens in `lines`, and N_c is the total number of tokens in
    the lines that contain c. Radii are then divided by the largest radius.

    Args:
        lines: iterable of sentences; each is tokenised with ``split(" ")``.
        key: the target term whose context terms are scored.

    Returns:
        dict mapping each context term to its normalised radius, in order of
        first appearance. Empty when `key` never occurs. When every raw radius
        is 0 (e.g. a one-line document) the zeros are returned as they are
        instead of dividing by zero.
    """
    excluded = {''}                 # tokens never counted as context (stop-word filtering is disabled)
    tokenised = [line.split(" ") for line in lines]

    # f(c): occurrences of each context term in lines that mention the key
    freq = {}
    for words in tokenised:
        if key not in words:
            continue
        for context in words:
            if context != key and context not in excluded:
                freq[context] = freq.get(context, 0) + 1

    # N: total number of tokens in the document
    N = sum(len(words) for words in tokenised)

    # N_c: total tokens of the lines containing each context term. A line is
    # counted once per term even if the term is repeated in it, hence set().
    Nci = dict.fromkeys(freq, 0)
    for words in tokenised:
        for context in set(words):
            if context in Nci:
                Nci[context] += len(words)

    radii = {term: freq[term] * log(N / Nci[term]) for term in freq}

    # N_c never exceeds N, so radii are >= 0 and the maximum is 0 only when
    # every radius is 0; normalising would then divide by zero.
    max_value = max(radii.values(), default=0)
    if max_value > 0:
        for term in radii:
            radii[term] = radii[term] / max_value

    return radii                    # entire set of context terms related to key

def pos_tag(sentence):
    """Return (word, POS tag) pairs for every token of `sentence`.

    The text is sent to `dependency_parser` with the "pos" annotator and JSON
    output. The response may contain several sentence entries; tokens of all
    of them are returned in order, so a word after the first entry is not
    lost. A response with no sentence entries yields an empty list.
    """
    annotation = dependency_parser(sentence, properties={"outputFormat": "json", "annotators": "pos"})
    return [
        (token['word'], token['pos'])
        for parsed in annotation['sentences']
        for token in parsed['tokens']
    ]

def get_theta(key, sentences):
    """Return the sentiment angle of `key` averaged over `sentences`.

    For each sentence the word is POS-tagged, disambiguated with
    adapted_lesk(), and the chosen sense is looked up in SentiWordNet. The
    sentence score is the positive score when it exceeds the negative score,
    otherwise minus the negative score. The sign is flipped when the sentence
    contains a negation cue as a whole token. Sentences where the word is not
    found, has an unsupported tag, or has no sense contribute 0.

    Args:
        key: the word to score.
        sentences: iterable of space-tokenised sentences.

    Returns:
        pi times the mean sentence score; 0.0 when `sentences` is empty.
    """
    # Substring checks on the Penn tag, in the same precedence as before:
    # 'NN' also covers NNS/NNP, 'VB' covers VBG/VBD/..., 'RB' also matches WRB.
    tag_to_wordnet = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))

    scores = []
    for sentence in sentences:
        # Tag of the last token equal to key; stays '' when key is not in the sentence.
        tag = ''
        for word, word_tag in pos_tag(sentence):
            if word == key:
                tag = word_tag

        pos = next((wn_pos for marker, wn_pos in tag_to_wordnet if marker in tag), None)
        if pos is None:
            scores.append(0)
            continue

        answer = adapted_lesk(sentence, key, pos)
        if not answer:
            scores.append(0)
            continue

        score = swn.senti_synset(answer.name())
        if score.pos_score() > score.neg_score():
            endscore = score.pos_score()
        else:
            endscore = score.neg_score() * (-1)

        # Compare whole tokens: a substring test would treat "canon" (contains
        # "non") or "know" (contains "no") as negated sentences.
        tokens = {token.lower() for token in sentence.split(' ')}
        if any(cue in tokens for cue in negation):
            endscore *= (-1)

        scores.append(endscore)

    if not scores:
        # np.average([]) would give NaN, which poisons every sum it is added to.
        return 0.0
    return np.pi * np.average(scores)

def prior_sentiment(radii, key, all_sentences):
    """Map every context term in `radii` to its get_theta() value.

    For each term, get_theta() is given only the sentences whose
    space-separated tokens include both the term and `key`.
    """
    def mentions_both(sentence, term):
        tokens = sentence.split(' ')
        return (term in tokens) and (key in tokens)

    return {
        term: get_theta(term, [s for s in all_sentences if mentions_both(s, term)])
        for term in radii.keys()
    }

def senti(key, lines):
    """Return the per-context-term theta values for `key` over `lines`.

    Chains analyse_file() (context radii) into prior_sentiment() (theta per term).
    """
    return prior_sentiment(analyse_file(key, lines), key, lines)

[nltk_data] Downloading package stopwords to C:\Users\Farza
[nltk_data]     Nurifan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Farza Nurifan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [55]:
import sys, os
import re
import pandas as pd
import itertools, nltk, string 

def read_file(file):
    """Parse an annotated review file into a DataFrame of sentences and targets.

    Only lines containing '##' are used. The text after the first '##' becomes
    the 'review' column and the text before the last '##' becomes the 'target'
    column; either is '' when that side of the marker is empty. Lines without
    '##' are ignored.

    Args:
        file: path of the text file to read.

    Returns:
        pandas.DataFrame with columns ['review', 'target'], one row per
        '##' line, in file order.

    Raises:
        OSError: if the file cannot be opened.
    """
    pattern_sentence = re.compile(r'(?<=##).+')
    pattern_aspect = re.compile(r'.+(?=##)')

    records = []
    # The context manager closes the handle even if parsing fails part-way.
    with open(file, 'r') as f:
        for line in f:
            if '##' not in line:
                continue
            sentence = pattern_sentence.search(line)
            aspect = pattern_aspect.search(line)
            # A line that ends right after '##' has no sentence text; keep the
            # row with an empty review instead of failing on the missing match.
            records.append((
                sentence.group(0) if sentence else '',
                aspect.group(0) if aspect else '',
            ))

    # Build the frame in one go; row-wise DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=['review', 'target'])

In [56]:
def _annotated_polarity(target_field):
    """Sum the signed strengths of the '[+n]' / '[-n]' tags in one target string."""
    aspects = target_field.split(',')
    if aspects[0] == '':
        return 0

    total = 0
    for aspect in aspects:
        parts = aspect.split('[')
        # An aspect without a complete "[<sign><digit>" tag carries no polarity.
        # It must not reuse the strength parsed for the previous aspect.
        if len(parts) < 2 or len(parts[1]) < 2:
            continue
        sign, digit = parts[1][0], parts[1][1]
        try:
            strength = int(digit)
        except ValueError:
            continue
        if sign == '+':
            total += strength
        elif sign == '-':
            total -= strength
    return total


def _opinion_score(sentence, pairs_field, theta, opinion_words):
    """Sum theta[feature][word] over the predicted features and the opinion words of a sentence."""
    if not isinstance(pairs_field, str):
        return 0                    # missing prediction (e.g. NaN read from CSV)
    pairs = pairs_field.split('|')
    # NOTE(review): a prediction starting with '|' is skipped as a whole, as
    # before - confirm whether its remaining pairs should be scored.
    if pairs[0] == '':
        return 0

    hits = [word for word in sentence.split(' ') if word in opinion_words]
    score = 0
    for pair in pairs:
        feature = pair.split('!')[0]
        feature_theta = theta.get(feature)
        if not feature_theta:
            continue
        for word in hits:
            value = feature_theta.get(word)
            # Skip unknown words and NaN thetas (NaN != NaN); a NaN would make
            # the total compare False against both ">= 0" and "< 0".
            if value is not None and value == value:
                score += value
    return score


def main_program(review, target, theta, filename, tp):
    """Label each annotated sentence and predict its polarity from `theta`.

    Args:
        review: sequence of space-tokenised sentences.
        target: per-sentence annotation strings, indexed in step with `review`;
            comma-separated aspects such as 'camera[+2]'.
        theta: dict feature -> dict opinion word -> theta value.
        filename: path of the CSV the labels and predictions are written to.
        tp: per-sentence predicted pairs, indexed in step with `review`;
            'feature!opinion' items joined with '|'. Non-string or missing
            entries are scored as 0.

    Returns:
        (labels, predicts): two equally long lists with 1 for positive and 2
        for negative. Sentences whose annotated polarity sums to 0 are left out
        of both lists. A score of exactly 0 is predicted as positive.
    """
    opinion_words = set(op_set)     # one O(1) lookup table instead of scanning the list per word
    labels = []
    predicts = []

    for index, sentence in enumerate(review):
        label = _annotated_polarity(target[index])

        try:
            pairs_field = tp[index]
        except (KeyError, IndexError):
            pairs_field = None      # no prediction row for this sentence
        score = _opinion_score(sentence, pairs_field, theta, opinion_words)

        if label != 0:
            # Exactly one label and one prediction per labelled sentence, so
            # the two lists always stay aligned.
            labels.append(1 if label > 0 else 2)
            predicts.append(1 if score >= 0 else 2)

    data = {'label': labels, 'prediction': predicts}
    out = pd.DataFrame(data)
    out.to_csv(filename)

    return labels, predicts

In [57]:
# Base names of the datasets that run_all() evaluates. For each name it reads
# 'dataset/bing_liu/<name>.txt' and 'hasil_clause/<name>.csv'.
# Commented-out entries are skipped.
filename = [
#             'Apex AD2600 Progressive-scan DVD player',
#             'Canon G3',
#             'Creative Labs Nomad Jukebox Zend Xtra 40GB',
            'Nikon coolpix 4300',
#             'Nokia 6610'
]

def run_all():
    """Evaluate every dataset in `filename` and save the confusion counts.

    For each dataset name this reads 'dataset/bing_liu/<name>.txt' with
    read_file() and 'hasil_clause/<name>.csv' with pandas, computes theta
    values (via senti()) once per distinct predicted feature, runs
    main_program() on the preprocessed reviews, and counts the outcomes with
    label 1 as the positive class:

        TP: labelled 1, predicted 1      TN: labelled 2, predicted 2
        FP: labelled 2, predicted 1      FN: labelled 1, predicted 2

    The accumulated table is rewritten to 'result-senticircle-v2.csv' and
    'result-senticircle-v2.xlsx' after every dataset. Returns nothing.
    """
    columns = ['name', 'TP', 'TN', 'FP', 'FN']
    rows = []
    for file in filename:
        print(file)

        df = read_file('dataset/bing_liu/' + file + '.txt')
        pred_f = pd.read_csv('hasil_clause/' + file + '.csv')

        all_theta = {}
        for target in pred_f['prediction']:
            if not isinstance(target, str):
                continue            # empty CSV cell, read back as NaN
            aspects = target.split('|')
            if aspects[0] == '':
                continue
            for aspect in aspects:
                feature = aspect.split('!')[0]
                if feature in all_theta:
                    continue
                try:
                    all_theta[feature] = senti(feature, df['review'])
                except Exception as err:
                    # Best effort, as before: a feature that cannot be scored is
                    # left out, but the failure is reported instead of hidden.
                    print('senti failed for %r: %s' % (feature, err))

        label, pred = main_program(preprocessing(df['review']), df['target'], all_theta, 'sentiwordnet.csv', pred_f['prediction'])

        TP = TN = FP = FN = 0
        for l, p in zip(label, pred):
            if l == 1 and p == 1:
                TP += 1
            elif l == 2 and p == 2:
                TN += 1
            elif l == 2 and p == 1:
                FP += 1             # actually negative, predicted positive
            elif l == 1 and p == 2:
                FN += 1             # actually positive, predicted negative

        rows.append({'name': file, 'TP': TP, 'TN': TN, 'FP': FP, 'FN': FN})
        # Rebuilt from the row list each time; DataFrame.append no longer exists in pandas 2.x.
        out_file = pd.DataFrame(rows, columns=columns)
        out_file.to_csv("result-senticircle-v2.csv")
        out_file.to_excel("result-senticircle-v2.xlsx")

In [59]:
# Evaluate every dataset listed in `filename`; results go to result-senticircle-v2.csv/.xlsx.
run_all()

Nikon coolpix 4300
camera!perfect
nan
nan
nan
|annie-lebovitz!able
job!fine|manual!job
love!love|auto-focus!love
flash!compact|mb!compact|battery!rechargable|battery!compact|battery!flash|unit!compact
mb-cf!good|battery!second|battery!mb-cf|vacation!long
camera!recommend
|experience!digital
pre-production!positive|budget!positive|budget!pre-production
nan
scene!easy
mode!rich|feature!mode
macro-mode!exceptional
battery-life!good|time!first
mode!fine|card!16mb|lexar!16mb
nan
nan
|coolpix!nikon
amount!immense|use!immense
tune!fine|scene!tune
camera!ideal
nan
nan
nan
nan
nan
picture!good|mp!other|nikon!headaches
nan
|way!card-reader|frustration!many|card-reader!only
nan
nan
nan
lexar!wrong|nothing!wrong
|battery!low
camera!good
nan
nan
nan
nan
software!card-reader
|way!direct|directory!different|directory!way|option!way|time!way
nan
nan
nan
nan
nan
image-size!fine|picture!fine|viking-cf!simpler
nan
camera!good|picture-clarity!good|close-up-shooting-capability!exceptional|close-up-shooting

autofocus!work ----
battery!low ----
power!enough ----
case!power ----
battery-life!ok ----
battery!second ----
cf!second ----
plan!battery ----
camera!great ----
t!great ----
price!great ----
price!camera ----
quality!superb ----
size!perfect ----
quality!excellent ----
package!durable ----
brand!nikon ----
nikon!package ----
idea!good ----
battery!extra ----
hanging!left ----
auto-focus!nice ----
issue!issue ----
zoom!optical ----
3x!optical ----
zoom!optical ----
weight!optical ----
size!weight ----
glass!zoom ----
length!glass ----
zoom!digital ----
top!digital ----
hinderance!manual ----
shudder-button!depressing ----
reality!drawbacks ----
camera!hate ----
camera!awesome ----
field!well ----
macro!excellent ----
lot!other ----
time!waste ----
fan!nikon ----
slr-nikon-fe!fan ----
n50!slr-nikon-fe ----
camera!useful ----
money!little ----
time!little ----
moment!good ----
picture!money ----
download!moment ----
battery!handy ----
en-el!handy ----
situation!handy ----
camera!recomme

In [7]:
# Load the 'Canon G3' CSV from hasil_clause/ for inspection in the cells below.
pred_f = pd.read_csv('hasil_clause/' + 'Canon G3' + '.csv')

In [46]:
def preprocessing(sentences):
    """Expand the contractions "n't" -> "not" and "'m" -> "am" in each sentence.

    Args:
        sentences: iterable of sentences.

    Returns:
        list of the rewritten sentences. Entries that are not text (e.g. a NaN
        float or None) are left out, so the result can be shorter than the
        input; callers that index it in step with another sequence should pass
        text-only input.
    """
    res = []
    for sentence in sentences:
        try:
            res.append(sentence.replace("n't", "not").replace("'m", "am"))
        except (AttributeError, TypeError):
            # Not a string: no .replace (float NaN, None) or a .replace that
            # rejects str arguments (bytes). Skipped, as before.
            continue
    return res

In [47]:
# Preview the contraction expansion on column '0' of the loaded frame.
preprocessing(pred_f['0'])

['i recently purchased the canon-powershot-g3 and am extremely satisfied with the purchase . ',
 'the camera is very easy to use , in fact on a recent trip this past week i was asked to take a picture of a vacationing elderly group . ',
 'after i took their picture with their camera , they offered to take a picture of us . ',
 'i just told them , press halfway , wait for the box to turn green and press the rest of the way . ',
 'they fired away and the picture turned out quite nicely . ( as all of my pictures have thusfar ) . ',
 'a few of my work constituants owned the g2 and highly recommended the canon for picture-quality . ',
 'i am easily enlarging pictures to 8 1/2 x 11 with no visable loss in picture-quality and not even using the best possible setting as yet ( super fine ) . ',
 "ensure you get a larger flash , 128 or 256 , some are selling with the larger flash , 32mb will do in a pinch but you 'll quickly want a larger flash-card as with any of the 4mp cameras . ",
 'bottom l

In [44]:
# Display the loaded frame.
pred_f

Unnamed: 0.1,Unnamed: 0,0,aspect,prediction
0,0,i recently purchased the canon-powershot-g3 an...,canon powershot g3!+,
1,1,"the camera is very easy to use , in fact on a ...",use!+,camera!easy|fact!camera|week!past|trip!past
2,2,"after i took their picture with their camera ,...",,
3,3,"i just told them , press halfway , wait for th...",,|box!green
4,4,they fired away and the picture turned out qui...,picture!+,
5,5,a few of my work constituants owned the g2 and...,picture quality!+,canon!recommended|g2!few|picture-quality!recom...
6,6,i 'm easily enlarging pictures to 8 1/2 x 11 w...,picture quality!+,setting!possible
7,7,"ensure you get a larger flash , 128 or 256 , s...",,flash!pinch|flash-card!flash|pinch!32mb
8,8,"bottom line , well made camera , easy to use ,...",camera!+|use!+|feature!+,line!bottom|flash!external|lense!external|abil...
9,9,i 'd highly recommend this camera for anyone w...,picture quality!+|use!+|option!+,camera!recommend|quality!excellent|combination...
