## Data cleaning

In [28]:
import csv
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn import cross_validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn import svm
from nltk.stem import WordNetLemmatizer
from nltk.tag.stanford import StanfordPOSTagger
from nltk.parse.stanford import StanfordDependencyParser

with open('dara 1_train.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    """
    for row in reader:
        print(row['example_id'])
        print(row[' text'])
        print(row[' aspect_term'])
        print(row[' term_location'])
        print(row[' class'])
        print()        
    """
    happy_emoji = [':‑)',':)',':-]',':]',':-3',':3',':->',':>','8-)','8)',':-}', ':}',':o)',':c)',':^)','=]','=)',
                  ':‑D',':D','8‑D','8D','x‑D','xD','X‑D','XD','=D','=3','B^D',':-))']
                
    sad_emoji = [':‑(',':(',':‑c',':c',':‑<',':<',':‑[',':[',':-||','>:[',':{',':@','>:(']
    
    negative_words = ['doesn’t', 'isn’t', 'wasn’t', 'shouldn’t', 'wouldn’t', 'couldn’t', 'won’t', 'can’t', 'don’t']
    
    tokenizer = RegexpTokenizer(r'\w+')
    
    stop_words = set(stopwords.words('english'))
    
    singles = []
    texts = []
    tempText = ''
    classes = []
    aspectTerm = []
    
    for row in reader:
        aspectTerm.append(tokenizer.tokenize(row[' aspect_term']))
        
        classes.append(row[' class'])
        tempText = row[' text'].replace('[comma]',',').lower()
               
        for hem in happy_emoji:
            if hem in tempText:
                tempText = tempText.replace(hem, 'emojihappy')
        
        for sem in sad_emoji:
            if sem in tempText:
                tempText = tempText.replace(sem, 'emojisad')
                
        for negw in negative_words:
            if negw in tempText:
                tempText = tempText.replace(negw, 'not')
        
        singles.append(tokenizer.tokenize(tempText))
        
        texts.append(tempText)
        
    newTokenList = []
    
    '''
    # stemming
    stemmer = SnowballStemmer("english")
        
    for tokenList in singles:
        newToken = []
        for token in tokenList:
            if token not in stop_words:
                newToken.append(stemmer.stem(token))
            if token == "not":
                newToken.append(stemmer.stem(token))
        newTokenList.append(newToken)
    '''
    
    # lemmatize
    wnl = WordNetLemmatizer()
    
    for tokenList in singles:
        newToken = []
        for token in tokenList:
            if token not in stop_words:
                newToken.append(wnl.lemmatize(token))
        newTokenList.append(newToken)
    
    # lemmatize aspect term
    newApectTermList =[]
    
    for asp in aspectTerm:
        newAT = []
        for ap in asp:
            newAT.append(wnl.lemmatize(ap))
        newApectTermList.append(newAT)
        
    '''
    unigrams = []
    
    for single in singles:
        unigrams.append(ngrams(single, 1))
    '''
    # new review text
    newText = []
    
    for tk in newTokenList:
        newText.append(' '.join(tk))
        
    # new aspect term
    newAspectTerm = []
    
    for apt in newApectTermList:
        newAspectTerm.append(' '.join(apt))
        
    '''
    kf = KFold(n_splits=10)
    
    
    #vectorizer = TfidfVectorizer(use_idf=True)
    count_vectorizer = CountVectorizer()
    data_tfidf = count_vectorizer.fit_transform(newText)
    tfidf_data = TfidfTransformer(use_idf=False).fit_transform(data_tfidf)
    '''
    
    # POS tagging
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    
    textPOS = []
    
    for nt in newText:
        textPOS.append(st.tag(nt.split()))
        
    # dependency parsing
    dep_parser=StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

    depParsingList = []
    
    for nt in newText:
        result = dep_parser.raw_parse(nt)    
        dep = result.__next__()    
        depParsingList.append(list(dep.triples()))
    

In [24]:
#experiment
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
print(wnl.lemmatize('features'))

feature


In [8]:
#data_train,data_test,target_train,target_test = cross_validation.train_test_split(tfidf_data,classes,test_size=0.4,random_state=43)
#classifier = BernoulliNB().fit(data_train,target_train)
#predicted = classifier.predict(data_test)
#evaluate_model(target_test,predicted)
from sklearn import metrics
from sklearn.metrics import classification_report
clf = svm.SVC(kernel='linear', C=1, random_state=0)
predicted = cross_val_predict(clf, tfidf_data, classes, cv=10)



print(classification_report(classes,predicted))
print("The accuracy score is {:.2%}".format(metrics.accuracy_score(classes, predicted)))

             precision    recall  f1-score   support

         -1       0.70      0.80      0.75       828
          0       0.58      0.39      0.46       436
          1       0.77      0.79      0.78       939

avg / total       0.71      0.71      0.70      2203

The accuracy score is 71.45%


In [2]:
#experiment
#from nltk.tag.stanford import CoreNLPPOSTagger
#CoreNLPPOSTagger(url='http://localhost:9000').tag('What is the airspeed of an unladen swallow ?'.split())

#import nltk
#nltk.pos_tag(newTokenList[0])

from nltk.tag.stanford import StanfordPOSTagger
st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
st.tag('What is the airspeed of an unladen swallow ?'.split())

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordPOSTagger, self).__init__(*args, **kwargs)


[('What', 'WP'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('airspeed', 'NN'),
 ('of', 'IN'),
 ('an', 'DT'),
 ('unladen', 'JJ'),
 ('swallow', 'VB'),
 ('?', '.')]

In [36]:
#experiment
from nltk.parse.stanford import StanfordDependencyParser
dep_parser=StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print([parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")])
result = dep_parser.raw_parse("The pizza at the restaurant was very good.")
dep = result.__next__()
list(dep.triples())

[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]


[(('good', 'JJ'), 'nsubj', ('pizza', 'NN')),
 (('pizza', 'NN'), 'det', ('The', 'DT')),
 (('pizza', 'NN'), 'nmod', ('restaurant', 'NN')),
 (('restaurant', 'NN'), 'case', ('at', 'IN')),
 (('restaurant', 'NN'), 'det', ('the', 'DT')),
 (('good', 'JJ'), 'cop', ('was', 'VBD')),
 (('good', 'JJ'), 'advmod', ('very', 'RB'))]

In [3]:
#experiment
import os
os.environ['STANFORD_MODELS'] + 

'D:\\Important\\NLP\\stanford-postagger-full-2018-02-27\\models;D:\\Important\\NLP\\stanford-parser-full-2018-02-27;D:\\Important\\NLP\\stanford-ner-2018-02-27\\classifiers;'

In [31]:
#experiment
#from collections import Counter
'''
for text in texts:
    print('\n'+text)
'''
'''
for sss in singles:
    print(sss)
'''
'''
for tk in newTokenList:
    print(' '.join(tk))
'''
#print(newTokenList[0])
'''
for tk in newTokenList:
    print("\n",tk)
'''

'''
#print(Counter(unigrams))

for grams in unigrams:
    #print(Counter(grams))
    for g in grams:
         print(g)
'''

#len(classes)
'''
for nt in newText:
    print("\n",nt)
'''

'''
for tokenList in singles:
        for token in tokenList:
            print(token)
        print("\n")
'''

'''
for at in newAspectTerm:
    print("\n",at)
'''


 human interface

 every day computing

 mouse command button

 right speaker

 DELL Customer Service

 Windows Vista

 hard drive

 specification

 setup

 Safari internet browser

 cost

 bluray player

 touchpad

 price

 work

 battery

 warranty

 charge

 3G network

 system

 screen

 warranty

 internal hard disk

 cd drive

 Vista

 brand

 delivery

 service

 affordability

 delivery service

 power

 included program

 Vista

 charger

 warranty

 run

 DVD burner

 Windows update

 keyboard

 webpage

 power

 display

 navigate

 delete key

 mouse pad

 performance

 application

 costing

 sound

 display

 keyboard

 graphic

 battery life

 price

 use

 cordless mouse

 shipping

 SERVICE

 stand

 acer arcade

 price

 external dvd drive

 Windows

 mouse

 keyboard

 tech store

 battery

 carry

 photo booth

 iBook

 feature

 use

 browser

 touchpad

 price

 wireless switch

 playing game

 screen

 iChat

 internet

 shortcut

 program

 battery life

 memor


 RAM slot

 window xp

 Bluetooth

 portability

 company

 hard disc

 motherboard

 program

 battery life

 performance

 screen size

 gray color

 feature

 service

 multiple page viewer

 office

 use

 memory

 application

 Core2 Quad

 use

 Apple application

 edge

 quad core I7

 price

 rubber enclosure

 recovery disc

 performs

 program

 AMD Turin processor

 word processor

 set up

 battery

 size

 internet interface

 power supply

 charge

 screen

 appearance

 iLife software

 feature

 longevity

 port

 fan

 battery

 headphone

 find file

 web browsing

 system

 run

 stability

 movie making

 hard drive

 work

 screen size

 music

 work

 battery

 movie playing

 12 cell battery

 boot up

 touch control button

 Safari

 Windows XP driver

 picture quality

 Windows

 shut down

 load

 system

 feature

 program

 use

 power supply

 Windows 7

 use

 15 inch

 starting up time

 screen

 battery

 software

 KEYBOARD FUNCTION

 boot

 quality

 

In [43]:
# big experiment

import csv

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn import cross_validation
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import accuracy_score

# review.csv contains two columns
# first column is the review content (quoted)
# second column is the assigned sentiment (positive or negative)
def load_file():
    with open('review.csv') as csv_file:
        reader = csv.reader(csv_file,delimiter=",",quotechar='"')
        next(reader)
        data =[]
        target = []
        for row in reader:
            # skip missing data
            if row[0] and row[1]:
                data.append(row[0])
                target.append(row[1])

        return data,target

# preprocess creates the term frequency matrix for the review data set
def preprocess():
    data,target = load_file()
    count_vectorizer = CountVectorizer(binary='true')
    data = count_vectorizer.fit_transform(data)
    tfidf_data = TfidfTransformer(use_idf=False).fit_transform(data)

    return tfidf_data

def learn_model(data,target):
    # preparing data for split validation. 60% training, 40% test
    data_train,data_test,target_train,target_test = cross_validation.train_test_split(data,target,test_size=0.4,random_state=43)
    classifier = BernoulliNB().fit(data_train,target_train)
    predicted = classifier.predict(data_test)
    evaluate_model(target_test,predicted)

# read more about model evaluation metrics here
# http://scikit-learn.org/stable/modules/model_evaluation.html
def evaluate_model(target_true,target_predicted):
    print(classification_report(target_true,target_predicted))
    print("The accuracy score is {:.2%}".format(accuracy_score(target_true,target_predicted)))

def main():
    data,target = load_file()
    tf_idf = preprocess()
    learn_model(tf_idf,target)


main()


UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 6223: character maps to <undefined>

In [None]:
#experiment

with open('dara 1_train.csv', newline='') as csvfile2:
    reader2 = csv.DictReader(csvfile2)
    
    data2,target2 = load_file()
                count_vectorizer = CountVectorizer(binary='true')
                data = count_vectorizer.fit_transform(data)
                tfidf_data = TfidfTransformer(use_idf=False).
                fit_transform(data)

                return tfidf_data

In [3]:
#experiment
exps = ['asd','asd_','_wqeqwe']
expsf = []
for ex in exps:
    expsf.append(ex.replace('_',''))
print(expsf)

['asd', 'asd', 'wqeqwe']


In [4]:
#experiment
lhap = [':)',':P',':D']
lsad = [':(',':|']
t1 = "i am happy :), but he is sad :|"
t2 = "i am sad :(, he is :P"
t1.replace(lhap,'smilehappy')
t1.replace(lsad,'smilesad')
t2.replace(lhap,'smilehappy')
t2.replace(lsad,'smilesad')
print(t1)
print(t2)

TypeError: replace() argument 1 must be str, not list

In [21]:
#experiment

happy_emoji = [':‑)',':)',':-]',':]',':-3',':3',':->',':>','8-)','8)',':-}', ':}',':o)',':c)',':^)','=]','=)',
                  ':‑D',':D','8‑D','8D','x‑D','xD','X‑D','XD','=D','=3','B^D',':-))']
                
sad_emoji = [':‑(',':(',':‑c',':c',':‑<',':<',':‑[',':[',':-||','>:[',':{',':@','>:(']

strr = "i am happy :)"
for hem in happy_emoji:
    if hem in strr:
        strr = strr.replace(hem, 'emojihappy')

for sem in sad_emoji:
    strr.replace(sem, 'emojisad')
    
print(strr)


i am happy emojihappy
