Read dataset

In [29]:
import sys, os
import re
import pandas as pd
import itertools, nltk, string 
#from transforms import flatten_deeptree

rx_dict = {
    'title': re.compile(r'\[t\](?P<title>.*)'),
    'review': re.compile(r'(?P<aspect>.*)##(?P<review>.*)')
}

def parse_data(file, data, reviews=[], aspects=[]):
    line = file.readline();
    if(line):
        match_title = rx_dict['title'].search(line);
        if match_title:
            data['title'].append(match_title.group('title'))
            data['domain'].append('canon g3')
            if(len(reviews) > 0 or len(aspects) > 0):
                data['review'].append("".join(reviews))
                data['aspect'].append(", ".join(aspects))
                reviews = []
        
        match_review = rx_dict['review'].search(line)
        if match_review:
            review_text = match_review.group('review');
            aspect_text = match_review.group('aspect');
            
            if(review_text):
                reviews.append(review_text);
            
            if(aspect_text):
                aspects.append(aspect_text);
                
        parse_data(file, data, reviews, aspects)
    else:
        if(len(reviews) > 0 or len(aspects) > 0):
            data['review'].append("".join(reviews))
            data['aspect'].append(", ".join(aspects))
    
data = {
    'title': [],
    'review': [],
    'aspect': [],
    'domain': []
}
    
def read_file():
    with open(os.path.join(os.path.abspath('dataset/bing_liu/') , 'Canon_G3.txt'), 'r') as file:
        parse_data(file, data);
        
positive_lexicon = []
negative_lexicon = []

def read_lexicon():
    global positive_lexicon;
    global negative_lexicon;
    
    with open(os.path.join(os.path.abspath('opinion-lexicon-English/') , 'positive-words.txt'), 'r') as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
         
        positive_lexicon = file.readlines()
    
    with open(os.path.join(os.path.abspath('opinion-lexicon-English/') , 'negative-words.txt'), 'r', encoding = "ISO-8859-1") as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
        
        negative_lexicon = file.readlines()
        
    positive_lexicon = list(map(lambda word: word.rstrip("\n\r"), positive_lexicon))
    negative_lexicon = list(map(lambda word: word.rstrip("\n\r"), negative_lexicon))
    
        
read_file()
read_lexicon()

Print dataset with pandas

In [61]:
xdata = pd.DataFrame(data)
len(xdata)

45

Loading Stanford CoreNLP

In [31]:
from nltk.parse.corenlp import CoreNLPDependencyParser

parser = CoreNLPDependencyParser()
parse = next(parser.raw_parse("my name is khan"))

In [32]:
list(parse.triples())

[(('khan', 'JJ'), 'nsubj', ('name', 'NN')),
 (('name', 'NN'), 'nmod:poss', ('my', 'PRP$')),
 (('khan', 'JJ'), 'cop', ('is', 'VBZ'))]

In [33]:
print(parse.to_conll(4))

my	PRP$	2	nmod:poss
name	NN	4	nsubj
is	VBZ	4	cop
khan	JJ	0	ROOT



In [34]:
xdata.iloc[0]['review']

"i recently purchased the canon powershot g3 and am extremely satisfied with the purchase . the camera is very easy to use , in fact on a recent trip this past week i was asked to take a picture of a vacationing elderly group . after i took their picture with their camera , they offered to take a picture of us . i just told them , press halfway , wait for the box to turn green and press the rest of the way . they fired away and the picture turned out quite nicely . ( as all of my pictures have thusfar ) . a few of my work constituants owned the g2 and highly recommended the canon for picture quality . i 'm easily enlarging pictures to 8 1/2 x 11 with no visable loss in picture quality and not even using the best possible setting as yet ( super fine ) . ensure you get a larger flash , 128 or 256 , some are selling with the larger flash , 32mb will do in a pinch but you 'll quickly want a larger flash card as with any of the 4mp cameras . bottom line , well made camera , easy to use , ve

In [35]:
def extract_candidate_chunks(text, grammar=r'''NP: {<NN.*><JJ>?<IN>?<PRP.*>?<NN.*>} 
                                            ...AP: {<JJ.*><.*>?<VB.*>+}'''):
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents=[]
    for sent in nltk.sent_tokenize(text):
        tagged_sents.append(nltk.pos_tag(nltk.word_tokenize(sent)))
    
    #print(tagged_sents)
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda chunk: chunk[2] != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]

print(extract_candidate_chunks(xdata.iloc[0]['review'], r'''NP: {<NN.*><JJ.*>?<PRP.*>?<NN>+}'''))
print(extract_candidate_chunks(xdata.iloc[0]['review'], r'''AP: {<JJ.*><.*>?<VB.*>+}'''))

for c in extract_candidate_chunks(xdata.iloc[0]['review'], r'''NP: {<NN.*><JJ.*>?<PRP.*>?<NN>+}'''):
        parse = next(parser.raw_parse(c))
        print(list(parse.triples()))

['canon powershot g3', 'week i', 'press halfway', 'picture quality', 'picture quality', 'lense /', 'job canon']
['easy to use', 'easy to use']
[(('g3', 'NN'), 'compound', ('canon', 'NN')), (('g3', 'NN'), 'compound', ('powershot', 'NN'))]
[(('i', 'FW'), 'compound', ('week', 'NN'))]
[(('press', 'NN'), 'advmod', ('halfway', 'RB'))]
[(('quality', 'NN'), 'compound', ('picture', 'NN'))]
[(('quality', 'NN'), 'compound', ('picture', 'NN'))]
[(('lense', 'NN'), 'punct', ('/', ':'))]
[(('canon', 'NN'), 'compound', ('job', 'NN'))]


In [36]:
import os
os.environ["CORENLP_HOME"] = r'C:\stanford-corenlp-full-2018-10-05'

import corenlp
client = corenlp.CoreNLPClient()

def chunk_check(text, word):
    try:
        pattern = '{tag:/NN.*/} <compound {word:'+ word +';tag:/NN.*/}'
        matches = client.semgrex(text, pattern)
        res = matches['sentences']
        if len(res) == 1:
            if res[0]['length'] == 0:
                return word
        return res[0]['0']['text'] + ' ' + word
    except:
        return word

In [37]:
from pycorenlp import StanfordCoreNLP
import json

nlp = StanfordCoreNLP('http://localhost:9000')
dependency_parser = nlp.annotate

def entity_check(sentence):
    res = sentence
    result = dependency_parser(sentence, properties={"outputFormat": "json", "annotators": "openie"})
    if len(result['sentences'][0]['openie']) != 0:
        s = result['sentences'][0]['openie'][0]['subject']
        o = result['sentences'][0]['openie'][0]['object']
        res = res.replace(s, '!'.join(s.split(' ')))
        res = res.replace(o, '!'.join(o.split(' ')))
    return res

entity_check('i recently purchased the canon powershot g3 and am extremely satisfied with the purchase .')

'i recently purchased the canon!powershot!g3 and am extremely satisfied with the purchase .'

## Double Propagation

### Rule 1.1 if a word A, whose POS is NN, is depended by an opinion word O through Dep, where Dep is one of the dependency relations amod, prep, nsubj, csubj, xsubj, dobj, and iobj, then A is an aspect.

### Rule 1.2 if an opinion word O and a word A, whose POS is NN, depend on a third word H through dependency relations Depi and Depj respectively, where Depi and Dep j are one of the relations amod, prep, nsubj, csubj, xsubj, dobj, and iobj, then A is an aspect.

### Rule 3.1 if a word Aj, whose POS is NN, directly depends on an aspect Ai through conj, then A j is an aspect

### Rule 3.2 if a word Aj, whose POS is NN, and an aspect Ai, directly depend on a third word H through the dependency relations De pi and De p j , where De pi and Depj are one of the relations in amod, prep, nsubj, csubj, xsubj, dobj, and conj, then A j is an aspect


### Rule 4.1 if a word Oj, whose POS is JJ, directly depends on an opinion word Oi through conj, then O j is an opinion word. 

### Rule 4.2 if a word Oj, whose POS is JJ, and an opinion word Oi, directly depend on a third word H through the dependance relations De pi and De p j , where Dep1 and Dep2 are one of the relations in amod, prep, nsubj, csubj, xsubj, dobj, and conj, then O j is an opinion word

In [38]:
# adding new dependencies for DP 'nmod' and'advmod'
dep_DP = ['amod', 'prep', 'nsubj', 'csubj', 'xsubj', 'dobj', 'iobj', 'advmod', 'nmod']
conj_DP = ['conj']

In [80]:
candidate_aspect = []
new_opinion = []
op_set = positive_lexicon + negative_lexicon

flag_a = 0
index = 0
# for text in xdata['review']:
chunking_noun = extract_candidate_chunks(text, r'NP: {<NN.*><JJ>?<.PRP*>?<NN.*>}')#{<NN.*|JJ>?<IN>?<PRP.*>?<NN.*>}""" 
chunking_adj = extract_candidate_chunks(text, r'AP: {<JJ.*><.*>?<VB.*>+}')

for r in df['review']:
    if index == 597:
        break
    new_chunking = []
    for c in chunking_noun:
        parse = next(parser.raw_parse(c))
        for (w1, dep, w2) in list(parse.triples()):
            if(dep == 'compound'):
                new_chunking.append(c)

    for c in chunking_adj:
        new_chunking.append(c)

    for chunk in new_chunking:
        r = r.replace(chunk, '!'.join(chunk.split(' ')))

    temp_index = index
#     r = entity_check(r)
    
    
    print(index)
    if model_svm.predict([features[index]])[0] == 0:
        index += 1
        continue
    else:
        print(r)
        index += 1


    parse = next(parser.raw_parse(r))
    #Rule 1.1
    for (w1, dep, w2) in list(parse.triples()):
        if(dep in dep_DP):
            # Rule 1.1
            if(w1[0] in op_set):
                if w2[1] == 'NN':
                    candidate_aspect.append(w2[0])
#                         candidate_aspect.append(chunk_check(r, w2[0]))
            elif(w2[0] in op_set):          
                if w1[1] == 'NN':
                    candidate_aspect.append(w1[0])
#                         candidate_aspect.append(chunk_check(r, w1[0]))


    # Rule 1.2
    for (w1, dep, w2) in parse.triples():
        if(dep in dep_DP):
            H = ''
            O = ''
            if w1[0] in op_set:
                H = w2[0]
                O = w1
            elif w2[0] in op_set:
                H = w1[0]
                O = w2

            if H:
                for (w1, dep, w2) in list(parse.triples()):
                    if w1[0] == H and w2[0] != O[0]:
                        if w2[1] == 'NN':
                            candidate_aspect.append(w2[0])
#                                 candidate_aspect.append(chunk_check(r, w2[0]))
                    elif w2[0] == H  and w1[0] != O[0]:
                        if w1[1] == 'NN':
                            candidate_aspect.append(w1[0])
#                                 candidate_aspect.append(chunk_check(r, w1[0]))


    # Rule 4.1
    for (w1, dep, w2) in list(parse.triples()):
        if(dep in conj_DP):
            if w1[0] in op_set:
                if w2[1] == 'JJ':
                    new_opinion.append((w2, w1))
                    op_set.append(w2[0])

            elif w2[0] in op_set:
                if w1[1] == 'JJ':
                    new_opinion.append((w1, w2))
                    op_set.append(w1[0])


    # Rule 4.2
    for (w1, dep, w2) in parse.triples():
        if(dep in dep_DP or dep in conj_DP):
            H = ''
            O = ''
            if w1[0] in op_set:
                H = w2[0]
                O = w1
            elif w2[0] in op_set:
                H = w1[0]
                O = w2

            if H:
                for (w1, dep, w2) in list(parse.triples()):
                    if w1[0] == H and w2[0] != O[0]:
                        if w2[1] == 'JJ':                  
                            new_opinion.append((w2, w1))
                            op_set.append(w2[0])
                    elif w2[0] == H  and w1[0] != O[0]:
                        if w1[1] == 'JJ':
                            new_opinion.append((w2, w1))
                            op_set.append(w1[0])


index = 0
for r in df['review']:
    if index == 597:
        break

    
    new_chunking = []
    for c in chunking_noun:
        parse = next(parser.raw_parse(c))
        for (w1, dep, w2) in list(parse.triples()):
            if(dep == 'compound'):
                new_chunking.append(c)

    for c in chunking_adj:
        new_chunking.append(c)

    for chunk in new_chunking:
        r = r.replace(chunk, '!'.join(chunk.split(' ')))

    temp_index = index
#     r = entity_check(r)
    

    if model_svm.predict([features[index]])[0] == 0:
        index += 1
        continue
    else:
        print(r)
        index += 1


    parse = next(parser.raw_parse(r))
    #Rule 3.1
    for (w1, dep, w2) in list(parse.triples()):
        if(dep in conj_DP):
            if(w1[0] in candidate_aspect): 
                if w2[1] == 'NN':
                    candidate_aspect.append(w2[0])
#                         candidate_aspect.append(chunk_check(r, w2[0]))
            elif(w2[0] in candidate_aspect):          
                if w1[1] == 'NN':
                    candidate_aspect.append(w1[0])
#                         candidate_aspect.append(chunk_check(r, w1[0]))


    # Rule 3.2
    for (w1, dep, w2) in parse.triples():
        if(dep in dep_DP or dep in conj_DP):
            H = ''
            O = ''
            if w1[0] in candidate_aspect:
                H = w2[0]
                O = w1
            elif w2[0] in candidate_aspect:
                H = w1[0]
                O = w2

            if H:
                for (w1, dep, w2) in list(parse.triples()):
                    if w1[0] == H and w2[0] != O[0]:
                        if w2[1] == 'NN':
                            candidate_aspect.append(w2[0])  
#                                 candidate_aspect.append(chunk_check(r, w2[0]))
                    elif w2[0] == H  and w1[0] != O[0]:
                        if w1[1] == 'NN':
                            candidate_aspect.append(w1[0])
#                                 candidate_aspect.append(chunk_check(r, w1[0]))


    # Rule 4.1
    for (w1, dep, w2) in list(parse.triples()):
        if(dep in conj_DP):
            if w1[0] in op_set:
                if w2[1] == 'JJ':
                    new_opinion.append((w2, w1))
                    op_set.append(w2[0])

            elif w2[0] in op_set:
                if w1[1] == 'JJ':
                    new_opinion.append((w1, w2))
                    op_set.append(w1[0])


    # Rule 4.2
    for (w1, dep, w2) in parse.triples():
        if(dep in dep_DP or dep in conj_DP):
            H = ''
            O = ''
            if w1[0] in op_set:
                H = w2[0]
                O = w1
            elif w2[0] in op_set:
                H = w1[0]
                O = w2

            if H:
                for (w1, dep, w2) in list(parse.triples()):
                    if w1[0] == H and w2[0] != O[0]:
                        if w2[1] == 'JJ':                  
                            new_opinion.append((w2, w1))
                            op_set.append(w2[0])
                    elif w2[0] == H  and w1[0] != O[0]:
                        if w1[1] == 'JJ':
                            new_opinion.append((w2, w1))
                            op_set.append(w1[0])
flag_a += 1

0
1
the camera is very easy to use , in fact on a recent trip this past week i was asked to take a picture of a vacationing elderly group . 
2
3
4
they fired away and the picture turned out quite nicely . ( as all of my pictures have thusfar ) . 
5
a few of my work constituants owned the g2 and highly recommended the canon for picture quality . 
6
i 'm easily enlarging pictures to 8 1/2 x 11 with no visable loss in picture quality and not even using the best possible setting as yet ( super fine ) . 
7
8
bottom line , well made camera , easy to use , very flexible and powerful features to include the ability to use external flash and lense / filters choices . 
9
i 'd highly recommend this camera for anyone who is looking for excellent quality pictures and a combination of ease of use and the flexibility to get advanced with many options to adjust if you like . 
10
11
12
13
14
just a little overview , powershot g3 is the flagship of canon 's powershot series and its an slr-like camera , 

140
141
142
i love this camera . 
143
144
took hundreds of pictures and they were great . 
145
great colors , pictures and white balance . 
146
has 4x optical zoom which is higher than any other in the same price range . 
147
148
maybe it is my lack of experience , but i found shots with this camera very disappointing . 
149
150
sure it had all the features , but when i tried to shoot a girl 's basketball game it just wan't up to the task . 
151
152
153
154
i found that low light situations combined with any sort of action left this camera in the dust . 
155
156
157
158
the grain was terrible . 
159
160
well flash photos are never great , and there was still a lot of noise . 
161
when i took outdoor photos with plenty of light and the objects were n't moving , the photos were awesome . 
162
163
164
165
166
167
168
169
it is versatile and seems to be one of the best . 
170
171
172
173
174
175
176
177
178
179
the two limitations i knew i would have to deal with are the very long lag time

366
367
this camera is worth every penny , and i highly recommend it ! 
368
another point of note : the battery life is incredible ! 
369
370
371
372
373
374
am i ever glad that i decided on this camera ! 
375
i 've only had it a week , but so far , everything about this camera is making me happy . 
376
forget the fact that it 'll probably take me a year to figure out all the features this camera has to offer . 
377
378
very intuitive menus are a big plus on this camera . 
379
380
the standard battery include with the g3 is a camcorder battery that will allow me to take pictures all day without worrying about charging . 
381
382
the optical zoom is awesome , and the viewfinder reflects the settings of the zoom . 
383
one little minor flaw with this camera is that the lens is visible in the viewfinder , but does not detract from the image at all . 
384
385
386
the best 4 megapixel/4x optical zoom camera available and fairly easy to use . 
387
highly recommended . 
388
389
390
391
becaus

587
simply , the canon g3 is the best digital camera out there today at this price point . 
588
589
590
the battery life of this camera is twice that of the nikon 5000 and is better than anything else i 've seen . 
591
the only minor nits i have with the camera are it is fairly boxy looking , it needs a wrist strap instead of a neck strap , and it is a bit slow between shots . 
592
even with these shortcomings , i still think it is the best digital camera available under $ 1200 . 
593
definetely a great camera . 
594
595
feels solid in hand . 
596
rather heavy for point and shoot but a great camera for semi pros . 
the camera is very easy to use , in fact on a recent trip this past week i was asked to take a picture of a vacationing elderly group . 
they fired away and the picture turned out quite nicely . ( as all of my pictures have thusfar ) . 
a few of my work constituants owned the g2 and highly recommended the canon for picture quality . 
i 'm easily enlarging pictures to 8 1/2 x

maybe it is my lack of experience , but i found shots with this camera very disappointing . 
sure it had all the features , but when i tried to shoot a girl 's basketball game it just wan't up to the task . 
i found that low light situations combined with any sort of action left this camera in the dust . 
the grain was terrible . 
well flash photos are never great , and there was still a lot of noise . 
when i took outdoor photos with plenty of light and the objects were n't moving , the photos were awesome . 
it is versatile and seems to be one of the best . 
the two limitations i knew i would have to deal with are the very long lag time before it focus-locks ( even in bright light ) , and the unsatisfactory light quality of the built-in flash . 
what makes the focus-lock lag time something i can live with is the extreme depth of field obtainable with these point and shoot digicams at f8 . 
relative to what i can easily achieve with external flash on my nikon slr , the internal flash 

one little minor flaw with this camera is that the lens is visible in the viewfinder , but does not detract from the image at all . 
the best 4 megapixel/4x optical zoom camera available and fairly easy to use . 
highly recommended . 
because this camera is beautiful . 
the g3 looks like a work of art ! 
its silver magnesium finish is stunning , and the sharp lines and excellent grip are better than any other camera i 've seen . 
i am very pleased with it so far . 
it is not perfect though . 
you can see the lens barrel in the view-finder . 
( i knew this before hand , and it is not that bad ) there is no tiff format . 
it seems to me that after the focus and metering are complete there is quite a lag before the shutter ? 
it is very simple to import via iphoto 2 and then move them to photoshop . 
i have not spent much time with the included software , so i don't know what to say about it other than it seems ok . 
it comes with a clearly written manual and the learning curve is not too

In [84]:
def calculate_precision_recall(aspect, target):
    tp = 0
#     for a in aspect: 
#         if a in target:
#             tp += 1
    for t in target: 
        for a in aspect:
            zz = t.split(' ')
            vv =0
            for z in zz:
                if z in a:
                    vv+=1
            if len(zz) == vv:
                tp+=1
                break
            
    P = (tp * 1.0) / (len(aspect) * 1.0)
    R = (tp * 1.0) / (len(target) * 1.0)
#     print(P, len(aspect))
#     print(R, len(target))
    
    f1 = 2.0 * P * R / (P+R)
    
    return P, R, f1

In [85]:
import re

target = []
for t in xdata['aspect']:
    for s in t.split(', '):
        for x in s.split(','):
            jj = re.sub(r'\[[+|-]\d\]', '',x)
            jjs = re.sub(r'\[\w\]', '',jj)
            if(jjs):
                target.append(jjs);
candidate_aspect = list(map(lambda aspect: ' '.join(aspect.split('!')), candidate_aspect))
# candidate_aspect = list(map(lambda aspect: aspect), candidate_aspect))

In [86]:
calculate_precision_recall(list(dict.fromkeys(candidate_aspect)), list(dict.fromkeys(target)))

(0.2102803738317757, 0.42452830188679247, 0.28125)

In [87]:
calculate_precision_recall(candidate_aspect, target)

(6.689149560117302, 0.6567808810826374, 1.1961195595175667)

In [70]:
for a in list(dict.fromkeys(candidate_aspect)):
    if len(a.split(" ")) > 1:
#     if a == 'camera':
        print(a)
#     print(a)

little display panel
at least
my canon
camera good
wish canon
lcd screen
many people
my g3
tae kwon
round deal
camera experience
better strap
full charge
new camera
canon g3
my wife
leica minilux
connector cord
second shutter sync
good month
many digital camera review sites out there
her main focus
ability as
my advice
design flaw
different type
required extension
proven canon


In [20]:
# for a in list(dict.fromkeys(target)):
#     if len(a.split(' ')) > 1:
#         print(a)

In [18]:
# print(list(dict.fromkeys(candidate_aspect)))

In [19]:
# t = 'i recently purchased the canon powershot g3 and am extremely satisfied with the purchase'
# s = 'i'
# o = 'canon powershot g3'

# print(t.replace(o, '-'.join(o.split(' '))))

In [23]:
for a in xdata['aspect'][0].split(', '):
    print(re.sub(r'\[[+|-]\d\]', '',a))
    
# print(xdata['aspect'][0])

canon powershot g3
use
picture
picture quality
picture quality
camera
use
feature
picture quality
use
option


In [5]:
import re
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import wordnet as wn
# from IPython.display import clear_outputfrom 
from nltk.stem import PorterStemmer
porter = PorterStemmer()
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def preprocessing(semua_kalimat):
    i=0
    kalimat_semua = []
    panjang = len(semua_kalimat) - 1
    for sentence in semua_kalimat:
        kalimat = []
        for word in nltk.word_tokenize(sentence):
            kata = wordnet_lemmatizer.lemmatize(word)
            if re.match(r'^[0-9]+$', kata) != None:
                kata = 'Num'
            kalimat.append(kata)
        kalimat_semua.append(' '.join(kalimat))
#         print(sentence)        
#         clear_output(wait=True)
#         print((i/panjang)*100,"%")
        i+=1
    return kalimat_semua

# input file
df = pd.read_csv("datasettrain.csv")
# preprocess
semkal = preprocessing(df['review'])
labels = df["target"]

In [6]:
# settings tf-idf
tfidf = TfidfVectorizer(sublinear_tf=False, analyzer='word', ngram_range=(1,2))

# tf-idf
features = tfidf.fit_transform(semkal).toarray()
features_name = tfidf.vocabulary_
print(len(features_name))

33503


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.30, random_state=42)

''' The important part '''
# ''' SVM classifier ''' 
# model
model_svm = LinearSVC()
# training
model_svm.fit(X_train, y_train)
# predict / testing
pred = model_svm.predict(X_test)

from sklearn.metrics import precision_score
from sklearn.metrics import classification_report

print(precision_score(y_test, pred, average='micro'))
print(classification_report(y_test, pred))

0.7364864864864865
              precision    recall  f1-score   support

           0       0.75      0.81      0.78       670
           1       0.72      0.64      0.68       514

   micro avg       0.74      0.74      0.74      1184
   macro avg       0.73      0.73      0.73      1184
weighted avg       0.74      0.74      0.73      1184



In [27]:
print(model_svm.predict([features[0]])[0])

0


In [17]:
semkal[0]

'i recently purchased the canon powershot g3 and am extremely satisfied with the purchase .'

In [63]:
len(df['review'])

3944

In [75]:
df['review'][596]

'rather heavy for point and shoot but a great camera for semi pros . '