In [1]:
from __future__ import division
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
import nltk
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import numpy as np
import pickle
from nltk.corpus import stopwords


def loadMovieReviews(path='IMDB Reviews.csv'):
    """Read the labelled review data set from a CSV file.

    :param path: location of the CSV file; defaults to the file name the
                 notebook has always used, so existing calls are unchanged.
    :return: a ``(reviews, sentiments)`` pair holding the ``'Reviews'`` and
             ``'Sentiments'`` columns of the file as pandas Series.
    :raises KeyError: if either column is missing from the file.
    """
    df = pd.read_csv(path)
    return df['Reviews'], df['Sentiments']

In [2]:
def analyze(clf, vectorizer):
    """Print the vocabulary terms with the largest, smallest and near-zero weights.

    :param clf: fitted linear model exposing ``coef_``; the weights are
                squeezed to one dimension, so this assumes a single weight
                vector (binary classification) -- TODO confirm for other uses.
    :param vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
                       or, on older scikit-learn, ``get_feature_names()``.
    :return: None; the three term lists are printed.
    """
    # scikit-learn renamed get_feature_names() to get_feature_names_out() and
    # current releases no longer ship the old name, so accept whichever exists.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = np.asarray(vectorizer.get_feature_names_out())
    else:
        names = np.asarray(vectorizer.get_feature_names())

    weights = clf.coef_.squeeze()
    w = np.argsort(weights)  # feature indices, most negative weight first

    # the most positive features have big positive values
    print('Most Positive sentiment words', names[w[-10:]])
    # the most negative features have big negative values
    print('Most Negative sentiment words', names[w[:10]])
    # the least useful words have weights closest to zero
    print('Most unusefull words', names[np.argsort(np.abs(weights))[:10]])

In [3]:
def model(revTrain, revTest, sentTrain, sentTest, ngram_range,
          clf_path='svm.pkl', vectorizer_path='vectorizer.pkl'):
    """Train a TF-IDF + linear SVM sentiment classifier, report on it and save it.

    The test accuracy and the most/least influential terms are printed, then
    the classifier and the vectorizer state are pickled.

    :param revTrain: review texts used to fit the vectorizer and the classifier.
    :param revTest: review texts used only for the accuracy score.
    :param sentTrain: labels matching ``revTrain``.
    :param sentTest: labels matching ``revTest``.
    :param ngram_range: ``(min_n, max_n)`` passed to ``TfidfVectorizer``;
                        ``(1, 1)`` is unigrams only, ``(1, 2)`` adds bigrams.
    :param clf_path: file the fitted ``LinearSVC`` is pickled to.
    :param vectorizer_path: file the ``(vocabulary_, idf_)`` tuple is pickled to.
    :return: None.
    """
    tokenizer = nltk.RegexpTokenizer(r'[a-zA-Z]{2,}')  # extract only words of two characters or more
    vectorizer = TfidfVectorizer(analyzer="word",
                                 # remove HTML tags and convert the text to lower case
                                 preprocessor=lambda w: BeautifulSoup(w, 'lxml').get_text().lower(),
                                 tokenizer=tokenizer.tokenize,
                                 stop_words=nltk.corpus.stopwords.words('english'),  # words to be removed
                                 lowercase=False,  # not needed, the preprocessor already lower-cases
                                 ngram_range=ngram_range,  # n-gram sizes chosen by the caller
                                 min_df=2,  # eliminate terms that appear in only one review
                                 # eliminate terms that appear in more than 80% of the reviews;
                                 # a float is read as a fraction of the documents, which matches
                                 # the previous int(80.0 * len(revTrain) / 100) document count
                                 max_df=0.8,
                                 )

    matTrain = vectorizer.fit_transform(revTrain)  # convert reviews to a matrix of numeric values
    matTest = vectorizer.transform(revTest)        # same transformation for the test reviews
    clf = LinearSVC()
    clf.fit(matTrain, sentTrain)                   # train the linear SVM
    print('Test accuracy {0}'.format(clf.score(matTest, sentTest)))
    analyze(clf, vectorizer)

    with open(clf_path, 'wb') as f:
        pickle.dump(clf, f)
    # Only the learned vocabulary and IDF weights are stored, not the vectorizer
    # object itself -- presumably because its lambda preprocessor cannot be pickled.
    with open(vectorizer_path, 'wb') as f:
        pickle.dump((vectorizer.vocabulary_, vectorizer.idf_), f)

In [5]:
# Load the labelled reviews, then hold out 15% of them for testing.
# random_state is fixed so the split is the same on every run.
reviews, sentiments = loadMovieReviews()
revTrain, revTest, sentTrain, sentTest = train_test_split(
    reviews,
    sentiments,
    test_size=0.15,
    random_state=0,
)

In [6]:
# Unigram run: single-word features only.
model(revTrain, revTest, sentTrain, sentTest, ngram_range=(1, 1))

Test accuracy 0.8981333333333333
Most Positive sentiment words ['enjoyable' 'hilarious' 'best' 'wonderful' 'funniest' 'refreshing'
 'amazing' 'perfect' 'great' 'excellent']
Most Negative sentiment words ['worst' 'waste' 'awful' 'disappointment' 'boring' 'fails' 'terrible'
 'poor' 'bad' 'disappointing']
Most unusefull words ['caspar' 'amelio' 'kep' 'tactically' 'reefs' 'digusting' 'kester' 'keusch'
 'reductivist' 'redlitch']


In [7]:
# Unigram + bigram run: ngram_range=(1, 2) keeps single words as well as word pairs.
# This call writes to the same pickle files as the previous run and so replaces them.
model(revTrain, revTest, sentTrain, sentTest, ngram_range=(1, 2))

Test accuracy 0.9142666666666667
Most Positive sentiment words ['favorite' 'today' 'best' 'hilarious' 'brilliant' 'wonderful' 'amazing'
 'perfect' 'great' 'excellent']
Most Negative sentiment words ['worst' 'awful' 'boring' 'bad' 'waste' 'terrible' 'poor' 'disappointment'
 'nothing' 'dull']
Most unusefull words ['huge responsibility' 'trappist monk' 'overall please'
 'given unfortunately' 'trappist' 'appearance cute' 'films hey' 'demonico'
 'crap said' 'unendingly']


In [11]:
import sys
# import sqlite3
# import time
# import ssl
# import urllib
# from urlparse import urljoin
# from urlparse import urlparse
import re
from datetime import datetime, timedelta
import string
import zlib
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from bs4 import BeautifulSoup
import nltk
import scipy.sparse as sp
import pandas as pd
from collections import OrderedDict
from datetime import date

# Read 'Black Panther.csv' (decoded as Latin-1) into a DataFrame.
# NOTE(review): `imdb` is not used by any of the visible cells -- confirm it is still needed.
imdb = pd.read_csv(
    'Black Panther.csv',
    encoding='latin-1',
)


class MyVectorizer(TfidfVectorizer):
    '''
    TfidfVectorizer subclass that lets previously learned IDF weights be
    installed without fitting the vectorizer again.
    '''
    def setIDF(self, idf):
        '''
        install IDF weights on this vectorizer instance
        :param idf: sequence of IDF weights, one per vocabulary term
        '''
        # The previous version assigned to TfidfVectorizer.idf_, i.e. on the base
        # class, which replaced the idf_ descriptor for every vectorizer in the
        # process. Set the weights on this instance only.
        try:
            # scikit-learn releases that give idf_ a setter handle this themselves
            self.idf_ = idf
        except AttributeError:
            # Releases where idf_ is a read-only property: build the diagonal IDF
            # matrix on this instance's own transformer instead.
            # NOTE(review): relies on the private _tfidf/_idf_diag attributes --
            # confirm against the installed scikit-learn version.
            import scipy.sparse as sp
            self._tfidf._idf_diag = sp.spdiags(idf,
                                               diags=0,
                                               m=len(idf),
                                               n=len(idf))

class SentimentAnalyzer:
    '''
    Sentiment predictor rebuilt from a pickled classifier and the pickled
    vocabulary/IDF weights of its TF-IDF vectorizer.
    '''
    def __init__(self, clf_path='svm.pkl', vectorizer_path='vectorizer.pkl'):
        '''
        load the classifier and rebuild the vectorizer
        :param clf_path: pickle file holding the trained classifier
        :param vectorizer_path: pickle file holding a (vocabulary, idf) tuple
        '''
        # SECURITY: pickle.load can execute arbitrary code contained in the file;
        # only point these paths at files you produced yourself.
        with open(clf_path, 'rb') as f:
            self.clf = pickle.load(f)

        tokenizer = nltk.RegexpTokenizer(r'[a-zA-Z]{2,}')  # extract only words of two characters or more
        self.vectorizer = MyVectorizer(analyzer="word",
                                 # remove HTML tags and convert the text to lower case
                                 preprocessor=lambda w: BeautifulSoup(w, 'lxml').get_text().lower(),
                                 tokenizer=tokenizer.tokenize,
                                 stop_words=nltk.corpus.stopwords.words('english'),  # words to be removed
                                 lowercase=False,  # not needed, the preprocessor already lower-cases
                                 # unigrams and bigrams; this has to agree with the
                                 # ngram_range the pickled vocabulary was built with
                                 ngram_range=(1, 2),
                                 )
        with open(vectorizer_path, 'rb') as f:
            voc, idf = pickle.load(f)

        # Restore the fitted state by hand instead of refitting.
        self.vectorizer.vocabulary_ = voc
        self.vectorizer.setIDF(idf)
        # NOTE(review): writes a private scikit-learn attribute so that transform()
        # finds the IDF weights -- confirm it is still read by the installed version.
        self.vectorizer._tfidf._idf_diag = sp.spdiags(idf,
                                                      diags=0,
                                                      m=len(idf),
                                                      n=len(idf))

    def analyzes(self, sentences):
        '''
        analyze more than one sentence
        :return: one predicted label per sentence (1=positive, 0=negative)
        :param sentences: array of sentences
        '''
        features = self.vectorizer.transform(sentences)
        return self.clf.predict(features)

    def analyze(self, sentence):
        '''
        analyze just one sentence
        :return: the predicted label (1=positive, 0=negative)
        :param sentence: a string
        '''
        return self.analyzes([sentence])[0]


In [13]:
# Ad-hoc check on a single sentence; the returned label is the cell's output.
SentimentAnalyzer().analyze(sentence='Sheldon too arrogant to be an actor')

0

In [14]:
# Ad-hoc check on a negated sentence; the returned label is the cell's output.
SentimentAnalyzer().analyze(sentence='Fares is not the best actor in the world')

1

In [15]:
# Ad-hoc check on a sentence with no explicit sentiment word; the returned label is the cell's output.
SentimentAnalyzer().analyze(sentence='I would rather spending my money on pizza rather than this')

0