In [1]:
import json
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn.cross_validation import train_test_split
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
import nltk

# read data

In [3]:
with open('data/Book_reviews50000.json') as data_file:  
    data = []
    for line in data_file:
        data.append(json.loads(line))
        

In [3]:
df = pd.DataFrame(data)
text = df['reviewText']
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

In [4]:
y = df['overall']
y = y.map(partition)

# make a balanced subsample 

In [7]:
indices = y[y == 'negative'].index
neg_indices = y[y == 'negative'].index
random_indices1 = np.random.choice(indices, 5000, replace=False)
random_indices = np.random.choice(neg_indices, 5000, replace=False)
index = list(random_indices) + list(random_indices1)
sub_sample = df.loc[index]

In [8]:
text = sub_sample['reviewText']
y = sub_sample['overall']
y = y.map(partition)

In [9]:
text = text.reset_index(drop=True)

# split train and test data

In [10]:
X_train, X_test, y_train, y_test = train_test_split(text, y, test_size=0.2, random_state=1)

# main code for sentiment analysis

In [11]:
from pprint import pprint
import nltk
import yaml
import sys
import os
import re

class Splitter(object):

    def __init__(self):
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        input format: a paragraph of text
        output format: a list of lists of words.
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        sentences = self.nltk_splitter.tokenize(text)
        tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
        return tokenized_sentences


class POSTagger(object):

    def __init__(self):
        pass
        
    def pos_tag(self, sentences):
        """
        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged tokens has a
        form, a lemma, and a list of tags
            e.g: [[('this', 'this', ['DT']), ('is', 'be', ['VB']), ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])],
                    [('this', 'this', ['DT']), ('is', 'be', ['VB']), ('another', 'another', ['DT']), ('one', 'one', ['CARD'])]]
        """

        pos = [nltk.pos_tag(sentence) for sentence in sentences]
        lemmer = WordNetLemmatizer()
        #adapt format
        pos = [[(word, lemmer.lemmatize(word), [postag]) for (word, postag) in sentence] for sentence in pos]
        return pos

class DictionaryTagger(object):

    def __init__(self):
        positive_dct = {}
        negative_dct = {}
        lemmer = WordNetLemmatizer()
        # lemmatizer words in 2 lexicons and put them in 2 dict
        positive = pd.read_csv('positive-words.txt',header= None)
        for word in positive[0]:
            a = lemmer.lemmatize(word)
            positive_dct[a] = ['positive']
        negative = pd.read_csv('negative-words.txt',header= None)
        for word in negative[0]:
            try:
                b = lemmer.lemmatize(word)
                negative_dct[b] = ['negative']
            except UnicodeDecodeError:
                negative_dct[word] = ['negative']
        
        dictionaries = [positive_dct,negative_dct]
        
        # put 2 dicts above into a self.dictionary, key= words, value = neg or pos
        self.dictionary = {}
        self.max_key_size = 0
        for curr_dict in dictionaries:
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    self.dictionary[key] = curr_dict[key]
                    self.max_key_size = max(self.max_key_size, len(key))
       

    def tag(self, postagged_sentences):
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=True):
        """
        the result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right
        """
        tag_sentence = []
        N = len(sentence)
        if self.max_key_size == 0:
            self.max_key_size = N
        i = 0
        while (i < N):
            j = min(i + self.max_key_size, N) #avoid overflow
            tagged = False
            while (j > i):
                expression_form = ' '.join([word[0] for word in sentence[i:j]]).lower()
                expression_lemma = ' '.join([word[1] for word in sentence[i:j]]).lower()
                if tag_with_lemmas:
                    literal = expression_lemma
                else:
                    literal = expression_form
                if literal in self.dictionary:
                    #self.logger.debug("found: %s" % literal)
                    is_single_token = j - i == 1
                    original_position = i
                    i = j
                    taggings = [tag for tag in self.dictionary[literal]]
                    tagged_expression = (expression_form, expression_lemma, taggings)
                    if is_single_token: #if the tagged literal is a single token, conserve its previous taggings:
                        original_token_tagging = sentence[original_position][2]
                        tagged_expression[2].extend(original_token_tagging)
                    tag_sentence.append(tagged_expression)
                    tagged = True
                else:
                    j = j - 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence


def value_of(sentiment):
    if sentiment == 'positive': return 1
    if sentiment == 'negative': return -1
    return 0

def sentiment_score(review):
    #Getting a score for a review
    a = []
    for sentence in review:
        for token in sentence:
            for tag in token[2]:
                a.append(value_of(tag))
    return sum(a)




# Pipeline in a function

In [21]:
def basic_sent(review):
    splitter = Splitter()
    postagger = POSTagger()
    dicttagger = DictionaryTagger()
    splitted_sentences = splitter.split(review)
    pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
    dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
    score = sentiment_score(dict_tagged_sentences)
    if score >= 0:
        result = 'positive'
    else:
        result = 'negative'
    
    return result

# prediction 

In [24]:
pred = text.apply(lambda x: basic_sent(x))

In [16]:
from sklearn.metrics import accuracy_score

In [17]:
accuracy_score(pred,y)

0.63920792079207922