In [209]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import nltk
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from nltk.corpus import gutenberg, stopwords
from pandas.io.json import json_normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Sentiment analysis

In [210]:
yelp_reviews = pd.read_csv('yelp_labelled.txt', delimiter='\t')
amz_reviews = pd.read_csv('amazon_cells_labelled.txt', delimiter='\t')
imdb_reviews = pd.read_csv('imdb_labelled.txt', delimiter='\t')

#cleaning imdb reviews. There is extra white space at the end of every review.
imdb_reviews['review'] = imdb_reviews['review'].apply(lambda x: x.strip())

#classifying review websites
yelp_reviews['website'] = 0
amz_reviews['website'] = 1
imdb_reviews['website'] = 2

reviews_df = pd.concat([yelp_reviews, amz_reviews, imdb_reviews])
reviews_df = reviews_df.set_index(np.arange(len(reviews_df)))

In [211]:
reviews_df

Unnamed: 0,review,sentiment,website
0,Wow... Loved this place.,1,0
1,Crust is not good.,0,0
2,Not tasty and the texture was just nasty.,0,0
3,Stopped by during the late May bank holiday of...,1,0
4,The selection on the menu was great and so wer...,1,0
...,...,...,...
2743,I just got bored watching Jessice Lange take h...,0,2
2744,"Unfortunately, any virtue in this film's produ...",0,2
2745,"In a word, it is embarrassing.",0,2
2746,Exceptionally bad!,0,2


In [212]:
nlp = spacy.load('en')

In [213]:
reviews_doc = ''

for review in reviews_df['review']:
    reviews_doc += review + ' '

reviews_doc = reviews_doc.strip()

reviews_doc = nlp(reviews_doc)

In [214]:
def bow(text):
    all_words = [token.lemma_ for token in text if token.is_punct == False and token.is_stop == False]
    
    return [item[0] for item in Counter(all_words).most_common() if item[1] > 2]

def bop(text):
    all_words = [token.lemma_ for token in text if token.is_punct == True]
    
    return [item[0] for item in Counter(all_words).most_common() if item[1] > 1]

In [215]:
reviews_df['review'] = reviews_df['review'].apply(lambda x: nlp(x))

# Creating features using BoW plus other features

In [216]:
def features(reviews, tfidf=False):
    
    #common words and puncuation in all the reviews
    common_words = bow(reviews_doc)
    common_punct = bop(reviews_doc)
    
    df= pd.DataFrame()
    
    df['review'] = reviews['review']
    
    if tfidf == False:
        
        def words(review):

            words = [token.lemma_ for token in review
                     if token.is_punct == False
                     and token.is_stop == False
                     and token.lemma_ in common_words]
            return Counter(words)

        df['words'] = df['review'].apply(lambda x: words(x))
        
        word_data = list(df['words'])
        word_data = pd.DataFrame(json_normalize(word_data))
        def rename_cols(x):
            return '{}_word'.format(x)
        word_data = word_data.rename(lambda x: rename_cols(x), axis='columns')
    
    else:
    
        reviews_list = []

        for review in df['review']:
            reviews_list.append(review.text)

        vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                                     min_df=30, # only use words that appear at least three times
                                     stop_words='english', 
                                     lowercase=True, #convert everything to lower case
                                     use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                                     norm=u'l2', #Applies a correction factor so that longer reviews and shorter reviews 
                                                 #get treated equally
                                     smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that
                                                    #used every word once.Prevents divide-by-zero errors
                                    )

        #Applying the vectorizer
        reviews_tfidf = vectorizer.fit_transform(reviews_list)

        #Reshapes the vectorizer output into something people can read
        reviews_csr = reviews_tfidf.tocsr()

        #number of reviews
        n = reviews_csr.shape[0]
        #A list of dictionaries, one per paragraph
        tfidf_by_review = [{} for _ in range(0,n)]
        #List of features
        terms = vectorizer.get_feature_names()
        #for each paragraph, lists the feature words and their tf-idf scores
        for i, j in zip(*reviews_csr.nonzero()):
            tfidf_by_review[i][terms[j]] = reviews_csr[i, j]
            
        df['words'] = tfidf_by_review
        word_data = pd.DataFrame(json_normalize(tfidf_by_review))
        def rename_cols(x):
            return '{}_tfidf'.format(x)
        word_data = word_data.rename(lambda x: rename_cols(x), axis='columns')
    
    def pos(review):
        pos = [token.pos_ for token in review if token.is_punct == False and token.is_stop == False]
        return Counter(pos)
    
    df['pos'] = df['review'].apply(lambda x: pos(x))
    
    pos_data = list(df['pos'])
    pos_data = pd.DataFrame(json_normalize(pos_data))
    def rename_cols(x):
        return '{}_pos'.format(x)
    pos_data = pos_data.rename(lambda x: rename_cols(x), axis='columns')
    
    
    def dep(review):
        dep = [token.dep_ for token in review if token.is_punct == False and token.is_stop == False]
        return Counter(dep)
    
    df['dependencies'] = df['review'].apply(lambda x: dep(x))
    
    dep_data = list(df['dependencies'])
    dep_data = pd.DataFrame(json_normalize(dep_data))
    def rename_cols(x):
        return '{}_dep'.format(x)
    dep_data = dep_data.rename(lambda x: rename_cols(x), axis='columns')
    
    def punct(review):
        punct = [token.text for token in review if token.is_punct == True and token.lemma_ in common_punct]
        return Counter(punct)
    
    df['punct'] = df['review'].apply(lambda x: punct(x))
    
    punct_data = list(df['punct'])
    punct_data = pd.DataFrame(json_normalize(punct_data))
    def rename_cols(x):
        return '{}_punct'.format(x)
    punct_data = punct_data.rename(lambda x: rename_cols(x), axis='columns')
    
    df['review_length'] = df['review'].apply(lambda x: len(x))
    
    analyzer = SentimentIntensityAnalyzer()
    df['compound_sentiment_score'] = df['review'].apply(lambda x: analyzer.polarity_scores(x.text)['compound'])
    df['pos_sentiment_score'] = df['review'].apply(lambda x: analyzer.polarity_scores(x.text)['pos'])
    df['neg_sentiment_score'] = df['review'].apply(lambda x: analyzer.polarity_scores(x.text)['neg'])
    
    df = pd.concat([df, word_data, pos_data, dep_data, punct_data], axis=1)
    
    df['sentiment'] = reviews['sentiment']
    df['website'] = reviews['website']
    
    df.fillna(0, inplace=True)
    
    return df

In [217]:
model_features = features(reviews_df, tfidf=False)
model_features.head()

Unnamed: 0,review,words,pos,dependencies,punct,review_length,compound_sentiment_score,pos_sentiment_score,neg_sentiment_score,wow_word,...,%_punct,*_punct,?_punct,:)_punct,--_punct,'_punct,......_punct,#_punct,sentiment,website
0,"(Wow, ..., Loved, this, place, .)","{'wow': 1, 'love': 1, 'place': 1}","{'INTJ': 1, 'VERB': 1, 'NOUN': 1}","{'intj': 1, 'ROOT': 1, 'dobj': 1}","{'...': 1, '.': 1}",6,0.5994,0.565,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
1,"(Crust, is, not, good, .)",{'good': 1},"{'NOUN': 1, 'ADJ': 1}","{'nsubj': 1, 'acomp': 1}",{'.': 1},5,-0.3412,0.0,0.445,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,"(Not, tasty, and, the, texture, was, just, nas...","{'tasty': 1, 'texture': 1, 'nasty': 1}","{'ADJ': 2, 'NOUN': 1}","{'nsubj': 1, 'conj': 1, 'acomp': 1}",{'.': 1},9,-0.5574,0.0,0.34,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,"(Stopped, by, during, the, late, May, bank, ho...","{'stop': 1, 'late': 1, 'recommendation': 1, 'l...","{'VERB': 2, 'ADJ': 1, 'NOUN': 3, 'PROPN': 2}","{'ROOT': 1, 'amod': 1, 'compound': 3, 'pobj': ...",{'.': 1},16,0.6908,0.322,0.093,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
4,"(The, selection, on, the, menu, was, great, an...","{'selection': 1, 'menu': 1, 'great': 1, 'price...","{'NOUN': 3, 'ADJ': 1}","{'nsubj': 2, 'pobj': 1, 'acomp': 1}",{'.': 1},13,0.6249,0.272,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0


In [218]:
X = model_features.drop(['review', 'sentiment', 'words', 'pos', 'punct', 'dependencies', 'website'], axis=1)
Y = model_features['sentiment']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=0)

In [219]:
lr = LogisticRegression(penalty='l2')

lr.fit(X_train, Y_train)

print('train set score: {}'.format(lr.score(X_train, Y_train)))
print('test set score: {}'.format(lr.score(X_test, Y_test)))

train set score: 0.8971792538671519
test set score: 0.8363636363636363


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [220]:
nb = BernoulliNB()

nb.fit(X_train, Y_train)

print('train set score: {}'.format(nb.score(X_train, Y_train)))
print('test set score: {}'.format(nb.score(X_test, Y_test)))

train set score: 0.8685168334849863
test set score: 0.8290909090909091


# Creating features using tfidf plus other features

In [221]:
model_features = features(reviews_df, tfidf=True)
model_features.head()

Unnamed: 0,review,words,pos,dependencies,punct,review_length,compound_sentiment_score,pos_sentiment_score,neg_sentiment_score,place_tfidf,...,%_punct,*_punct,?_punct,:)_punct,--_punct,'_punct,......_punct,#_punct,sentiment,website
0,"(Wow, ..., Loved, this, place, .)",{'place': 1.0},"{'INTJ': 1, 'VERB': 1, 'NOUN': 1}","{'intj': 1, 'ROOT': 1, 'dobj': 1}","{'...': 1, '.': 1}",6,0.5994,0.565,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
1,"(Crust, is, not, good, .)",{'good': 1.0},"{'NOUN': 1, 'ADJ': 1}","{'nsubj': 1, 'acomp': 1}",{'.': 1},5,-0.3412,0.0,0.445,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,"(Not, tasty, and, the, texture, was, just, nas...",{'just': 1.0},"{'ADJ': 2, 'NOUN': 1}","{'nsubj': 1, 'conj': 1, 'acomp': 1}",{'.': 1},9,-0.5574,0.0,0.34,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,"(Stopped, by, during, the, late, May, bank, ho...",{},"{'VERB': 2, 'ADJ': 1, 'NOUN': 3, 'PROPN': 2}","{'ROOT': 1, 'amod': 1, 'compound': 3, 'pobj': ...",{'.': 1},16,0.6908,0.322,0.093,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
4,"(The, selection, on, the, menu, was, great, an...",{'great': 1.0},"{'NOUN': 3, 'ADJ': 1}","{'nsubj': 2, 'pobj': 1, 'acomp': 1}",{'.': 1},13,0.6249,0.272,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0


In [222]:
X = model_features.drop(['review', 'sentiment', 'words', 'pos', 'punct', 'dependencies', 'website'], axis=1)
Y = model_features['sentiment']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=0)

In [223]:
lr = LogisticRegression(penalty='l2')

lr.fit(X_train, Y_train)

print('train set score: {}'.format(lr.score(X_train, Y_train)))
print('test set score: {}'.format(lr.score(X_test, Y_test)))

train set score: 0.8466787989080983
test set score: 0.8327272727272728


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [224]:
nb = BernoulliNB()

nb.fit(X_train, Y_train)

print('train set score: {}'.format(nb.score(X_train, Y_train)))
print('test set score: {}'.format(nb.score(X_test, Y_test)))

train set score: 0.8303002729754322
test set score: 0.8272727272727273


In [225]:
lr = LogisticRegression()

lr.fit(X_train, Y_train)

print('train set score: {}'.format(lr.score(X_train, Y_train)))
print('test set score: {}'.format(lr.score(X_test, Y_test)))

train set score: 0.8466787989080983
test set score: 0.8327272727272728


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# baseline model

In [227]:
from sklearn.dummy import DummyClassifier

dc = DummyClassifier(strategy="stratified")

dc.fit(X_train, Y_train)

print('train set score: {}'.format(dc.score(X_train, Y_train)))
print('test set score: {}'.format(dc.score(X_test, Y_test)))

train set score: 0.5118289353958144
test set score: 0.49636363636363634
