In [1]:
import pandas as pd
import codecs
from string import punctuation
from sklearn.naive_bayes import BernoulliNB
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_colwidth', -1)

# Sentiment analysis

**U2 L2 P7 - Emile Badran**

Perform a sentiment analysis, classifying whether feedback left on a website is either positive or negative.

The [dataset of sentiment labelled sentences](https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences) that is used here was created for the paper [From Group to Individual Labels using Deep Features, Kotzias et. al., KDD 2015](http://mdenil.com/media/papers/2015-deep-multi-instance-learning.pdf).

To increase sentiment analysis precision, a lexicon with 6687 positive and negative words from the University of Illinois at Chicago's College of Engineering was used. The lexicon files [are available online](http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html) and were created for the papers:

>   Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." 
>       Proceedings of the ACM SIGKDD International Conference on Knowledge 
>       Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, 
>       Washington, USA, 

>   Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing 
>       and Comparing Opinions on the Web." Proceedings of the 14th 
>       International World Wide Web conference (WWW-2005), May 10-14, 
>       2005, Chiba, Japan.

In [2]:
# Read the imdb sentiment labelled data set:
df = pd.read_table('sentiment_labelled/imdb_labelled.txt', encoding='UTF-8', header=None, quoting=3)
df.columns = ['review','target']

# Score is either 1 (for positive) or 0 (for negative).
# Convert the target column values to boolean objects:
df['target'] = (df['target'] == 1)

# Convert all words in the reviews column to lower case:
df.review = df.review.str.lower()

# declare a function that will strip all punctuation marks from reviews:
def f_punct(s):
    return ''.join(c for c in s if c not in punctuation)

# declare a function that will create a column with the set of words from each review:
def f_set(string):
    set(string.split())

# strip all puctuation marks:
df['review'] = df['review'].apply(lambda x: f_punct(x))

# generate sets from reviews:
df['sets'] = df['review'].apply(lambda x: set(x.split()))

# reorder dataframe columns:
df = df[['review', 'sets', 'target']]

df.head(n=3)

Unnamed: 0,review,sets,target
0,a very very very slowmoving aimless movie about a distressed drifting young man,"{distressed, aimless, movie, about, drifting, a, very, young, man, slowmoving}",False
1,not sure who was more lost the flat characters or the audience nearly half of whom walked out,"{flat, walked, sure, out, not, audience, half, was, of, more, the, lost, characters, whom, or, who, nearly}",False
2,attempting artiness with black white and clever camera angles the movie disappointed became even more ridiculous as the acting was poor and the plot and lines almost nonexistent,"{white, and, attempting, acting, black, artiness, nonexistent, clever, camera, became, with, was, the, plot, movie, poor, angles, ridiculous, almost, as, disappointed, even, lines, more}",False


In [3]:
# load positive lexicon with utf-8 encoding:
f = codecs.open('sentiment_labelled/positive-words.txt', encoding='utf-8')
positive_vocab = f.read().splitlines()

# load negative lexicon with utf-8 encoding:
g = codecs.open('sentiment_labelled/negative-words.txt', encoding='utf-8')
negative_vocab = g.read().splitlines()

# create a list of words to easily test the model:
words = ['very','a','lost','whom','camera','ridiculous']

# declare a function that returns true when a word is in a set:
def in_set(x, word):
    for i in x:
        if i == word:
            return True

# iterate the "words" list to test the model: create a column for every word
# in the "words" list, then apply the in_set function for every cell in 
# the "sets" column. 
for word in words:
    df[word] = df['sets'].apply(lambda x: in_set(x, word))

# iterate and create columns for every word in the positive and negative lexicons;
# apply the in_set function to every cell and return true when a word is in the set.
for word in negative_vocab:
    df[word] = df['sets'].apply(lambda x: in_set(x, word))

for word in positive_vocab:
    df[word] = df['sets'].apply(lambda x: in_set(x, word))

# fill none (empty) cells with "False"
df = df.fillna(False)

df.head(n=3)

Unnamed: 0,review,sets,target,very,a,lost,whom,camera,ridiculous,2-faced,...,wow,wowed,wowing,wows,yay,youthful,zeal,zenith,zest,zippy
0,a very very very slowmoving aimless movie about a distressed drifting young man,"{distressed, aimless, movie, about, drifting, a, very, young, man, slowmoving}",False,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,not sure who was more lost the flat characters or the audience nearly half of whom walked out,"{flat, walked, sure, out, not, audience, half, was, of, more, the, lost, characters, whom, or, who, nearly}",False,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,attempting artiness with black white and clever camera angles the movie disappointed became even more ridiculous as the acting was poor and the plot and lines almost nonexistent,"{white, and, attempting, acting, black, artiness, nonexistent, clever, camera, became, with, was, the, plot, movie, poor, angles, ridiculous, almost, as, disappointed, even, lines, more}",False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False


In [4]:
# create a dataframe with only the boolean columns, and a variable with the target column:
data = df.iloc[:,2:]
target = df.target

# Instantiate the model and store it in a new variable.
bnb = BernoulliNB()

# Fit the model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display the results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 0


# Testing the model on one of the other datasets

In [5]:
# Read the Yelp sentiment labelled data set:
df2 = pd.read_table('sentiment_labelled/yelp_labelled.txt', encoding='UTF-8', header=None, quoting=3)

df2.columns = ['review','target']
df2['target'] = (df2['target'] == 1)
df2.review = df2.review.str.lower()

df2['review'] = df2['review'].apply(lambda x: f_punct(x))
df2['sets'] = df2['review'].apply(lambda x: set(x.split()))

df2 = df2[['review', 'sets', 'target']]

for word in negative_vocab:
    df2[word] = df2['sets'].apply(lambda x: in_set(x, word))

for word in positive_vocab:
    df2[word] = df2['sets'].apply(lambda x: in_set(x, word))

df2 = df2.fillna(False)

df2.head(n=3)

Unnamed: 0,review,sets,target,2-faced,2-faces,abnormal,abolish,abominable,abominably,abominate,...,wow,wowed,wowing,wows,yay,youthful,zeal,zenith,zest,zippy
0,wow loved this place,"{place, loved, wow, this}",True,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1,crust is not good,"{crust, good, is, not}",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,not tasty and the texture was just nasty,"{just, and, not, was, nasty, the, tasty, texture}",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
data2 = df2.iloc[:,2:]
target2 = df2.target

# Instantiate the model and store it in a new variable.
bnb2 = BernoulliNB()

# Fit the model to the data.
bnb2.fit(data2, target2)

# Classify, storing the result in a new variable.
y_pred2 = bnb2.predict(data2)

# Display the results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data2.shape[0],
    (target2 != y_pred2).sum()
))

Number of mislabeled points out of a total 1000 points : 0
