In [1]:
import pandas as pd
import codecs
from string import punctuation
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_colwidth', -1)
import warnings
warnings.filterwarnings('ignore')

# Evaluating Sentiment analysis

**U2 L3 P4 - Emile Badran**

Perform a sentiment analysis, classifying whether feedback left on a website is either positive or negative.

The [dataset of sentiment labelled sentences](https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences) that is used here was created for the paper [From Group to Individual Labels using Deep Features, Kotzias et. al., KDD 2015](http://mdenil.com/media/papers/2015-deep-multi-instance-learning.pdf).

To increase sentiment analysis precision, a lexicon with 6687 positive and negative words from the University of Illinois at Chicago's College of Engineering was used. The lexicon files [are available online](http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html) and were created for the papers:

>   Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." 
>       Proceedings of the ACM SIGKDD International Conference on Knowledge 
>       Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, 
>       Washington, USA, 

>   Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing 
>       and Comparing Opinions on the Web." Proceedings of the 14th 
>       International World Wide Web conference (WWW-2005), May 10-14, 
>       2005, Chiba, Japan.

In [2]:
# Read the imdb sentiment labelled data set:
df = pd.read_table('sentiment_labelled/imdb_labelled.txt', encoding='UTF-8', header=None, quoting=3)
df.columns = ['review','target']

# Score is either 1 (for positive) or 0 (for negative).
# Convert the target column values to boolean objects:
df['target'] = (df['target'] == 1)

# Convert all words in the reviews column to lower case:
df.review = df.review.str.lower()

# declare a function that will strip all punctuation marks from reviews:
def f_punct(s):
    return ''.join(c for c in s if c not in punctuation)

# declare a function that will create a column with the set of words from each review:
def f_set(string):
    set(string.split())

# strip all puctuation marks:
df['review'] = df['review'].apply(lambda x: f_punct(x))

# generate sets from reviews:
df['sets'] = df['review'].apply(lambda x: set(x.split()))

# reorder dataframe columns:
df = df[['review', 'sets', 'target']]

df.head(n=3)

Unnamed: 0,review,sets,target
0,a very very very slowmoving aimless movie about a distressed drifting young man,"{about, distressed, drifting, very, movie, man, slowmoving, a, young, aimless}",False
1,not sure who was more lost the flat characters or the audience nearly half of whom walked out,"{not, who, the, flat, lost, of, or, half, more, audience, whom, nearly, sure, out, characters, walked, was}",False
2,attempting artiness with black white and clever camera angles the movie disappointed became even more ridiculous as the acting was poor and the plot and lines almost nonexistent,"{black, with, almost, angles, artiness, became, camera, the, movie, more, attempting, as, white, plot, and, poor, disappointed, lines, nonexistent, ridiculous, clever, acting, even, was}",False


In [3]:
# load positive lexicon with utf-8 encoding:
f = codecs.open('sentiment_labelled/positive-words.txt', encoding='utf-8')
positive_vocab = f.read().splitlines()

# load negative lexicon with utf-8 encoding:
g = codecs.open('sentiment_labelled/negative-words.txt', encoding='utf-8')
negative_vocab = g.read().splitlines()

# create a list of words to easily test the model:
words = ['very','a','lost','whom','camera','ridiculous']

# declare a function that returns true when a word is in a set:
def in_set(x, word):
    for i in x:
        if i == word:
            return True

# iterate the "words" list to test the model: create a column for every word
# in the "words" list, then apply the in_set function for every cell in 
# the "sets" column. 
for word in words:
    df[word] = df['sets'].apply(lambda x: in_set(x, word))

# iterate and create columns for every word in the positive and negative lexicons;
# apply the in_set function to every cell and return true when a word is in the set.
for word in negative_vocab:
    df[word] = df['sets'].apply(lambda x: in_set(x, word))

for word in positive_vocab:
    df[word] = df['sets'].apply(lambda x: in_set(x, word))

# fill none (empty) cells with "False"
df = df.fillna(False)

df.head(n=3)

Unnamed: 0,review,sets,target,very,a,lost,whom,camera,ridiculous,2-faced,...,wow,wowed,wowing,wows,yay,youthful,zeal,zenith,zest,zippy
0,a very very very slowmoving aimless movie about a distressed drifting young man,"{about, distressed, drifting, very, movie, man, slowmoving, a, young, aimless}",False,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,not sure who was more lost the flat characters or the audience nearly half of whom walked out,"{not, who, the, flat, lost, of, or, half, more, audience, whom, nearly, sure, out, characters, walked, was}",False,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,attempting artiness with black white and clever camera angles the movie disappointed became even more ridiculous as the acting was poor and the plot and lines almost nonexistent,"{black, with, almost, angles, artiness, became, camera, the, movie, more, attempting, as, white, plot, and, poor, disappointed, lines, nonexistent, ridiculous, clever, acting, even, was}",False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False


In [4]:
# create a dataframe with only the boolean columns, and a variable with the target column:
data = df.iloc[:,2:]
target = df.target

# Instantiate the model and store it in a new variable.
bnb = BernoulliNB()

# Fit the model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display the results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 0


# Testing the model on one of the other datasets

In [5]:
# Read the Yelp sentiment labelled data set:
df2 = pd.read_table('sentiment_labelled/yelp_labelled.txt', encoding='UTF-8', header=None, quoting=3)

df2.columns = ['review','target']
df2['target'] = (df2['target'] == 1)
df2.review = df2.review.str.lower()

df2['review'] = df2['review'].apply(lambda x: f_punct(x))
df2['sets'] = df2['review'].apply(lambda x: set(x.split()))

df2 = df2[['review', 'sets', 'target']]

for word in negative_vocab:
    df2[word] = df2['sets'].apply(lambda x: in_set(x, word))

for word in positive_vocab:
    df2[word] = df2['sets'].apply(lambda x: in_set(x, word))

df2 = df2.fillna(False)

df2.head(n=3)

Unnamed: 0,review,sets,target,2-faced,2-faces,abnormal,abolish,abominable,abominably,abominate,...,wow,wowed,wowing,wows,yay,youthful,zeal,zenith,zest,zippy
0,wow loved this place,"{wow, loved, this, place}",True,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1,crust is not good,"{good, is, not, crust}",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,not tasty and the texture was just nasty,"{not, texture, the, tasty, nasty, and, just, was}",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
data2 = df2.iloc[:,2:]
target2 = df2.target

# Instantiate the model and store it in a new variable.
bnb2 = BernoulliNB()

# Fit the model to the data.
bnb2.fit(data2, target2)

# Classify, storing the result in a new variable.
y_pred2 = bnb2.predict(data2)

# Display the results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data2.shape[0],
    (target2 != y_pred2).sum()
))

Number of mislabeled points out of a total 1000 points : 0


# Confusion matrix

In [7]:
# Confusion matrix of the first data set (IMDB):
confusion_matrix(target, y_pred)

array([[500,   0],
       [  0, 500]])

# Ten-fold cross-validation (entire data set):

In [8]:
result_df = target.to_frame(name='target')
result_df['y_pred'] = y_pred

start = 0
end = int(len(target)/10)
multi = 10

for i in range(10):
    sample = result_df[start:end]    

    t_negatives = len(sample[sample.target == False][sample.y_pred == False])
    f_negatives = len(sample[sample.target == True][sample.y_pred == False])
    t_positives = len(sample[sample.target == True][sample.y_pred == True])
    f_positives = len(sample[sample.target == False][sample.y_pred == True])

    acc = (sample.target == sample.y_pred).sum() / len(sample)
    sens = (t_positives / (t_positives + f_negatives))
    spec = (t_negatives / (t_negatives + f_positives))
    

    print('\nRound', i, '\naccuracy:', acc, '\nsensitivity:', sens,
          '\nspecificity:',  spec, '\nconfusion mat:\n', confusion_matrix(sample.target, sample.y_pred))
    
    start = end
    end = end+(int(len(target)/10))


Round 0 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[39  0]
 [ 0 61]]

Round 1 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[79  0]
 [ 0 21]]

Round 2 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[55  0]
 [ 0 45]]

Round 3 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[40  0]
 [ 0 60]]

Round 4 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[56  0]
 [ 0 44]]

Round 5 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[73  0]
 [ 0 27]]

Round 6 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[32  0]
 [ 0 68]]

Round 7 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[40  0]
 [ 0 60]]

Round 8 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[53  0]
 [ 0 47]]

Round 9 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[33  0]
 [ 0 67]]


# Running/testing the model increasing sample size by 10x:

In [9]:
end = int(len(df.target)/10)

for i in range(10):
        
    data = df.iloc[0:end, 2:]
    target = df.target[0:end]

    bnb = BernoulliNB()
    bnb.fit(data, target)
    y_pred = bnb.predict(data)

    result_df = target.to_frame(name='target')
    result_df['y_pred'] = y_pred

    t_negatives = len(result_df[result_df.target == False][result_df.y_pred == False])
    f_negatives = len(result_df[result_df.target == True][result_df.y_pred == False])
    t_positives = len(result_df[result_df.target == True][result_df.y_pred == True])
    f_positives = len(result_df[result_df.target == False][result_df.y_pred == True])

    acc = (result_df.target == result_df.y_pred).sum() / len(result_df)
#    sens = (t_positives / (t_positives + f_negatives))
#    spec = (t_negatives / (t_negatives + f_positives))
    

    print('\nRound', i, 'Sample size =', len(target), '\naccuracy:', acc, '\nconfusion mat:\n',
          confusion_matrix(result_df.target, result_df.y_pred))
    
    end = end+(int(len(df.target)/10))


Round 0 Sample size = 100 
accuracy: 0.61 
confusion mat:
 [[ 0 39]
 [ 0 61]]

Round 1 Sample size = 200 
accuracy: 0.59 
confusion mat:
 [[118   0]
 [ 82   0]]

Round 2 Sample size = 300 
accuracy: 0.576666666667 
confusion mat:
 [[173   0]
 [127   0]]

Round 3 Sample size = 400 
accuracy: 0.9975 
confusion mat:
 [[213   0]
 [  1 186]]

Round 4 Sample size = 500 
accuracy: 0.996 
confusion mat:
 [[269   0]
 [  2 229]]

Round 5 Sample size = 600 
accuracy: 0.893333333333 
confusion mat:
 [[342   0]
 [ 64 194]]

Round 6 Sample size = 700 
accuracy: 1.0 
confusion mat:
 [[374   0]
 [  0 326]]

Round 7 Sample size = 800 
accuracy: 1.0 
confusion mat:
 [[414   0]
 [  0 386]]

Round 8 Sample size = 900 
accuracy: 1.0 
confusion mat:
 [[467   0]
 [  0 433]]

Round 9 Sample size = 1000 
accuracy: 1.0 
confusion mat:
 [[500   0]
 [  0 500]]


# Running/testing the model with random samples of increasing sizes:

In [10]:
data = df.iloc[0:end, 2:]
target = df.target[0:end]

sample_size = 100

for i in range(10):
    
    s_data = data.sample(sample_size)
    s_target = target.sample(sample_size)
    
    bnb = BernoulliNB()
    bnb.fit(s_data, s_target)
    y_pred = bnb.predict(s_data)

    result_df = s_target.to_frame(name='s_target')
    result_df['y_pred'] = y_pred

    t_negatives = len(result_df[result_df.s_target == False][result_df.y_pred == False])
    f_negatives = len(result_df[result_df.s_target == True][result_df.y_pred == False])
    t_positives = len(result_df[result_df.s_target == True][result_df.y_pred == True])
    f_positives = len(result_df[result_df.s_target == False][result_df.y_pred == True])

    acc = (result_df.s_target == result_df.y_pred).sum() / len(result_df)

    print('\nRound', i, 'Sample size =', len(s_target), '\naccuracy:', acc, '\nconfusion mat:\n',
          confusion_matrix(result_df.s_target, result_df.y_pred))
    
    sample_size += 100


Round 0 Sample size = 100 
accuracy: 0.59 
confusion mat:
 [[ 0 41]
 [ 0 59]]

Round 1 Sample size = 200 
accuracy: 0.54 
confusion mat:
 [[108   0]
 [ 92   0]]

Round 2 Sample size = 300 
accuracy: 0.703333333333 
confusion mat:
 [[ 62  87]
 [  2 149]]

Round 3 Sample size = 400 
accuracy: 0.81 
confusion mat:
 [[138  61]
 [ 15 186]]

Round 4 Sample size = 500 
accuracy: 0.76 
confusion mat:
 [[231  21]
 [ 99 149]]

Round 5 Sample size = 600 
accuracy: 0.753333333333 
confusion mat:
 [[179 118]
 [ 30 273]]

Round 6 Sample size = 700 
accuracy: 0.778571428571 
confusion mat:
 [[286  64]
 [ 91 259]]

Round 7 Sample size = 800 
accuracy: 0.73625 
confusion mat:
 [[352  50]
 [161 237]]

Round 8 Sample size = 900 
accuracy: 0.741111111111 
confusion mat:
 [[412  42]
 [191 255]]

Round 9 Sample size = 1000 
accuracy: 0.741 
confusion mat:
 [[370 130]
 [129 371]]
