In [1]:
import pandas as pd
import codecs
from string import punctuation

from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_colwidth', -1)
import warnings
warnings.filterwarnings('ignore')



# Evaluating Sentiment analysis

**U2 L3 P4 - Emile Badran**

Perform a sentiment analysis, classifying whether feedback left on a website is either positive or negative.

The [dataset of sentiment labelled sentences](https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences) that is used here was created for the paper [From Group to Individual Labels using Deep Features, Kotzias et. al., KDD 2015](http://mdenil.com/media/papers/2015-deep-multi-instance-learning.pdf).

To increase sentiment analysis precision, a lexicon with 6687 positive and negative words from the University of Illinois at Chicago's College of Engineering was used. The lexicon files [are available online](http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html) and were created for the papers:

>   Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." 
>       Proceedings of the ACM SIGKDD International Conference on Knowledge 
>       Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, 
>       Washington, USA, 

>   Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing 
>       and Comparing Opinions on the Web." Proceedings of the 14th 
>       International World Wide Web conference (WWW-2005), May 10-14, 
>       2005, Chiba, Japan.

## Loading and preparing the data set:

In [2]:
# load positive lexicon with utf-8 encoding:
f = codecs.open('sentiment_labelled/positive-words.txt', encoding='utf-8')
positive_vocab = f.read().splitlines()

# load negative lexicon with utf-8 encoding:
g = codecs.open('sentiment_labelled/negative-words.txt', encoding='utf-8')
negative_vocab = g.read().splitlines()

# Read the imdb sentiment labelled data set:
df = pd.read_table('sentiment_labelled/imdb_labelled.txt', encoding='UTF-8', header=None, quoting=3)
df.columns = ['review','target']

# Score is either 1 (for positive) or 0 (for negative).
# Convert the target column values to boolean objects:
df['target'] = (df['target'] == 1)

# Convert all words in the reviews column to lower case:
df.review = df.review.str.lower()

# declare a function that will strip all punctuation marks from reviews:
def f_punct(s):
    return ''.join(c for c in s if c not in punctuation)

# strip all puctuation marks:
df['review'] = df['review'].apply(lambda x: f_punct(x))

# declare a function that will create a column with the set of words from each review:
def f_set(string):
    set(string.split())

# generate sets from reviews:
df['sets'] = df['review'].apply(lambda x: set(x.split()))

# reorder dataframe columns:
df = df[['review', 'sets', 'target']]

df.head(n=3)

Unnamed: 0,review,sets,target
0,a very very very slowmoving aimless movie about a distressed drifting young man,"{drifting, young, slowmoving, about, aimless, distressed, man, very, a, movie}",False
1,not sure who was more lost the flat characters or the audience nearly half of whom walked out,"{lost, the, out, was, flat, nearly, sure, more, walked, characters, who, not, half, of, or, audience, whom}",False
2,attempting artiness with black white and clever camera angles the movie disappointed became even more ridiculous as the acting was poor and the plot and lines almost nonexistent,"{camera, almost, poor, angles, nonexistent, was, disappointed, ridiculous, acting, and, as, lines, movie, the, even, with, artiness, attempting, plot, clever, more, became, white, black}",False


## Testing the column generation method:

In [3]:
# declare a function that returns true when a word is in a set:
def in_set(x, word):
    for i in x:
        if i == word:
            return True

# create a list of words contained in the first three reviews to easily test the model:
words = ['very', 'drifting', 'lost','whom','camera','ridiculous']

df_test = df[['sets']]

# iterate the "words" list to test the model: create a column for every word
# in the "words" list, then apply the in_set function for every cell in 
# the "sets" column. 
for word in words:
    df_test[word] = df_test['sets'].apply(lambda x: in_set(x, word))

df_test.head(n=3)

Unnamed: 0,sets,very,drifting,lost,whom,camera,ridiculous
0,"{drifting, young, slowmoving, about, aimless, distressed, man, very, a, movie}",True,True,,,,
1,"{lost, the, out, was, flat, nearly, sure, more, walked, characters, who, not, half, of, or, audience, whom}",,,True,True,,
2,"{camera, almost, poor, angles, nonexistent, was, disappointed, ridiculous, acting, and, as, lines, movie, the, even, with, artiness, attempting, plot, clever, more, became, white, black}",,,,,True,True


## Finding the most common positive and negative words:

In [4]:
df_positive = df[['sets']]

# iterate and create columns for every word in the lexicon;
# apply the in_set function to every cell and return true when a word is in the set.
for word in positive_vocab:
    df_positive[word] = df_positive['sets'].apply(lambda x: in_set(x, word))

positive_count = df_positive.iloc[:,1:].sum()
positive_count = positive_count.sort_values(ascending=False)
top_positive = list(positive_count[:4].index)

In [5]:
df_negative = df[['sets']]

# iterate and create columns for every word in the lexicon;
# apply the in_set function to every cell and return true when a word is in the set.
for word in negative_vocab:
    df_negative[word] = df_negative['sets'].apply(lambda x: in_set(x, word))
    
negative_count = df_negative.iloc[:,1:].sum()
negative_count = negative_count.sort_values(ascending=False)
top_negative = list(negative_count[:6].index)

In [6]:
# remove positive or neutral words from the negative list:
top_negative.remove('funny')
top_negative.remove('plot')

In [7]:
# creating the full dataset for training:
for word in top_positive:
    df[word] = df['sets'].apply(lambda x: in_set(x, word))

for word in top_negative:
    df[word] = df['sets'].apply(lambda x: in_set(x, word))

# fill none (empty) cells with "False"
df = df.fillna(False)

## The resulting DataFrame has only eight features:

The features consist of the top 4 positive words and the top 4 negative words.

In [8]:
df.head(n=2)

Unnamed: 0,review,sets,target,good,like,great,well,bad,stupid,waste,awful
0,a very very very slowmoving aimless movie about a distressed drifting young man,"{drifting, young, slowmoving, about, aimless, distressed, man, very, a, movie}",False,False,False,False,False,False,False,False,False
1,not sure who was more lost the flat characters or the audience nearly half of whom walked out,"{lost, the, out, was, flat, nearly, sure, more, walked, characters, who, not, half, of, or, audience, whom}",False,False,False,False,False,False,False,False,False


In [9]:
# create a dataframe with only the boolean columns, and a variable with the target column:
data = df.iloc[:,2:]
target = df.target

# Instantiate the model and store it in a new variable.
bnb = BernoulliNB()

# Fit the model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display the results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 0


# Testing the model on the Yelp dataset

In [10]:
# Read the Yelp sentiment labelled data set:
df2 = pd.read_table('sentiment_labelled/yelp_labelled.txt', encoding='UTF-8', header=None, quoting=3)

df2.columns = ['review','target']
df2['target'] = (df2['target'] == 1)
df2.review = df2.review.str.lower()

df2['review'] = df2['review'].apply(lambda x: f_punct(x))
df2['sets'] = df2['review'].apply(lambda x: set(x.split()))

df2 = df2[['review', 'sets', 'target']]

for word in top_positive:
    df2[word] = df2['sets'].apply(lambda x: in_set(x, word))

for word in top_negative:
    df2[word] = df2['sets'].apply(lambda x: in_set(x, word))

df2 = df2.fillna(False)

df2.head(n=3)

Unnamed: 0,review,sets,target,good,like,great,well,bad,stupid,waste,awful
0,wow loved this place,"{place, this, loved, wow}",True,False,False,False,False,False,False,False,False
1,crust is not good,"{good, crust, not, is}",False,True,False,False,False,False,False,False,False
2,not tasty and the texture was just nasty,"{the, was, texture, nasty, tasty, and, just, not}",False,False,False,False,False,False,False,False,False


In [11]:
data2 = df2.iloc[:,2:]
target2 = df2.target

# Instantiate the model and store it in a new variable.
bnb2 = BernoulliNB()

# Fit the model to the data.
bnb2.fit(data2, target2)

# Classify, storing the result in a new variable.
y_pred2 = bnb2.predict(data2)

# Display the results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data2.shape[0],
    (target2 != y_pred2).sum()
))

Number of mislabeled points out of a total 1000 points : 0


# Confusion matrix, holdout and cross-validation:

In [12]:
# Confusion matrix of the first data set (IMDB):
confusion_matrix(target, y_pred)

array([[500,   0],
       [  0, 500]])

In [13]:
# Test the model with different holdout groups with sklearn:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)

print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 20% Holdout: 1.0
Testing on Sample: 1.0


In [14]:
# Cross validating with the sklearn:
cross_val_score(bnb, data, target, cv=10)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

# Customized cross-validation methods:
### Ten-fold cross-validation (entire data set):

In [15]:
result_df = target.to_frame(name='target')
result_df['y_pred'] = y_pred

start = 0
end = int(len(target)/10)
multi = 10

for i in range(10):
    sample = result_df[start:end]    

    t_negatives = len(sample[sample.target == False][sample.y_pred == False])
    f_negatives = len(sample[sample.target == True][sample.y_pred == False])
    t_positives = len(sample[sample.target == True][sample.y_pred == True])
    f_positives = len(sample[sample.target == False][sample.y_pred == True])

    acc = (sample.target == sample.y_pred).sum() / len(sample)
    sens = (t_positives / (t_positives + f_negatives))
    spec = (t_negatives / (t_negatives + f_positives))
    

    print('\nRound', i, '\naccuracy:', acc, '\nsensitivity:', sens,
          '\nspecificity:',  spec, '\nconfusion mat:\n', confusion_matrix(sample.target, sample.y_pred))
    
    start = end
    end = end+(int(len(target)/10))


Round 0 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[39  0]
 [ 0 61]]

Round 1 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[79  0]
 [ 0 21]]

Round 2 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[55  0]
 [ 0 45]]

Round 3 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[40  0]
 [ 0 60]]

Round 4 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[56  0]
 [ 0 44]]

Round 5 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[73  0]
 [ 0 27]]

Round 6 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[32  0]
 [ 0 68]]

Round 7 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[40  0]
 [ 0 60]]

Round 8 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[53  0]
 [ 0 47]]

Round 9 
accuracy: 1.0 
sensitivity: 1.0 
specificity: 1.0 
confusion mat:
 [[33  0]
 [ 0 67]]


### Running/testing the model increasing sample size by 10x:

In [16]:
end = int(len(df.target)/10)

for i in range(10):
        
    data = df.iloc[0:end, 2:]
    target = df.target[0:end]

    bnb = BernoulliNB()
    bnb.fit(data, target)
    y_pred = bnb.predict(data)

    result_df = target.to_frame(name='target')
    result_df['y_pred'] = y_pred

    t_negatives = len(result_df[result_df.target == False][result_df.y_pred == False])
    f_negatives = len(result_df[result_df.target == True][result_df.y_pred == False])
    t_positives = len(result_df[result_df.target == True][result_df.y_pred == True])
    f_positives = len(result_df[result_df.target == False][result_df.y_pred == True])

    acc = (result_df.target == result_df.y_pred).sum() / len(result_df)
#    sens = (t_positives / (t_positives + f_negatives))
#    spec = (t_negatives / (t_negatives + f_positives))
    

    print('\nRound', i, 'Sample size =', len(target), '\naccuracy:', acc, '\nconfusion mat:\n',
          confusion_matrix(result_df.target, result_df.y_pred))
    
    end = end+(int(len(df.target)/10))


Round 0 Sample size = 100 
accuracy: 1.0 
confusion mat:
 [[39  0]
 [ 0 61]]

Round 1 Sample size = 200 
accuracy: 1.0 
confusion mat:
 [[118   0]
 [  0  82]]

Round 2 Sample size = 300 
accuracy: 1.0 
confusion mat:
 [[173   0]
 [  0 127]]

Round 3 Sample size = 400 
accuracy: 1.0 
confusion mat:
 [[213   0]
 [  0 187]]

Round 4 Sample size = 500 
accuracy: 1.0 
confusion mat:
 [[269   0]
 [  0 231]]

Round 5 Sample size = 600 
accuracy: 1.0 
confusion mat:
 [[342   0]
 [  0 258]]

Round 6 Sample size = 700 
accuracy: 1.0 
confusion mat:
 [[374   0]
 [  0 326]]

Round 7 Sample size = 800 
accuracy: 1.0 
confusion mat:
 [[414   0]
 [  0 386]]

Round 8 Sample size = 900 
accuracy: 1.0 
confusion mat:
 [[467   0]
 [  0 433]]

Round 9 Sample size = 1000 
accuracy: 1.0 
confusion mat:
 [[500   0]
 [  0 500]]


### Running/testing the model with random samples of increasing sizes:

In [17]:
data = df.iloc[0:end, 2:]
target = df.target[0:end]

sample_size = 100

for i in range(10):
    
    s_data = data.sample(sample_size)
    s_target = s_data.target
    
    bnb = BernoulliNB()
    bnb.fit(s_data, s_target)
    y_pred = bnb.predict(s_data)

    result_df = s_target.to_frame(name='s_target')
    result_df['y_pred'] = y_pred

    t_negatives = len(result_df[result_df.s_target == False][result_df.y_pred == False])
    f_negatives = len(result_df[result_df.s_target == True][result_df.y_pred == False])
    t_positives = len(result_df[result_df.s_target == True][result_df.y_pred == True])
    f_positives = len(result_df[result_df.s_target == False][result_df.y_pred == True])

    acc = (result_df.s_target == result_df.y_pred).sum() / len(result_df)

    print('\nRound', i, 'Sample size =', len(s_target), '\naccuracy:', acc, '\nconfusion mat:\n',
          confusion_matrix(result_df.s_target, result_df.y_pred))
    
    sample_size += 100


Round 0 Sample size = 100 
accuracy: 1.0 
confusion mat:
 [[52  0]
 [ 0 48]]

Round 1 Sample size = 200 
accuracy: 1.0 
confusion mat:
 [[ 83   0]
 [  0 117]]

Round 2 Sample size = 300 
accuracy: 1.0 
confusion mat:
 [[148   0]
 [  0 152]]

Round 3 Sample size = 400 
accuracy: 1.0 
confusion mat:
 [[198   0]
 [  0 202]]

Round 4 Sample size = 500 
accuracy: 1.0 
confusion mat:
 [[242   0]
 [  0 258]]

Round 5 Sample size = 600 
accuracy: 1.0 
confusion mat:
 [[295   0]
 [  0 305]]

Round 6 Sample size = 700 
accuracy: 1.0 
confusion mat:
 [[348   0]
 [  0 352]]

Round 7 Sample size = 800 
accuracy: 1.0 
confusion mat:
 [[416   0]
 [  0 384]]

Round 8 Sample size = 900 
accuracy: 1.0 
confusion mat:
 [[446   0]
 [  0 454]]

Round 9 Sample size = 1000 
accuracy: 1.0 
confusion mat:
 [[500   0]
 [  0 500]]
