# Sentiment Analysis Example

## Imports

In [1]:
import re
import string

## Init

In [2]:
stopwords = ["this", "that", "than", "a", "the", "i", "im", "ive", "is", "was", "for"]
all_comments = [["This movie is the best.", #Positive
             "The best movie I've ever seen!",
             "betteR than the rest.",
             "Better tHan the first version.",
             "I'm just here for the comments. :)"],
            ["I HATE THIS MOVIE. aaaarrrg!", #Negative
             "What's this garbage!",
             "This was worse than I thought",
             "Worse movie ever produced.",
             "A terrible movie!",]]

## Pre-Process

In [3]:
clean_comments = []
features = []

for comments in all_comments:
    temp_= []
    for comment in comments:
        cleaned_tokens = []
        tokens = comment.split()
        for item in tokens:
            item = item.lower()
            item = re.sub('[^a-zA-Z]', '', item)
            if item not in stopwords and len(item) > 1:
                cleaned_tokens.append(item)
                features.append(item)
        temp_.append(cleaned_tokens)
    clean_comments.append(temp_)
print(clean_comments, "\n\nNumber of Features:", len(features))

[[['movie', 'best'], ['best', 'movie', 'ever', 'seen'], ['better', 'rest'], ['better', 'first', 'version'], ['just', 'here', 'comments']], [['hate', 'movie', 'aaaarrrg'], ['whats', 'garbage'], ['worse', 'thought'], ['worse', 'movie', 'ever', 'produced'], ['terrible', 'movie']]] 

Number of Features: 27


## Est Probability of Each Feature/Token (in each class)

In [4]:
p_comments = []
n_comments = []

for docs in clean_comments[0]:
    p_comments.extend(docs)
print(p_comments)
for docs in clean_comments[1]:
    n_comments.extend(docs)
print(n_comments, "\n")

p_features = []
n_features = []

#Positive class features
for f in features:
    count = p_comments.count(f)
    if count != 0:
        prob = count/(len(p_comments))
    else:
        prob = (count+1)/(len(p_comments) + len(features)*1)
    p_features.append(prob)
    #print(f, count, prob)

#Negative class features
for f in features:
    count = n_comments.count(f)
    if count != 0:
        prob = count/(len(n_comments))
    else:
        prob = (count+1)/(len(n_comments) + len(features)*1)
    n_features.append(prob)
    #print(f, count, prob)
    
print(p_features, "\n\n", n_features)

['movie', 'best', 'best', 'movie', 'ever', 'seen', 'better', 'rest', 'better', 'first', 'version', 'just', 'here', 'comments']
['hate', 'movie', 'aaaarrrg', 'whats', 'garbage', 'worse', 'thought', 'worse', 'movie', 'ever', 'produced', 'terrible', 'movie'] 

[0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.07142857142857142, 0.07142857142857142, 0.14285714285714285, 0.07142857142857142, 0.14285714285714285, 0.07142857142857142, 0.07142857142857142, 0.07142857142857142, 0.07142857142857142, 0.07142857142857142, 0.024390243902439025, 0.14285714285714285, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.14285714285714285, 0.07142857142857142, 0.024390243902439025, 0.024390243902439025, 0.14285714285714285] 

 [0.23076923076923078, 0.025, 0.025, 0.23076923076923078, 0.07692307692307693, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.07692307692307693, 

## Converting to Vectors

In [5]:
p_vector = []
n_vector = []

#Positives
for doc in clean_comments[0]:
    temp_ = []
    for token in features:
        if token in doc:
            temp_.append(1)
        else:
            temp_.append(0)
    p_vector.append(temp_)

#Negatives
for doc in clean_comments[1]:
    temp_ = []
    for token in features:
        if token in doc:
            temp_.append(1)
        else:
            temp_.append(0)
    n_vector.append(temp_)
print("Positive vectors:", p_vector, "\n\nNegative vectors:" ,n_vector)


Positive vectors: [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] 

Negative vectors: [[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1], [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1]]


## Est Probability of Each Class

In [6]:
n_total_docs = len(p_vector) + len(n_vector)
print("Total documents:", n_total_docs)

Total documents: 10


In [7]:
p_probability = len(p_vector)/n_total_docs
n_probability = len(n_vector)/n_total_docs
print("Probability of each class:", p_probability, n_probability)

Probability of each class: 0.5 0.5


## Predict Training Data Class

In [8]:
training_vector = n_vector[3]
print(all_comments[1][3])

p_prob = 1
n_prob = 1

for i, item in enumerate(training_vector):
    if item == 1:
        p_prob = p_prob * p_features[i]
        n_prob = n_prob * n_features[i]
p_prob = p_prob * p_probability
n_prob = n_prob * n_probability

if p_prob > n_prob:
    print("Positive Comment")
else:
    print("Negative Comment")

Worse movie ever produced.
Negative Comment


## Predict New Data Class

In [9]:
test_comment = "Is not this movie the best of all?"
print(test_comment)

tokenized_comment = []
tokens = test_comment.split()
for item in tokens:
    item = item.lower()
    item = re.sub('[^a-zA-Z]', '', item)
    if item not in stopwords and len(item) > 1:
        tokenized_comment.append(item)

test_vector = []
for token in features:
    if token in tokenized_comment:
        test_vector.append(1)
    else:
        test_vector.append(0)

test_p_class = 1
test_n_class = 1

for i, item in enumerate(test_vector):
    if item == 1:
        test_p_class = test_p_class * p_features[i]
        test_n_class = test_n_class * n_features[i]
        
test_p_class = test_p_class * p_probability
test_n_class = test_n_class * n_probability


if test_p_class > test_n_class:
    print("Positive Comment")
else:
    print("Negative Comment")

Is not this movie the best of all?
Positive Comment
