## Sentiment Analysis Challenge
#### Naive Bayes Implementation

Imports.

In [81]:
import csv
import math

Set input variables.

In [82]:
# Path to the tab-separated training data; each usable row is `label<TAB>text`.
train_file = 'data/train.tsv'
# Fraction (not a percentage) of the loaded rows held out for validation.
# Only consulted when `test_file` is empty.
validation_percent = 0.05
# Optional separate evaluation file; set to '' to evaluate on the held-out
# validation split instead.
# NOTE(review): this currently points at the same file as `train_file`, so the
# reported accuracy is measured on data the model was fit on — confirm this is intended.
test_file = 'data/train.tsv'
# When True, predict() also scores adjacent word pairs in addition to single words.
using_word_pairs = False

Import the data from the provided file, keeping only valid rows that contain both a label and text.

In [83]:
# Read the training TSV into two parallel lists: labels[i] is the sentiment
# label for the document text inputs[i].
labels = []
inputs = []
# newline='' is what the csv module expects from file objects it reads, so that
# line endings embedded in quoted fields are handled by the reader itself.
with open(train_file, encoding='utf-8', newline='') as data:
    reader = csv.reader(data, delimiter='\t')
    for row in reader:
        # Skip malformed rows: only rows with exactly a label and a text column are kept.
        if len(row) == 2:
            labels.append(row[0])
            inputs.append(row[1])

In [84]:
# Build the train/test partitions. Both branches must define all four of
# train_labels, train_inputs, test_labels and test_inputs, because later cells
# fit on train_inputs and evaluate on test_inputs/test_labels.
if test_file:
    # A separate evaluation file is supplied: train on every loaded row.
    # (Previously the train_* names were left undefined in this branch, so the
    # fitting cell failed with NameError on a fresh kernel.)
    train_labels = list(labels)
    train_inputs = list(inputs)
    test_labels = []
    test_inputs = []
    with open(test_file, encoding='utf-8', newline='') as data:
        reader = csv.reader(data, delimiter='\t')
        for row in reader:
            # Same validity rule as the training loader: label + text only.
            if len(row) == 2:
                test_labels.append(row[0])
                test_inputs.append(row[1])
else:
    # No evaluation file: hold out the trailing `validation_percent` fraction
    # of the loaded rows as the validation set.
    num_train_samples = int(len(labels) * (1 - validation_percent))
    train_labels = labels[:num_train_samples]
    train_inputs = inputs[:num_train_samples]
    test_labels = labels[num_train_samples:]
    test_inputs = inputs[num_train_samples:]

I tested punctuation removal and lowercase normalization, but both performed significantly worse (by at least 3%) on the validation data, so neither is applied here.


In [85]:
def count_words(documents_list, labels_list=None):
    """Count whitespace-separated word occurrences per sentiment class.

    Args:
        documents_list: iterable of document strings.
        labels_list: optional sequence of labels, index-aligned with
            `documents_list`. Each label is parsed with int(); any non-zero
            value is treated as positive, zero as negative. When omitted, the
            notebook-level `labels` list is used, which is only correct if
            `documents_list` is index-aligned with it (e.g. a prefix of `inputs`).

    Returns:
        dict with keys 'positive' and 'negative', each mapping word -> count.
    """
    if labels_list is None:
        # Backward-compatible default: fall back to the global training labels.
        labels_list = labels
    _counts = {'positive': {}, 'negative': {}}
    for _idx, document in enumerate(documents_list):
        _sentiment = 'positive' if int(labels_list[_idx]) else 'negative'
        _bucket = _counts[_sentiment]
        for word in document.split():
            _bucket[word] = _bucket.get(word, 0) + 1
    return _counts

In [86]:
def count_word_pairs(documents_list, labels_list=None):
    """Count adjacent word-pair occurrences per sentiment class.

    A pair is two consecutive whitespace-separated words joined by a single
    space; documents with fewer than two words contribute no pairs.

    Args:
        documents_list: iterable of document strings.
        labels_list: optional sequence of labels, index-aligned with
            `documents_list`. Each label is parsed with int(); any non-zero
            value is treated as positive, zero as negative. When omitted, the
            notebook-level `labels` list is used, which is only correct if
            `documents_list` is index-aligned with it (e.g. a prefix of `inputs`).

    Returns:
        dict with keys 'positive' and 'negative', each mapping
        'word1 word2' -> count.
    """
    if labels_list is None:
        # Backward-compatible default: fall back to the global training labels.
        labels_list = labels
    _counts = {'positive': {}, 'negative': {}}
    for _idx, document in enumerate(documents_list):
        _sentiment = 'positive' if int(labels_list[_idx]) else 'negative'
        _bucket = _counts[_sentiment]
        _word_list = document.split()
        # zip against the list shifted by one yields every adjacent pair.
        for first, second in zip(_word_list, _word_list[1:]):
            word_pair = first + ' ' + second
            _bucket[word_pair] = _bucket.get(word_pair, 0) + 1
    return _counts

In [87]:
def fit_words(_inputs):
    """Turn per-class word counts into log scores.

    For each sentiment class, every word's count (from count_words) is divided
    by the number of *distinct* words seen in that class, and the natural log
    of that ratio is stored.

    NOTE(review): the denominator is the class vocabulary size, not the total
    token count, so the values are not normalized probabilities — confirm this
    scoring is intended.

    Returns:
        dict with keys 'positive' and 'negative', each mapping word -> log score.
    """
    word_counts = count_words(_inputs)
    scores = {}
    for sentiment in ('positive', 'negative'):
        class_counts = word_counts[sentiment]
        distinct_words = len(class_counts)
        scores[sentiment] = {
            word: math.log(count / distinct_words)
            for word, count in class_counts.items()
        }
    return scores

def fit_pairs(_inputs):
    """Turn per-class word-pair counts into log scores.

    For each sentiment class, every pair's count (from count_word_pairs) is
    divided by the number of *distinct* pairs seen in that class, and the
    natural log of that ratio is stored.

    NOTE(review): the denominator is the number of distinct pairs, not the
    total pair count, so the values are not normalized probabilities —
    confirm this scoring is intended.

    Returns:
        dict with keys 'positive' and 'negative', each mapping
        'word1 word2' -> log score.
    """
    pair_counts = count_word_pairs(_inputs)
    scores = {}
    for sentiment in ('positive', 'negative'):
        class_counts = pair_counts[sentiment]
        distinct_pairs = len(class_counts)
        scores[sentiment] = {
            pair: math.log(count / distinct_pairs)
            for pair, count in class_counts.items()
        }
    return scores

def fit(_inputs):
    """Fit both score tables on the given documents.

    Returns:
        A 2-tuple (word_scores, pair_scores) as produced by fit_words and
        fit_pairs respectively.
    """
    word_scores = fit_words(_inputs)
    pair_scores = fit_pairs(_inputs)
    return word_scores, pair_scores

# Fit once on the training documents. The resulting module-level tables are
# read by the get_*_probability lookup functions defined below.
word_probabilities, pair_probabilities = fit(train_inputs)

In [88]:
def get_positive_word_probability(_word):
    """Return the positive-class log score for `_word`.

    Words absent from the positive table fall back to log(1 / V), where V is
    the number of distinct words in the positive table.
    """
    positive_scores = word_probabilities['positive']
    if _word not in positive_scores:
        return math.log(1 / len(positive_scores))
    return positive_scores[_word]
def get_negative_word_probability(_word):
    """Return the negative-class log score for `_word`.

    Words absent from the negative table fall back to log(1 / V), where V is
    the number of distinct words in the negative table.
    """
    negative_scores = word_probabilities['negative']
    if _word not in negative_scores:
        return math.log(1 / len(negative_scores))
    return negative_scores[_word]
def get_positive_pair_probability(_pair):
    """Return the positive-class log score for the word pair `_pair`.

    `_pair` is two words joined by a single space. Pairs absent from the
    positive pair table fall back to log(1 / P), where P is the number of
    distinct pairs in the positive pair table.
    """
    positive_pairs = pair_probabilities['positive']
    if _pair in positive_pairs:
        return positive_pairs[_pair]
    # The fallback must be sized by the *pair* table (it previously used the
    # single-word table), matching how the negative-pair lookup behaves and
    # how the pair scores themselves are scaled.
    return math.log(1 / len(positive_pairs))
def get_negative_pair_probability(_pair):
    """Return the negative-class log score for the word pair `_pair`.

    `_pair` is two words joined by a single space. Pairs absent from the
    negative pair table fall back to log(1 / P), where P is the number of
    distinct pairs in the negative pair table.
    """
    negative_pairs = pair_probabilities['negative']
    if _pair not in negative_pairs:
        return math.log(1 / len(negative_pairs))
    return negative_pairs[_pair]

In [89]:
def predict(_document):
    """Classify a document as positive (1) or negative (0).

    Sums the per-class log scores of every whitespace-separated word and, when
    the module-level `using_word_pairs` flag is true, of every adjacent word
    pair as well. Returns 1 only when the positive total is strictly greater
    than the negative total; ties resolve to 0.
    """
    tokens = _document.split()
    last_position = len(tokens) - 1
    pos_total = 0
    neg_total = 0
    for position, token in enumerate(tokens):
        pos_total += get_positive_word_probability(token)
        neg_total += get_negative_word_probability(token)
        # The final token has no right-hand neighbour to pair with.
        if using_word_pairs and position < last_position:
            pos_total += get_positive_pair_probability(token + ' ' + tokens[position + 1])
            neg_total += get_negative_pair_probability(token + ' ' + tokens[position + 1])
    if pos_total > neg_total:
        return 1
    return 0

In [90]:
# Score the classifier: count the evaluation documents whose predicted class
# matches the integer-parsed label, then report the fraction that matched.
correct = sum(
    1
    for doc, label in zip(test_inputs, test_labels)
    if predict(doc) == int(label)
)
print("Validation Accuracy:", correct / len(test_inputs))

Validation Accuracy: 0.8046636474659068
