## Sentiment Analysis Challenge

Imports.

In [135]:
import csv
import math

Set input variables.

In [136]:
# Path of the tab-separated training file.
data_file = 'data/train.tsv'
# Share of the loaded rows held out for validation, expressed as a fraction
# (0.05 means 5%), not as a percentage.
validation_percent = 0.05

Import the data from the provided file, keeping only valid rows that contain both a label and the text.

In [137]:
# Parse the tab-separated file named by `data_file`, keeping only rows that
# have exactly two fields (label, text); every other row is skipped.
labels = []
inputs = []
# newline='' is the mode the csv module documents for files handed to a reader.
with open(data_file, encoding='utf-8', newline='') as data:
    reader = csv.reader(data, delimiter='\t')
    for row in reader:
        if len(row) == 2:
            labels.append(row[0])
            inputs.append(row[1])

In [138]:
# Split by position: the leading rows train the model and the trailing
# `validation_percent` share is held out. Rows keep their file order; nothing
# is shuffled here.
num_train_samples = int((1 - validation_percent) * len(labels))
train_labels, val_labels = labels[:num_train_samples], labels[num_train_samples:]
train_inputs, val_inputs = inputs[:num_train_samples], inputs[num_train_samples:]

In [139]:
def count_words(documents_list, document_labels=None):
    """Count whitespace-separated words per sentiment class.

    Args:
        documents_list: Iterable of document strings.
        document_labels: Labels aligned index-for-index with
            ``documents_list``. Each label must be accepted by ``int()``;
            a non-zero value marks the document as positive. When omitted,
            the notebook-level ``labels`` list is used, which is only correct
            if ``documents_list`` starts at the first loaded row.

    Returns:
        ``{'positive': {word: count}, 'negative': {word: count}}``.

    Raises:
        IndexError: If there are fewer labels than documents.
    """
    if document_labels is None:
        # Backward-compatible fallback for existing one-argument calls.
        document_labels = labels
    counts = {'positive': {}, 'negative': {}}
    for idx, document in enumerate(documents_list):
        sentiment = 'positive' if int(document_labels[idx]) else 'negative'
        bucket = counts[sentiment]
        for word in document.split():
            bucket[word] = bucket.get(word, 0) + 1
    return counts

In [140]:
def fit(_inputs):
    """Turn per-class word counts into per-class log scores.

    Calls ``count_words(_inputs)`` and replaces every count, in place, with
    ``log(count / number of distinct words seen for that class)``.

    Note that the divisor is the size of the class vocabulary, not the total
    number of tokens, so the stored values are not normalised
    log-probabilities and can be greater than zero.

    Args:
        _inputs: Documents forwarded unchanged to ``count_words``.

    Returns:
        ``{'positive': {word: score}, 'negative': {word: score}}``.
    """
    table = count_words(_inputs)
    # Both vocabulary sizes are read before any value is overwritten.
    vocab_sizes = {cls: len(table[cls]) for cls in ('positive', 'negative')}
    for cls, size in vocab_sizes.items():
        per_word = table[cls]
        # Only existing keys are reassigned, so iterating items() here is safe.
        for token, occurrences in per_word.items():
            per_word[token] = math.log(occurrences / size)
    return table
# Fit on the training split only; the scoring helpers read this table as a global.
probabilities = fit(train_inputs)

In [141]:
def get_positive_probability(_word):
    """Return the positive-class log score for ``_word``.

    Looks the word up in the global ``probabilities['positive']`` table. A word
    that is not in the table scores ``log(1 / number of entries in the table)``.
    """
    table = probabilities['positive']
    if _word not in table:
        return math.log(1 / len(table))
    return table[_word]
def get_negative_probability(_word):
    """Return the negative-class log score for ``_word``.

    Looks the word up in the global ``probabilities['negative']`` table. A word
    that is not in the table scores ``log(1 / number of entries in the table)``.
    """
    table = probabilities['negative']
    if _word not in table:
        return math.log(1 / len(table))
    return table[_word]

In [142]:
def predict(_document):
    """Classify a document as positive (1) or negative (0).

    Each whitespace-separated token contributes its positive and negative log
    score to a running total per class. The document is labelled 1 only when
    the positive total is strictly larger, so ties (including a document with
    no tokens) are labelled 0.
    """
    pos_total = 0
    neg_total = 0
    for token in _document.split():
        pos_total += get_positive_probability(token)
        neg_total += get_negative_probability(token)
    if pos_total > neg_total:
        return 1
    return 0

In [143]:
# Smoke-test the classifier on one hand-written sentence before scoring.
print(predict('Dwi ddim yn hapus iawn heddiw'))
# Count validation documents whose predicted class equals the stored label.
correct = sum(
    1
    for doc, label in zip(val_inputs, val_labels)
    if predict(doc) == int(label)
)
print("Validation Accuracy:", correct / len(val_inputs))

1
Validation Accuracy: 0.7387433223098449
