# Naive Bayes

## Import dependencies.

In [1]:
# Make common scripts visible
import sys
sys.path.append('../common/')

import numpy as np

from tweet_parser import load_data, create_topic_hashtags_dict, cleanup_tweets
from sentence_utils import remove_stop_words_and_lemmatize
from function_executor import apply_fn_to_list_items_in_dict
from conversion import convert_dictionary_to_array
from classification import run_bernoulli_naive_bayes

## Useful lookup tables.

In [2]:
# Integer class label for each topic code; labels are assigned in listing
# order, starting from 0.
topic_code_to_int = {
    topic: label
    for label, topic in enumerate((
        'brexit',
        'gaza',
        'fake_news',
        'hurricane_harvey',
        'winter_olympics',
        'climate_change',
    ))
}

## Load and clean the tweets, then split them into train and test sets.

In [3]:
# Read the raw tweets from 'data/'.
# NOTE(review): 100000 is presumably a cap on how many tweets are loaded --
# confirm against tweet_parser.load_data.
tweets_keyed_by_topic = load_data(100000, 'data/')

# Build the hashtag lookup from 'hashtag_list/' and hand it to the cleanup step.
topic_hashtags_dict = create_topic_hashtags_dict('hashtag_list/')
tweets_keyed_by_topic_cleaned = cleanup_tweets(
    tweets_keyed_by_topic,
    topic_hashtags_dict,
)

# Apply remove_stop_words_and_lemmatize to the cleaned tweets, with both
# lowercasing and lemmatization switched on. The result replaces the
# previous value of tweets_keyed_by_topic_cleaned.
tweets_keyed_by_topic_cleaned = apply_fn_to_list_items_in_dict(
    tweets_keyed_by_topic_cleaned,
    remove_stop_words_and_lemmatize,
    lowercase=True,
    lemmatize=True,
)

KeyboardInterrupt: 

In [None]:
# Fix NumPy's global RNG seed so any NumPy-based randomness is repeatable.
# NOTE(review): nothing in this cell draws random numbers directly; presumably
# convert_dictionary_to_array shuffles the examples -- confirm, because the
# slice-based split below otherwise keeps whatever order the helper returns.
np.random.seed(42)

# Convert the per-topic dictionary into examples (x) and labels (y), using
# topic_code_to_int for the label encoding.
x, y = convert_dictionary_to_array(tweets_keyed_by_topic_cleaned, topic_code_to_int)

# The first 80% of the examples form the training set; the rest is the test set.
total_examples = len(y)
split_point = int(total_examples * 0.8)
train_x, test_x = x[:split_point], x[split_point:]
train_y, test_y = y[:split_point], y[split_point:]

## Assess Bernoulli Naive Bayes baseline classification performance.

Run Bernoulli Naive Bayes with unigram features (`ngram_range = (1, 1)`) and print the resulting classification report.

In [None]:
# Run the Bernoulli Naive Bayes helper on the train/test split, restricted to
# single-token n-grams, and print whatever report it returns.
# NOTE(review): the topic names are passed in dict insertion order, which
# matches the integer labels 0..5 -- verify the helper relies on that order.
report = run_bernoulli_naive_bayes(
    train_x,
    train_y,
    test_x,
    test_y,
    topic_code_to_int.keys(),
    ngram_range=(1, 1),
)
print(report)