# Naive Bayes

## Load imports.

In [1]:
# Make common scripts visible
import sys
sys.path.append('../common/')

import numpy as np

from tweet_parser import load_data, create_topic_hashtags_dict, cleanup_tweets
from sentence_utils import remove_stop_words_and_lemmatize
from function_executor import apply_fn_to_list_items_in_dict
from conversion import convert_dictionary_to_array
from classification import run_bernoulli_naive_bayes

## Useful lookup tables.

In [2]:
# Lookup from topic code to its integer label.
# Labels are handed out in listing order, starting at 0.
topic_code_to_int = {
    topic_code: label
    for label, topic_code in enumerate((
        'brexit',
        'gaza',
        'fake_news',
        'hurricane_harvey',
        'winter_olympics',
        'climate_change',
    ))
}

## Load and clean the tweets, then split them into train and test sets.

In [3]:
# Read the raw tweets from 'data/' and the topic hashtag lists from 'hashtag_list/'.
# NOTE(review): 100000 is presumably a cap on tweets loaded per topic — confirm
# against load_data.
tweets_keyed_by_topic = load_data(100000, 'data/')
topic_hashtags_dict = create_topic_hashtags_dict('hashtag_list/')

# First clean the tweets using the topic hashtags, then run
# remove_stop_words_and_lemmatize (lowercasing and lemmatization enabled)
# over the cleaned result.
text_normalisation_options = dict(lowercase=True, lemmatize=True)
tweets_keyed_by_topic_cleaned = apply_fn_to_list_items_in_dict(
    cleanup_tweets(tweets_keyed_by_topic, topic_hashtags_dict),
    remove_stop_words_and_lemmatize,
    **text_normalisation_options,
)

In [4]:
# Pin NumPy's global RNG so the cell gives the same result on every run.
# NOTE(review): nothing in this cell draws random numbers directly; presumably
# convert_dictionary_to_array shuffles the examples — confirm.
np.random.seed(42)

# Fraction of examples used for training; the remainder is the test set.
TRAIN_FRACTION = 0.8

x, y = convert_dictionary_to_array(tweets_keyed_by_topic_cleaned, topic_code_to_int)
total_examples = len(y)
split_point = int(total_examples * TRAIN_FRACTION)

# Leading slice -> train, trailing slice -> test (same cut for inputs and labels).
train_x, test_x = x[:split_point], x[split_point:]
train_y, test_y = y[:split_point], y[split_point:]

## Assess Bernoulli Naive Bayes baseline classification performance.

Run Bernoulli Naive Bayes with unigram features (`ngram_range = (1, 1)`) and report per-topic precision, recall, and F1-score.

In [5]:
# Topic names listed in ascending order of their integer label.
# Sorting by label makes the name/label pairing explicit instead of relying on
# the dict's insertion order happening to match the label values, and hands the
# helper a real list rather than a dict view.
# NOTE(review): assumes run_bernoulli_naive_bayes pairs names with labels by
# position — confirm against the helper.
topic_names = sorted(topic_code_to_int, key=topic_code_to_int.get)

# Run Bernoulli Naive Bayes on the train/test split using unigrams only
# (ngram_range = (1, 1)) and print the report it returns.
report = run_bernoulli_naive_bayes(train_x,
                                   train_y,
                                   test_x,
                                   test_y,
                                   topic_names,
                                   ngram_range = (1, 1))
print(report)

                  precision    recall  f1-score   support

          brexit   0.881703  0.904729  0.893068     20027
            gaza   0.972225  0.864168  0.915017     19848
       fake_news   0.713230  0.839420  0.771197     20127
hurricane_harvey   0.842448  0.885880  0.863619     19979
 winter_olympics   0.853501  0.872286  0.862792     20037
  climate_change   0.880298  0.732759  0.799782     19982

       micro avg   0.849875  0.849875  0.849875    120000
       macro avg   0.857234  0.849874  0.850912    120000
    weighted avg   0.856940  0.849875  0.850765    120000

