# Naive Bayes

## Imports.

In [7]:
# Make common scripts visible
import sys
sys.path.append('../common/')

import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

from tweet_parser import load_data, create_topic_hashtags_dict, cleanup_tweets
from sentence_utils import remove_stop_words_and_lemmatize
from function_executor import apply_fn_to_list_items_in_dict
from conversion import convert_dictionary_to_array
from classification import run_bernoulli_naive_bayes

## Useful lookup tables.

In [11]:
# Topic codes listed in label order: the position of a code in this tuple is
# the integer class label used for it throughout the notebook.
TOPIC_CODES = (
    'brexit',
    'gaza',
    'fake_news',
    'hurricane_harvey',
    'winter_olympics',
    'climate_change',
)

# Forward lookup: topic code -> integer class label.
topic_code_to_int = {code: label for label, code in enumerate(TOPIC_CODES)}

# Reverse lookup: integer class label -> topic code.
int_to_topic_code = dict(enumerate(TOPIC_CODES))

## Load the train and test data.

In [3]:
# Load the raw tweets and the per-topic hashtag lists from disk.
# NOTE(review): 80000 is presumably the number of tweets loaded per topic --
# confirm against tweet_parser.load_data.
tweets_keyed_by_topic = load_data(80000, 'data/')
topic_hashtags_dict = create_topic_hashtags_dict('hashtag_list/')

# Two-stage preprocessing: first the hashtag-aware cleanup, then
# remove_stop_words_and_lemmatize applied through apply_fn_to_list_items_in_dict
# with lowercasing and lemmatization switched on.
text_normalisation_options = {'lowercase': True, 'lemmatize': True}
tweets_keyed_by_topic_cleaned = apply_fn_to_list_items_in_dict(
    cleanup_tweets(tweets_keyed_by_topic, topic_hashtags_dict),
    remove_stop_words_and_lemmatize,
    **text_normalisation_options
)

In [4]:
# Seed NumPy's global RNG before building the arrays so the run is repeatable.
# NOTE(review): the split below is a plain positional cut, so any shuffling
# must happen inside convert_dictionary_to_array -- confirm.
np.random.seed(42)

x, y = convert_dictionary_to_array(tweets_keyed_by_topic_cleaned, topic_code_to_int)

# The first 80% of the examples become the training set and the remaining
# 20% the test set.
total_examples = len(y)
split_point = int(total_examples * 0.8)
train_x, test_x = x[:split_point], x[split_point:]
train_y, test_y = y[:split_point], y[split_point:]

## Assess Bernoulli Naive Bayes baseline classification performance.

Run Bernoulli Naive Bayes with `ngram_range = (1, 1)` and report per-topic precision, recall and F1-score on the test set, followed by the confusion matrix.

In [8]:
# Get the Bernoulli Naive Bayes predictions for the test set from the project
# helper, which is called with the training and test arrays.
predict_y = run_bernoulli_naive_bayes(train_x,
                                      train_y,
                                      test_x,
                                      test_y,
                                      ngram_range = (1, 1))

# Build the label list and the display names explicitly in integer-label order.
# This keeps each report row and confusion-matrix row tied to the right topic
# regardless of dict iteration order, and keeps the output shape fixed even if
# a topic happens to be absent from test_y / predict_y.
report_labels = sorted(topic_code_to_int.values())
report_names = sorted(topic_code_to_int, key=topic_code_to_int.get)

print(classification_report(test_y,
                            predict_y,
                            labels=report_labels,
                            target_names=report_names,
                            digits=6))
print(confusion_matrix(test_y, predict_y, labels=report_labels))

                  precision    recall  f1-score   support

          brexit   0.883776  0.898173  0.890916     16145
            gaza   0.971706  0.860439  0.912694     16165
       fake_news   0.715100  0.830023  0.768288     15861
hurricane_harvey   0.838611  0.874945  0.856393     16041
 winter_olympics   0.830615  0.876676  0.853024     16031
  climate_change   0.876854  0.735229  0.799820     15757

       micro avg   0.846344  0.846344  0.846344     96000
       macro avg   0.852777  0.845914  0.846856     96000
    weighted avg   0.853154  0.846344  0.847275     96000

[[14501    69   721   263   296   295]
 [  281 13909   985   356   360   274]
 [  651   163 13165   591   755   536]
 [  139    25   910 14035   660   272]
 [  187    48   792   700 14054   250]
 [  649   100  1837   791   795 11585]]


## Find examples where predictions went wrong

In [26]:
# Maximum number of misclassified tweets to print for each topic.
examples_per_topic = 5

for topic_code, topic_label in topic_code_to_int.items():
    # Restrict to the test tweets whose true label is this topic, then keep
    # only those where the prediction disagrees with the true label.
    is_topic = test_y == topic_label
    topic_predictions = predict_y[is_topic]
    is_wrong = topic_predictions != topic_label
    wrong_predictions = topic_predictions[is_wrong]
    wrong_documents = test_x[is_topic][is_wrong]

    # Never ask for more examples than there are errors to show.
    sample_size = min(examples_per_topic, len(wrong_predictions))

    print('------ {} random erroneous predictions for {} ------'.format(sample_size, topic_code))
    print('')

    if sample_size > 0:
        # replace=False so the same tweet is never printed twice.
        sampled_positions = np.random.choice(len(wrong_predictions), sample_size, replace=False)
    else:
        # np.random.choice can raise on an empty population, so skip the call
        # when this topic has no misclassified tweets.
        sampled_positions = []

    # Use a separate loop variable so the topic label is not overwritten.
    for position in sampled_positions:
        print(wrong_documents[position])
        print('')
        print('Above classified as {}'.format(int_to_topic_code[wrong_predictions[position]]))
        print('--')
    print('')

------ 5 random erroneous predictions for brexit ------

william dalrymple seem damn american good rest thing know

Above classified as fake_news
--
watch space wise kick road let see

Above classified as winter_olympics
--
surprise play safe

Above classified as hurricane_harvey
--
good news

Above classified as fake_news
--


Above classified as hurricane_harvey
--

------ 5 random erroneous predictions for gaza ------

super compelling read

Above classified as winter_olympics
--
thought prayer go family victim lose life today

Above classified as hurricane_harvey
--
wish hopefull morning friend 🙂 childhood child hope ..

Above classified as hurricane_harvey
--
breaking report death

Above classified as fake_news
--
eye-witness

Above classified as hurricane_harvey
--

------ 5 random erroneous predictions for fake_news ------

free £2.00 walk bet get owen smith alan moran fbpe abtv political scrapbook

Above classified as brexit
--
russian embassy uk best teenage social medium mana