# Naive Bayes

## Load imports.

In [1]:
from collections import defaultdict
import numpy as np

# Make common scripts visible
import sys
sys.path.append('../common/')

from reuters_parser import load_data
from sentence_utils import remove_stop_words_and_lemmatize
from conversion import convert_dictionary_to_array
from classification import run_bernoulli_naive_bayes
from lookup_tables import topic_code_to_topic_dict, topic_code_to_int, int_to_topic_code

## Load the train and test data.

Load the articles.

In [2]:
def print_number_of_articles_per_topic(dataset, dataset_name):
    """
    Prints how many articles each topic holds, followed by the overall total.

    dataset: mapping from topic code to a sized collection of articles.
    dataset_name: label shown in the banner line above the counts.

    Topic codes are translated to display names through the module-level
    topic_code_to_topic_dict, so every key of dataset must be present there.
    """
    banner = '------------------ {} ------------------'.format(dataset_name)
    # Blank line, banner, blank line.
    print('\n' + banner + '\n')

    article_total = 0
    for code, topic_articles in dataset.items():
        topic_name = topic_code_to_topic_dict[code]
        count = len(topic_articles)
        print('Number of articles for topic {}: {}'.format(topic_name, count))
        article_total += count

    # Blank line, then the grand total.
    print('\n' + 'Total number of articles: {}'.format(article_total))

# The full range for this experiment is '19960820' to '19970819'; the call below
# loads the shorter range '19960820' to '19960830' instead.
# NOTE(review): the banner text passed to the summary still says
# "a Year August 96 to August 97" -- confirm which range is intended.
year_data = load_data(
    '19960820',
    '19960830',
    '../../../downloads/reuters/rcv1/',
    topic_code_to_topic_dict)

print_number_of_articles_per_topic(year_data, 'Data for a Year August 96 to August 97')


------------------ Data for a Year August 96 to August 97 ------------------

Number of articles for topic CRIME, LAW ENFORCEMENT: 1037
Number of articles for topic SPORTS: 808
Number of articles for topic RELIGION: 69
Number of articles for topic ELECTIONS: 319
Number of articles for topic ECONOMIC PERFORMANCE: 272
Number of articles for topic HEALTH: 158

Total number of articles: 2663


Lemmatize and remove stopwords from each news article.

In [3]:
def sanitise_each_topic(dataset):
    """
    Builds a sanitised version of the dataset, topic by topic.

    Each article is passed through remove_stop_words_and_lemmatize and the
    results are collected under the same topic code, in the order the articles
    were iterated. Topics that hold no articles do not appear in the result.

    Returns a defaultdict(list) mapping topic code -> list of sanitised articles.
    """
    sanitised_by_topic = defaultdict(list)

    for topic_code, articles in dataset.items():
        cleaned_articles = [remove_stop_words_and_lemmatize(article) for article in articles]
        # Only touch the defaultdict when there is something to store, so that
        # a topic without articles does not get an empty entry.
        if cleaned_articles:
            sanitised_by_topic[topic_code].extend(cleaned_articles)

    return sanitised_by_topic


# Sanitise every article of the loaded dataset, keeping the per-topic grouping.
year_data_sanitised = sanitise_each_topic(year_data)

Convert the dictionary to arrays and split them into 80% train and 20% test sets.

In [4]:
# Fix NumPy's global seed before the arrays are built.
# NOTE(review): nothing in this cell draws random numbers directly; presumably
# convert_dictionary_to_array shuffles the examples -- confirm.
np.random.seed(42)

x, y = convert_dictionary_to_array(year_data_sanitised, topic_code_to_int)

# The first 80% of the examples are used for training, the remaining 20% for testing.
total_examples = len(y)
split_point = int(0.8 * total_examples)
train_x, test_x = x[:split_point], x[split_point:]
train_y, test_y = y[:split_point], y[split_point:]

## Assess Bernoulli Naive Bayes baseline classification performance.

Run Bernoulli Naive Bayes and report per-topic precision, recall and F1-score on the test set.

In [5]:
# Train on the training split and evaluate on the test split with
# ngram_range=(1, 1) (presumably unigrams only -- see run_bernoulli_naive_bayes).
# NOTE(review): the topic names are passed in the iteration order of
# topic_code_to_topic_dict; presumably this has to line up with the integer
# labels produced via topic_code_to_int -- confirm.
report = run_bernoulli_naive_bayes(
    train_x, train_y,
    test_x, test_y,
    topic_code_to_topic_dict.values(),
    ngram_range=(1, 1))
print(report)

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.720149  0.979695  0.830108       197
  ECONOMIC PERFORMANCE   0.950000  0.666667  0.783505        57
             ELECTIONS   0.837209  0.679245  0.750000        53
                HEALTH   1.000000  0.150000  0.260870        40
              RELIGION   1.000000  0.181818  0.307692        11
                SPORTS   0.982759  0.977143  0.979943       175

             micro avg   0.836773  0.836773  0.836773       533
             macro avg   0.915020  0.605761  0.652020       533
          weighted avg   0.869370  0.836773  0.812852       533



In [9]:
# Peek at the first ten labels of the test split.
print(test_y[:10])

[0 0 5 5 0 3 2 2 5 0]
