# Data Exploration

Import the modules and helper functions used in this notebook.

In [21]:
from collections import defaultdict

from reuters_parser import load_data
from sentence_utils import remove_stop_words_and_lemmatize

Lookup table mapping each topic code to its human-readable topic name.

In [2]:
# Topic code -> human-readable topic name. Passed to load_data below and used
# when printing the per-topic article counts.
code_to_topic_dict = dict([
    ('GCRIM', 'CRIME, LAW ENFORCEMENT'),
    ('E11', 'ECONOMIC PERFORMANCE'),
    ('GVOTE', 'ELECTIONS'),
    ('GHEA', 'HEALTH'),
    ('GREL', 'RELIGION'),
    ('GSPO', 'SPORTS'),
])

Load the train and test data, then print the number of articles per topic in each set.

In [29]:
def print_number_of_articles_per_topic(dataset, dataset_name):
    """
    Prints a titled report of how many articles each topic holds, then the overall total.

    dataset maps topic codes to sized collections of articles. Each code is translated
    to its readable name through the module-level code_to_topic_dict, so a code that
    is missing from that table raises KeyError.
    """
    header = '------------------ {} ------------------'.format(dataset_name)
    # One blank line above and one below the header.
    print('\n{}\n'.format(header))

    counts = []
    for code, topic_articles in dataset.items():
        topic_name = code_to_topic_dict[code]
        count = len(topic_articles)
        print('Number of articles for topic {}: {}'.format(topic_name, count))
        counts.append(count)

    # Blank separator line, then the grand total across all topics.
    print('\nTotal number of articles: {}'.format(sum(counts)))

# Load each split from its own directory; both share the same topic lookup table.
# NOTE(review): the two string arguments look like YYYYMMDD start/end dates --
# confirm against load_data.
train_data = load_data(
    '19960820', '19960825', '../data/train/', code_to_topic_dict
)
test_data = load_data(
    '19960826', '19960902', '../data/test/', code_to_topic_dict
)

# Report the per-topic article counts for each split.
print_number_of_articles_per_topic(dataset=train_data, dataset_name='Train Data')
print_number_of_articles_per_topic(dataset=test_data, dataset_name='Test Data')


------------------ Train Data ------------------

Number of articles for topic CRIME, LAW ENFORCEMENT: 470
Number of articles for topic SPORTS: 436
Number of articles for topic RELIGION: 26
Number of articles for topic ELECTIONS: 132
Number of articles for topic ECONOMIC PERFORMANCE: 103
Number of articles for topic HEALTH: 69

Total number of articles: 1236

------------------ Test Data ------------------

Number of articles for topic CRIME, LAW ENFORCEMENT: 666
Number of articles for topic RELIGION: 54
Number of articles for topic SPORTS: 649
Number of articles for topic HEALTH: 101
Number of articles for topic ECONOMIC PERFORMANCE: 205
Number of articles for topic ELECTIONS: 238

Total number of articles: 1913


Assess Bernoulli Naive Bayes baseline classification performance.

In [30]:
def sanitise_each_topic(dataset):
    """
    Removes stop words and lemmatizes all articles for each topic.

    Args:
        dataset: mapping of topic code -> iterable of articles. Each article is
            passed as-is to remove_stop_words_and_lemmatize.

    Returns:
        A defaultdict(list) mapping every topic code in dataset to the list of
        sanitised articles, in their original order. A topic with no articles
        is kept with an empty list rather than being dropped, so iterating the
        result yields the same topics as iterating dataset.
    """
    data_sanitised = defaultdict(list)

    for topic_code, articles in dataset.items():
        # Indexing the defaultdict creates the key even when `articles` is
        # empty, so empty topics still show up in the result.
        data_sanitised[topic_code].extend(
            remove_stop_words_and_lemmatize(article) for article in articles
        )

    return data_sanitised
        

# Sanitise both splits with the same routine: train first, then test.
train_data_sanitised, test_data_sanitised = map(
    sanitise_each_topic, (train_data, test_data)
)