# Data Exploration

## Load imports.

In [40]:
from collections import defaultdict
import numpy as np

from reuters_parser import load_data
from sentence_utils import remove_stop_words_and_lemmatize
from term_document_matrix import TermDocumentMatrix

## Useful lookup tables.

In [32]:
# Topic code -> human-readable topic name.
topic_code_to_topic_dict = dict(
    GCRIM='CRIME, LAW ENFORCEMENT',
    E11='ECONOMIC PERFORMANCE',
    GVOTE='ELECTIONS',
    GHEA='HEALTH',
    GREL='RELIGION',
    GSPO='SPORTS',
)

# Topic code -> integer class label (0 to 5), paired up positionally.
topic_code_to_int = dict(zip(
    ('GCRIM', 'E11', 'GVOTE', 'GHEA', 'GREL', 'GSPO'),
    range(6),
))

# Inverse of topic_code_to_int: integer class label -> topic code.
int_to_topic_code = dict(
    (label, code) for code, label in topic_code_to_int.items()
)

## Load the train and test data.

In [29]:
def print_number_of_articles_per_topic(dataset, dataset_name):
    """
    Prints one article-count line per topic in dataset, then the overall total.

    dataset is a dictionary keyed by topic code whose values are the articles for
    that topic; dataset_name is the label shown in the banner line.  Topic codes are
    translated to readable names through topic_code_to_topic_dict.
    """
    banner = '------------------ {} ------------------'.format(dataset_name)
    count_line = 'Number of articles for topic {}: {}'

    print('')
    print(banner)
    print('')

    counts = []
    for code, topic_articles in dataset.items():
        topic_name = topic_code_to_topic_dict[code]
        n_articles = len(topic_articles)
        print(count_line.format(topic_name, n_articles))
        counts.append(n_articles)

    print('')
    print('Total number of articles: {}'.format(sum(counts)))

# Load both splits with reuters_parser.load_data.  The first two arguments look like
# YYYYMMDD start/end dates and the third is the directory holding that split --
# TODO confirm against load_data's signature.  Each result is used below as a
# dictionary keyed by topic code.
train_data = load_data('19960820', '19960825', '../data/train/', topic_code_to_topic_dict)
test_data = load_data('19960826', '19960902', '../data/test/', topic_code_to_topic_dict)

# Report the per-topic and total article counts for each split.
print_number_of_articles_per_topic(train_data, 'Train Data')
print_number_of_articles_per_topic(test_data, 'Test Data')


------------------ Train Data ------------------

Number of articles for topic CRIME, LAW ENFORCEMENT: 470
Number of articles for topic SPORTS: 436
Number of articles for topic RELIGION: 26
Number of articles for topic ELECTIONS: 132
Number of articles for topic ECONOMIC PERFORMANCE: 103
Number of articles for topic HEALTH: 69

Total number of articles: 1236

------------------ Test Data ------------------

Number of articles for topic CRIME, LAW ENFORCEMENT: 666
Number of articles for topic RELIGION: 54
Number of articles for topic SPORTS: 649
Number of articles for topic HEALTH: 101
Number of articles for topic ECONOMIC PERFORMANCE: 205
Number of articles for topic ELECTIONS: 238

Total number of articles: 1913


## Assess Bernoulli Naive Bayes baseline classification performance.

Lemmatize and remove stopwords from each news article.

In [30]:
def sanitise_each_topic(dataset):
    """
    Runs every article of every topic through remove_stop_words_and_lemmatize.

    Returns a defaultdict(list) mapping each topic code to its processed articles, in
    their original order.  A topic that has no articles gets no entry in the result.
    """
    cleaned_by_topic = defaultdict(list)

    # Flatten to (topic code, article) pairs so a single loop handles every article.
    code_article_pairs = (
        (code, raw_article)
        for code, topic_articles in dataset.items()
        for raw_article in topic_articles
    )
    for code, raw_article in code_article_pairs:
        cleaned_by_topic[code].append(remove_stop_words_and_lemmatize(raw_article))

    return cleaned_by_topic


# Apply the same per-article preprocessing to both splits.
train_data_sanitised = sanitise_each_topic(train_data)
test_data_sanitised = sanitise_each_topic(test_data)

Flatten each per-topic dictionary into shuffled, aligned arrays of articles and integer topic labels.

In [53]:
# Seed numpy's global random state so the np.random.shuffle call inside
# convert_dictionary_to_array produces the same ordering on every run.
np.random.seed(42)

def convert_dictionary_to_array(dataset):
    """
    Flattens a per-topic dataset into two aligned, shuffled numpy arrays.

    dataset maps a topic code to the list of articles for that topic.  Returns a tuple
    (x_data, y_data): x_data holds every article and y_data holds, at the same position,
    that article's integer label from topic_code_to_int.  Both arrays are reordered with
    one shared random permutation drawn from numpy's global random state.
    """
    all_articles, all_labels = [], []

    for code, topic_articles in dataset.items():
        all_articles += topic_articles
        label = topic_code_to_int[code]
        all_labels += [label] * len(topic_articles)

    # One permutation applied to both arrays keeps articles and labels aligned.
    order = np.arange(len(all_labels))
    np.random.shuffle(order)

    return (np.array(all_articles)[order], np.array(all_labels)[order])

# Build the shuffled article/label arrays for the preprocessed training split.
train_x, train_y = convert_dictionary_to_array(train_data_sanitised)

[0 5 3 ... 1 5 1]
