# Data Exploration

## Imports.

In [1]:
from collections import defaultdict
import numpy as np
from sklearn.naive_bayes import BernoulliNB

from reuters_parser import load_data
from sentence_utils import remove_stop_words_and_lemmatize
from term_document_matrix import TermDocumentMatrixCreator
from evaluation import calculate_classification_accuracy

## Useful lookup tables.

In [2]:
# Human-readable name for every Reuters topic code used in this notebook.
topic_code_to_topic_dict = dict([
    ('GCRIM', 'CRIME, LAW ENFORCEMENT'),
    ('E11', 'ECONOMIC PERFORMANCE'),
    ('GVOTE', 'ELECTIONS'),
    ('GHEA', 'HEALTH'),
    ('GREL', 'RELIGION'),
    ('GSPO', 'SPORTS'),
])

# Integer class label for every topic code. The position of a code in the
# tuple is its label, so the order of the tuple must not change.
topic_code_to_int = dict(zip(
    ('GCRIM', 'E11', 'GVOTE', 'GHEA', 'GREL', 'GSPO'),
    range(6),
))

# Reverse lookup (class label -> topic code), obtained by inverting the
# mapping above so the two tables cannot disagree.
int_to_topic_code = {label: code for code, label in topic_code_to_int.items()}

## Load the train and test data.

In [4]:
def print_number_of_articles_per_topic(dataset, dataset_name):
    """
    Prints a short report of how many articles each topic holds.

    dataset is a dictionary keyed by topic code whose values are the
    collections of articles for that topic; dataset_name is the label shown
    in the report banner.  One line is printed per topic (using the readable
    name from topic_code_to_topic_dict), followed by the overall total.
    """
    banner = '------------------ {} ------------------'.format(dataset_name)
    # The banner is framed by one blank line above and one below.
    for line in ('', banner, ''):
        print(line)

    counts = []
    for code, topic_articles in dataset.items():
        count = len(topic_articles)
        print('Number of articles for topic {}: {}'.format(topic_code_to_topic_dict[code], count))
        counts.append(count)

    print('')
    print('Total number of articles: {}'.format(sum(counts)))

# Root directory of the local RCV1 copy.  Defined once so that both splits
# are read from the same place and the path only has to be changed here.
REUTERS_DATA_DIR = '../../../../downloads/reuters/rcv1/'

# NOTE(review): the test bounds ('19960930' - '19961002') fall inside the
# train bounds ('19960820' - '19970819').  If load_data treats these as
# inclusive date ranges, test articles may also be present in the training
# set - confirm against load_data before trusting the accuracies below.
train_data = load_data('19960820', '19970819', REUTERS_DATA_DIR, topic_code_to_topic_dict)
test_data = load_data('19960930', '19961002', REUTERS_DATA_DIR, topic_code_to_topic_dict)

# Report the per-topic article counts of each split.
print_number_of_articles_per_topic(train_data, 'Train Data')
print_number_of_articles_per_topic(test_data, 'Test Data')


------------------ Train Data ------------------

Number of articles for topic SPORTS: 2421
Number of articles for topic ELECTIONS: 1115
Number of articles for topic ECONOMIC PERFORMANCE: 703
Number of articles for topic CRIME, LAW ENFORCEMENT: 2244
Number of articles for topic RELIGION: 238
Number of articles for topic HEALTH: 460

Total number of articles: 7181

------------------ Test Data ------------------

Number of articles for topic ECONOMIC PERFORMANCE: 123
Number of articles for topic SPORTS: 244
Number of articles for topic HEALTH: 57
Number of articles for topic CRIME, LAW ENFORCEMENT: 282
Number of articles for topic ELECTIONS: 105
Number of articles for topic RELIGION: 10

Total number of articles: 821


## Assess Bernoulli Naive Bayes baseline classification performance.

Lemmatize and remove stopwords from each news article.

In [5]:
def sanitise_each_topic(dataset):
    """
    Applies remove_stop_words_and_lemmatize to every article of every topic.

    dataset is a dictionary keyed by topic code whose values are iterables of
    articles.  Returns a defaultdict(list) keyed by topic code holding the
    processed articles in their original order.  A topic that has no
    articles gets no entry in the result.
    """
    cleaned_by_topic = defaultdict(list)

    # Flatten to (topic, processed article) pairs; topics and articles are
    # visited in the same order as a plain nested loop would visit them.
    processed_pairs = (
        (code, remove_stop_words_and_lemmatize(raw_article))
        for code, topic_articles in dataset.items()
        for raw_article in topic_articles
    )
    for code, cleaned_article in processed_pairs:
        cleaned_by_topic[code].append(cleaned_article)

    return cleaned_by_topic


# Both splits go through the same clean-up helper.
train_data_sanitised = sanitise_each_topic(dataset=train_data)
test_data_sanitised = sanitise_each_topic(dataset=test_data)

Convert dictionaries to arrays.

In [6]:
# Seed NumPy's global random generator so that the np.random.shuffle calls
# made by convert_dictionary_to_array are repeatable when the notebook is
# re-run from the top.
np.random.seed(42)

def convert_dictionary_to_array(dataset):
    """
    Flattens a per-topic dataset dictionary into two shuffled, aligned arrays.

    dataset is keyed by topic code, each value being the list of articles for
    that topic.  Returns a tuple (x_data, y_data): x_data is an array of all
    articles and y_data holds, at the same positions, the integer label of
    each article's topic as given by topic_code_to_int.  Both arrays are
    reordered with one shared random permutation drawn from NumPy's global
    random state.
    """
    all_articles, all_labels = [], []

    for code, topic_articles in dataset.items():
        all_articles += topic_articles
        all_labels += [topic_code_to_int[code]] * len(topic_articles)

    # A single permutation is applied to both arrays so that every article
    # stays paired with its label.
    order = np.arange(len(all_labels))
    np.random.shuffle(order)

    return (np.array(all_articles)[order], np.array(all_labels)[order])

# Train is converted before test: both calls draw from the same global
# random state, so the call order determines the resulting shuffles.
train_x, train_y = convert_dictionary_to_array(dataset=train_data_sanitised)
test_x, test_y = convert_dictionary_to_array(dataset=test_data_sanitised)

Run Bernoulli Naive Bayes and report the classification accuracy for each topic.

In [11]:
def run_bernoulli_naive_bayes(ngram_range):
    """
    Trains and evaluates a BernoulliNB classifier for one n-gram setting.

    ngram_range is passed straight to TermDocumentMatrixCreator.  The matrix
    creator is built from the notebook-level train_x and then used to create
    the term-document matrices of both train_x and test_x.  The classifier is
    fitted on the train matrix with train_y, and its predictions for the test
    matrix are scored against test_y by calculate_classification_accuracy.
    The n-gram setting and the resulting score are printed; nothing is
    returned.
    """
    print('Word n-grams {}'.format(ngram_range))

    matrix_builder = TermDocumentMatrixCreator(train_x, ngram_range=ngram_range)
    # The train matrix is created before the test matrix, both by the same
    # creator instance.
    train_matrix, test_matrix = (
        matrix_builder.create_term_document_matrix(documents)
        for documents in (train_x, test_x)
    )

    classifier = BernoulliNB()
    classifier.fit(train_matrix, train_y)
    predicted_labels = classifier.predict(test_matrix)

    print(calculate_classification_accuracy(int_to_topic_code, predicted_labels, test_y))
    print('')

# Evaluate unigrams only, (1, 1), and then unigrams plus bigrams, (1, 2).
for word_ngram_range in ((1, 1), (1, 2)):
    run_bernoulli_naive_bayes(ngram_range=word_ngram_range)

Word n-grams (1, 1)
{'GCRIM': 0.9787234042553191, 'E11': 0.8943089430894309, 'GVOTE': 0.8095238095238095, 'GHEA': 0.40350877192982454, 'GREL': 0.2, 'GSPO': 0.9713114754098361}

Word n-grams (1, 2)
{'GCRIM': 0.9787234042553191, 'E11': 0.11382113821138211, 'GVOTE': 0.26666666666666666, 'GHEA': 0.0, 'GREL': 0.0, 'GSPO': 0.9918032786885246}

