# Data Exploration

## Load imports.

In [1]:
from collections import defaultdict
import numpy as np
from sklearn.naive_bayes import BernoulliNB

from reuters_parser import load_data
from sentence_utils import remove_stop_words_and_lemmatize
from term_document_matrix import TermDocumentMatrixCreator
from evaluation import calculate_classification_accuracy

## Useful lookup tables.

In [2]:
# Single source of truth for the topics used in this notebook.
# Each entry is (Reuters topic code, human-readable topic name); the position
# of an entry in this tuple is the integer class label assigned to that topic.
_TOPICS = (
    ('GCRIM', 'CRIME, LAW ENFORCEMENT'),
    ('E11', 'ECONOMIC PERFORMANCE'),
    ('GVOTE', 'ELECTIONS'),
    ('GHEA', 'HEALTH'),
    ('GREL', 'RELIGION'),
    ('GSPO', 'SPORTS'),
)

# Topic code -> human-readable topic name.
topic_code_to_topic_dict = dict(_TOPICS)

# Topic code -> integer class label (0..5, in the order listed above).
topic_code_to_int = {
    code: label for label, (code, _name) in enumerate(_TOPICS)
}

# Integer class label -> topic code (exact inverse of topic_code_to_int).
int_to_topic_code = {
    label: code for code, label in topic_code_to_int.items()
}

## Load the train and test data.

Load the articles.

In [12]:
def print_number_of_articles_per_topic(dataset, dataset_name):
    """
    Print a small report of how many articles each topic contains.

    dataset is a dictionary keyed by topic code whose values are the
    collections of articles for that topic; dataset_name is the title shown
    in the banner line.  Topic codes are translated to readable names via the
    notebook-level topic_code_to_topic_dict, so a code missing from that
    table raises KeyError.  Nothing is returned.
    """
    banner = '------------------ {} ------------------'.format(dataset_name)
    for heading_line in ('', banner, ''):
        print(heading_line)

    running_total = 0
    for code, topic_articles in dataset.items():
        topic_name = topic_code_to_topic_dict[code]
        article_count = len(topic_articles)
        print('Number of articles for topic {}: {}'.format(topic_name, article_count))
        running_total += article_count

    print('')
    print('Total number of articles: {}'.format(running_total))

# Load one year of articles (start date 19960820, end date 19970819) for the
# topics listed in topic_code_to_topic_dict, then report the per-topic counts.
year_data = load_data(
    '19960820',
    '19970819',
    '../../../../downloads/reuters/rcv1/',
    topic_code_to_topic_dict,
)

print_number_of_articles_per_topic(
    year_data,
    'Data for a Year August 96 to August 97',
)


------------------ Data for a Year Aug 96 to Aug 97 ------------------

Number of articles for topic CRIME, LAW ENFORCEMENT: 30276
Number of articles for topic SPORTS: 35200
Number of articles for topic RELIGION: 2287
Number of articles for topic ELECTIONS: 10940
Number of articles for topic ECONOMIC PERFORMANCE: 8452
Number of articles for topic HEALTH: 4999

Total number of articles: 92154


Lemmatize and remove stopwords from each news article.

In [15]:
def sanitise_each_topic(dataset):
    """
    Apply remove_stop_words_and_lemmatize to every article of every topic.

    dataset is a dictionary keyed by topic code whose values are iterables of
    articles.  The result is a defaultdict(list) with the same topic codes,
    each mapped to the list of processed articles in their original order.
    A topic that has no articles does not get an entry in the result.
    """
    cleaned_by_topic = defaultdict(list)

    for code, raw_articles in dataset.items():
        cleaned_articles = [
            remove_stop_words_and_lemmatize(raw_article)
            for raw_article in raw_articles
        ]
        # Only store non-empty results so that topics without articles stay
        # absent from the mapping instead of appearing with an empty list.
        if cleaned_articles:
            cleaned_by_topic[code] = cleaned_articles

    return cleaned_by_topic


# Run the stop-word removal / lemmatization pass over every loaded article,
# keeping the per-topic grouping of year_data.
year_data_sanitised = sanitise_each_topic(year_data)

Flatten the per-topic dictionary into shuffled article and label arrays, then split them 80/20 into train and test sets.

In [16]:
# Fix the global NumPy seed so the shuffle performed by
# convert_dictionary_to_array is repeatable from one run to the next.
np.random.seed(42)

def convert_dictionary_to_array(dataset):
    """
    Flatten a per-topic dictionary into two aligned, shuffled NumPy arrays.

    dataset maps each topic code to the list of articles for that topic.
    Returns a tuple (x_data, y_data): x_data holds every article and y_data
    holds, at the same position, the integer label of that article's topic as
    given by the notebook-level topic_code_to_int table.  Both arrays are
    reordered with one shared random permutation drawn from NumPy's global
    random state, so article/label pairs stay together.
    """
    all_articles = []
    all_labels = []

    for code, topic_articles in dataset.items():
        all_articles += topic_articles
        all_labels += [topic_code_to_int[code]] * len(topic_articles)

    # One permutation of positions, applied to both arrays below.
    order = np.arange(len(all_labels))
    np.random.shuffle(order)

    return (np.array(all_articles)[order], np.array(all_labels)[order])


# Flatten and shuffle the sanitised data, then cut it once: the first 80% of
# the shuffled examples become the training set and the remaining 20% the
# test set.
x, y = convert_dictionary_to_array(year_data_sanitised)
total_examples = len(y)
split_point = int(total_examples * 0.8)

train_x, test_x = x[:split_point], x[split_point:]
train_y, test_y = y[:split_point], y[split_point:]

## Assess Bernoulli Naive Bayes baseline classification performance.

Run Bernoulli Naive Bayes on unigram features and on unigram + bigram features, and report the classification accuracy for each topic.

In [17]:
def run_bernoulli_naive_bayes(ngram_range):
    """
    Train and evaluate a Bernoulli Naive Bayes classifier for one n-gram range.

    A TermDocumentMatrixCreator is constructed from the notebook-level train_x
    with the given ngram_range and used to build term-document matrices for
    both train_x and test_x.  A BernoulliNB model is fitted on the training
    matrix with train_y and asked to predict labels for the test matrix.  The
    value returned by calculate_classification_accuracy(int_to_topic_code,
    predictions, test_y) is printed, preceded by a line naming the n-gram
    range and followed by a blank line.  Nothing is returned.
    """
    print('Word n-grams {}'.format(ngram_range))

    # The same creator builds both matrices, so train and test are produced
    # by one creator instance that was constructed from the training articles.
    matrix_builder = TermDocumentMatrixCreator(train_x, ngram_range=ngram_range)
    train_matrix = matrix_builder.create_term_document_matrix(train_x)
    test_matrix = matrix_builder.create_term_document_matrix(test_x)

    classifier = BernoulliNB().fit(train_matrix, train_y)
    predicted_labels = classifier.predict(test_matrix)

    print(calculate_classification_accuracy(int_to_topic_code, predicted_labels, test_y))
    print('')

# Evaluate single-word features first (1, 1), then single words together with
# two-word n-grams (1, 2).
for ngram_range in [(1, 1), (1, 2)]:
    run_bernoulli_naive_bayes(ngram_range=ngram_range)

Word n-grams (1, 1)
{'GCRIM': 0.9634989548158868, 'E11': 0.9501779359430605, 'GVOTE': 0.8431641518061271, 'GHEA': 0.7994041708043694, 'GREL': 0.5161290322580645, 'GSPO': 0.9565091330820528}

Word n-grams (1, 2)
{'GCRIM': 0.9868146004180737, 'E11': 0.3232502965599051, 'GVOTE': 0.4288980338363054, 'GHEA': 0.0019860973187686196, 'GREL': 0.0, 'GSPO': 0.9878225572629747}

