# Unsupervised Classification

## Load imports.

In [1]:
# Make the common scripts and the unsupervised classifier code importable
import sys
sys.path.append('../common/')
sys.path.append('../kb-classifier/')

import numpy as np
from collections import defaultdict

from reuters_parser import load_data
from lookup_tables import topic_code_to_topic_dict, topic_code_to_int
from sentence_utils import remove_stop_words_and_lemmatize
from conversion import convert_dictionary_to_array
from classifier_runner import run_unsupervised_classifier

## Load the data.

In [2]:
def sanitise_each_topic(dataset):
    """
    Strip stop words from every article while keeping the per-topic grouping.

    Each article is passed through remove_stop_words_and_lemmatize with
    lowercasing and lemmatisation both switched off, so stop-word removal is
    the only cleaning requested.

    :param dataset: mapping of topic code -> iterable of articles.
    :returns: defaultdict(list) of topic code -> list of sanitised articles,
              in the original order. A topic with no articles gets no entry.
    """
    def strip_stop_words(text):
        return remove_stop_words_and_lemmatize(text, lowercase=False, lemmatize=False)

    cleaned_by_topic = defaultdict(list)

    for code, raw_articles in dataset.items():
        # map() is lazy, so each article is cleaned right before it is stored.
        for cleaned in map(strip_stop_words, raw_articles):
            cleaned_by_topic[code].append(cleaned)

    return cleaned_by_topic

# Corpus directory and the two date strings handed to load_data
# (they look like YYYYMMDD start/end dates -- confirm against reuters_parser).
# The end date below covers only a short window; the full-year run used '19970819' instead.
corpus_dir = '../../../downloads/reuters/rcv1/'
first_day, last_day = '19960820', '19960830'

year_data = load_data(first_day, last_day, corpus_dir, topic_code_to_topic_dict)
year_data_sanitised = sanitise_each_topic(year_data)

For an accurate comparison with the Naive Bayes classifier, keep the last 20% of the documents using the same random seed, i.e. make predictions on the same test set.

In [3]:
# Seed NumPy's global RNG before the arrays are built so the split is repeatable.
# NOTE(review): presumably convert_dictionary_to_array shuffles using this RNG -- confirm.
RANDOM_SEED = 42
TRAIN_FRACTION = 0.8
np.random.seed(RANDOM_SEED)

x, y = convert_dictionary_to_array(year_data_sanitised, topic_code_to_int)

# Only the trailing 20% (the test portion) is kept; the leading 80% is not used in this notebook.
total_examples = len(y)
split_point = int(total_examples * TRAIN_FRACTION)
test_x, test_y = x[split_point:], y[split_point:]

print('Making predictions for {} documents'.format(len(test_y)))

Making predictions for 533 documents


## Assess unsupervised classifier performance.

In [4]:
# Integer class label paired with the Wikipedia topic names that count as that class.
# NOTE(review): labels presumably line up with topic_code_to_int -- confirm.
wiki_topics_by_label = (
    (0, ('Crime', 'Law')),
    (1, ('Business', 'Economics')),
    (2, ('Elections', 'Politics')),
    (3, ('Health', 'Medicine')),
    (4, ('Religion', 'Theology')),
    (5, ('Sports',)),
)

# Flatten the grouping into the lookup used below: Wikipedia topic name -> class label.
wiki_topics_to_actual_topics = {
    wiki_topic: label
    for label, wiki_topics in wiki_topics_by_label
    for wiki_topic in wiki_topics
}

# Run the unsupervised classifier over the held-out documents and keep its report.
# NOTE(review): the output saved with this cell is `KeyError: 0`, so the call did not
# complete when the notebook was last run. The failing lookup is inside
# run_unsupervised_classifier (not visible here); check how it uses the integer labels
# and the dict view passed as the third argument before relying on `report`.
topic_dict_values = topic_code_to_topic_dict.values()
report = run_unsupervised_classifier(
    test_x,
    test_y,
    topic_dict_values,
    wiki_topics_to_actual_topics,
)

KeyError: 0

In [None]:
# NOTE(review): unexecuted scratch cell (de-duplicating a list of ints); nothing else in
# the notebook uses it, so it looks safe to delete before sharing.
{1, 2, 1, 3, 4, 2}