# Multi-Label

Import libraries

In [15]:
import numpy as np
import pandas as pd
import re
import logging

from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

Define constants (dataset sizes, label names, file paths) and configure logging

In [16]:
# --- Input files -----------------------------------------------------------
TRAIN_FILE = 'data/train-data.dat'
LABEL_TRAIN_FILE = 'data/train-label.dat'
TEST_FILE = 'data/test-data.dat'
LABEL_TEST_FILE = 'data/test-label.dat'

# --- Dataset dimensions ----------------------------------------------------
# VOCAB_SIZE is the column count of the matrices built by ReadValues;
# TRAIN_SIZE / TEST_SIZE are the row counts passed for the two data files.
VOCAB_SIZE = 8520
TRAIN_SIZE = 8251
TEST_SIZE = 3983

# --- Label names -----------------------------------------------------------
# Order matters: PartA pairs position i in this list with column i of the
# label matrices when it reports per-label accuracy.
labels = [
    'programming', 'style', 'reference', 'java', 'web',
    'internet', 'culture', 'design', 'education', 'language',
    'books', 'writing', 'computer', 'english', 'politics',
    'history', 'philosophy', 'science', 'religion', 'grammar',
]

# --- Logging ---------------------------------------------------------------
# Root logger at DEBUG level; each record shows a millisecond timestamp, the
# level, and the name of the function that emitted it.
logging.basicConfig(
    format='%(asctime)s.%(msecs)03d %(levelname)s - %(funcName)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.DEBUG,
)

Function to read and preprocess input data

In [17]:
def ReadValues(filename, file_size, vocab_size=None):
    """Read a token-index file into a binary document-by-word matrix.

    Every line of ``filename`` is treated as one document. The line is split
    on whitespace; tokens that start with ``'<'`` are skipped, and each
    remaining token is parsed as an integer column index whose cell is set
    to 1 in that document's row. Lines are mapped to rows in file order
    (blank lines still consume a row).

    Args:
        filename: Path of the data file to read.
        file_size: Number of rows to allocate, i.e. the expected line count.
        vocab_size: Number of columns to allocate. When omitted, the
            module-level ``VOCAB_SIZE`` is used, which matches the previous
            hard-coded behaviour.

    Returns:
        A ``(file_size, vocab_size)`` NumPy array of 0.0 / 1.0 values.

    Raises:
        IndexError: If an index token appears on a line past ``file_size``
            or is out of bounds for ``vocab_size``.
        ValueError: If a token that does not start with ``'<'`` is not an
            integer.
    """
    if vocab_size is None:
        vocab_size = VOCAB_SIZE

    # Rows are documents, columns are dictionary words; start from all zeros.
    data = np.zeros((file_size, vocab_size))
    with open(filename) as file:
        for row, line in enumerate(file):
            for part in line.split():
                if part[0] != '<':
                    # Mark this word as present in this document.
                    data[row, int(part)] = 1
    return data

Function to read input labels

In [18]:
def ReadLabels(filename):
    """Read a whitespace-delimited, header-less label file into an array.

    Malformed rows (rows with more fields than expected) are dropped with a
    warning instead of aborting the read.

    Args:
        filename: Path of the label file to read.

    Returns:
        A 2-D NumPy array with one row per parsed line of the file.
    """
    # sep=r'\s+' is the documented equivalent of delim_whitespace=True and
    # works on every pandas version, unlike the deprecated flag.
    common = dict(header=None, sep=r'\s+')
    try:
        # pandas >= 1.3: replacement for error_bad_lines=False (which was
        # removed in pandas 2.0). 'warn' keeps the old default of reporting
        # each skipped line.
        frame = pd.read_csv(filename, on_bad_lines='warn', **common)
    except TypeError:
        # pandas < 1.3 does not know on_bad_lines; use the legacy keyword.
        frame = pd.read_csv(filename, error_bad_lines=False, **common)
    return frame.values

Function to read training and test data

In [19]:
def ReadData():
    """Load the training and test sets from the configured files.

    Returns:
        A 4-tuple ``(train_x, train_y, test_x, test_y)``: the feature matrix
        and label array for the training files, followed by the same pair
        for the test files.
    """
    # Training split: features first, then labels.
    train_split = (
        ReadValues(filename=TRAIN_FILE, file_size=TRAIN_SIZE),
        ReadLabels(filename=LABEL_TRAIN_FILE),
    )

    # Test split, read the same way.
    test_split = (
        ReadValues(filename=TEST_FILE, file_size=TEST_SIZE),
        ReadLabels(filename=LABEL_TEST_FILE),
    )

    # Concatenating the two pairs yields (train_x, train_y, test_x, test_y).
    return train_split + test_split

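Function that trains and evaluates a one-vs-rest model: `OneVsRestClassifier` fits one `RandomForestClassifier` per label (that label vs. all the others), predicts on the test data, and logs the overall accuracy followed by the accuracy for each label.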

In [20]:
def PartA(n_estimators=10, random_state=None):
    """Train a one-vs-rest random forest and log its accuracy on the test set.

    The data is loaded with ``ReadData``, a ``OneVsRestClassifier`` wrapping
    a ``RandomForestClassifier`` is fitted on the training split, and the
    predictions for the test split are scored with ``accuracy_score``.

    Two kinds of score are logged:

    * the score over the full label matrix -- for multilabel input
      scikit-learn documents this as *subset accuracy* (a sample counts only
      if every one of its labels is predicted correctly), so it is expected
      to be much lower than the per-label numbers;
    * one accuracy per entry of ``labels``, computed on the matching column.

    Args:
        n_estimators: Number of trees in each per-label forest. The default
            of 10 matches the previously hard-coded value.
        random_state: Seed forwarded to ``RandomForestClassifier``. The
            default ``None`` keeps the original unseeded behaviour; pass an
            integer to make runs repeatable.

    Returns:
        None. Results are emitted through ``logging`` only.
    """
    # NOTE: all logging calls stay in this function on purpose -- the log
    # format includes %(funcName)s, so moving them would change the output.
    logging.info("Reading data")
    x, y, test_x, test_y = ReadData()

    logging.info("Training model")
    forest = RandomForestClassifier(n_estimators=n_estimators,
                                    random_state=random_state)
    clf = OneVsRestClassifier(forest)
    clf.fit(x, y)
    logging.info("Predicting on test data")
    y_predicted = clf.predict(test_x)

    logging.info("Results:")
    acc = accuracy_score(test_y, y_predicted)
    logging.info('Accuracy {0}:'.format(str(acc)))
    logging.info('For each label:')
    # Column i of the label matrices is assumed to correspond to labels[i].
    for index, label in enumerate(labels):
        acc = accuracy_score(test_y[:, index], y_predicted[:, index])
        logging.info('Accuracy {0}: {1}'.format(label, str(acc)))

Run PartA function

In [21]:
# Run the whole pipeline (read, train, predict, score). Training is the slow
# step: roughly a minute and a half in the recorded log output below.
PartA()

2019-05-17 22:01:30.910 INFO - PartA: Reading data
2019-05-17 22:01:31.968 INFO - PartA: Training model
2019-05-17 22:03:03.562 INFO - PartA: Predicting on test data
2019-05-17 22:03:06.760 INFO - PartA: Results:
2019-05-17 22:03:06.766 INFO - PartA: Accuracy 0.05699221692191815:
2019-05-17 22:03:06.767 INFO - PartA: For each label:
2019-05-17 22:03:06.768 INFO - PartA: Accuracy programming: 0.8312829525483304
2019-05-17 22:03:06.770 INFO - PartA: Accuracy style: 0.9470248556364549
2019-05-17 22:03:06.771 INFO - PartA: Accuracy reference: 0.6226462465478283
2019-05-17 22:03:06.772 INFO - PartA: Accuracy java: 0.9133818729600803
2019-05-17 22:03:06.774 INFO - PartA: Accuracy web: 0.7574692442882249
2019-05-17 22:03:06.776 INFO - PartA: Accuracy internet: 0.8659302033642983
2019-05-17 22:03:06.777 INFO - PartA: Accuracy culture: 0.8182274667336179
2019-05-17 22:03:06.779 INFO - PartA: Accuracy design: 0.7446648255084107
2019-05-17 22:03:06.781 INFO - PartA: Accuracy education: 0.81119758