# Multi-Instance

Import libraries

In [15]:
import numpy as np
import pandas as pd
import re
import logging

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

Define constants and initialize logging formatter

In [16]:
VOCAB_SIZE = 8520

MOST_FREQ = 2
MOST_FREQ_COUNT = 3181

TRAIN_FILE = 'data/train-data.dat'
TEST_FILE = 'data/test-data.dat'
LABEL_TRAIN_FILE = 'data/train-label.dat'
LABEL_TEST_FILE = 'data/test-label.dat'

SAMPLE_COEF = 100

# create formatter
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s.%(msecs)03d %(levelname)s - %(funcName)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

Function to read and preprocess input data

In [17]:
# Read values for data from the provided file for the x and y values
def ReadValues(x_filename, y_filename):

    file_ids = FileIDs(x_filename)

    # Read labels array
    y_data = pd.read_csv(y_filename, header=None, delim_whitespace=True,
                         error_bad_lines=False).values

    # Create new array to hold the transformed data
    x_data = []
    y_data_transformed = []

    index = 0
    with open(x_filename) as file:
        for line in file:
            if index % SAMPLE_COEF == 0:

                # Find the file id for this line
                file = int(re.findall('^<([0-9]*)>', line)[0])

                sentence = np.zeros((VOCAB_SIZE + 1), dtype=np.int32)
                sentence[-1] = file

                # Find label
                if y_data[index][MOST_FREQ] == 1:
                    label = 1
                else:
                    label = 0

                # Split line into parts by whitespace
                parts = line.split()
                file_id = True
                for part in parts:
                    if part[0] != '<':
                        # Set 1 for this word in this sentence
                        sentence[int(part)] = 1
                    else:
                        if file_id is False:
                            x_data.append(sentence)
                            y_data_transformed.append(label)
                            sentence = np.zeros((VOCAB_SIZE + 1), dtype=np.int32)
                            sentence[-1] = file
                        else:
                            file_id = False
            # Increase index
            index += 1
    return x_data, y_data_transformed

Calculate the predicted labels for each document

In [18]:
def CalculateResults(data, results):
    file_results = {}
    index = 0
    for result in results:
        file_id = str(data[index][-1])
        if file_id in file_results:
            file_results[file_id][result] += 1
        else:
            file_results[file_id] = [0, 0]
        index += 1
    return file_results

Function that returns the ids of all documents contained in the data

In [19]:
def FileIDs(filename):
    doc_ids = {}
    index = 0
    with open(filename) as file:
        for line in file:
            # Find the file id for this line
            file = re.findall('^<([0-9]*)>', line)[0]
            if file not in doc_ids:
                doc_ids[file] = index
                index += 1
    return doc_ids

Function to run OneVsRestClassifier to create a model for every class vs all the other classes and then train a RandomForestClassifier for each one.

In [20]:
def PartB():
    logging.info("Reading data")
    # Read train data
    train_x, train_y = ReadValues(x_filename=TRAIN_FILE, y_filename=LABEL_TRAIN_FILE)

    # Read test data
    test_x, test_y = ReadValues(x_filename=TEST_FILE, y_filename=LABEL_TEST_FILE)

    logging.info("Train SVM model")
    model = SVC(gamma='auto', class_weight="balanced")
    model.fit(train_x, train_y)

    logging.info("Predict test data")
    y_predicted = model.predict(test_x)
    acc = accuracy_score(test_y, y_predicted)
    logging.info('Accuracy {0}:'.format(str(acc)))

    logging.info(CalculateResults(test_x, y_predicted))

Run PartA function

In [21]:
PartB()

2019-05-18 00:04:54.507 INFO - PartB: Reading data
2019-05-18 00:04:54.741 INFO - PartB: Train SVM model
2019-05-18 00:05:09.313 INFO - PartB: Predict test data
2019-05-18 00:05:16.736 INFO - PartB: Accuracy 0.5986577181208054:
2019-05-18 00:05:16.738 INFO - PartB: {'31': [371, 0], '20': [39, 0], '4': [0, 11], '8': [0, 7], '19': [18, 0], '10': [0, 19], '12': [0, 23], '26': [25, 0], '29': [28, 0], '18': [0, 17], '5': [0, 14], '22': [21, 0], '30': [29, 0], '9': [0, 17], '16': [0, 15], '3': [0, 5], '25': [24, 0], '24': [23, 0], '14': [0, 13], '7': [0, 6]}
