# Multi-Label-Learning for DeliciousMIL

## Imports

In [1]:
import os
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import zero_one_loss, coverage_error, label_ranking_loss, label_ranking_average_precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

## Data Preprocessing

In [2]:
# Locate the dataset folder relative to the current working directory and
# normalise any backslashes to forward slashes.
my_path = os.getcwd()
dataset_dir = (my_path + '/DeliciousMIL/Data/').replace('\\', '/')

# For every split there is one raw ``.dat`` input and two ``.txt`` outputs.
# NOTE(review): the ``.txt`` / ``2.txt`` files are presumably the results of
# remove_tags and replace_multiple_spaces respectively -- the calls that
# create them are not visible in this notebook, confirm before a fresh run.
input_train_data, output_train_data, output_train_data2 = (
    dataset_dir + file_name
    for file_name in ('train-data.dat', 'train-data.txt', 'train-data2.txt')
)
input_test_data, output_test_data, output_test_data2 = (
    dataset_dir + file_name
    for file_name in ('test-data.dat', 'test-data.txt', 'test-data2.txt')
)

input_train_label, output_train_label, output_train_label2 = (
    dataset_dir + file_name
    for file_name in ('train-label.dat', 'train-label.txt', 'train-label2.txt')
)
input_test_label, output_test_label, output_test_label2 = (
    dataset_dir + file_name
    for file_name in ('test-label.dat', 'test-label.txt', 'test-label2.txt')
)

In [3]:
def remove_tags(input_file, output_file):
    """Copy ``input_file`` to ``output_file`` with every ``<...>`` span removed.

    A span starts at ``<``, ends at the next ``>`` and contains at least one
    character in between. The whole input is read into memory before the
    output file is opened.

    Parameters
    ----------
    input_file : path of the text file to read.
    output_file : path of the text file to (over)write.
    """
    with open(input_file, 'r') as source:
        raw_text = source.read()

    tag_regex = re.compile(r'<[^>]+>')
    stripped_text = tag_regex.sub('', raw_text)

    with open(output_file, 'w') as target:
        target.write(stripped_text)


def replace_multiple_spaces(input_file, output_file):
    """Write a whitespace-normalised copy of ``input_file`` to ``output_file``.

    Every input line is split on runs of whitespace and re-joined with single
    spaces, which also drops leading and trailing whitespace. Each output
    line is terminated by exactly one newline, including the last one.

    The input is read completely before the output is opened. A missing
    input therefore raises without creating or truncating ``output_file``,
    and passing the same path for both arguments rewrites the file in place
    instead of emptying it.

    Parameters
    ----------
    input_file : path of the text file to read.
    output_file : path of the text file to (over)write.
    """
    with open(input_file, 'r') as source:
        normalised_lines = [' '.join(line.split()) for line in source]

    with open(output_file, 'w') as target:
        target.writelines(line + '\n' for line in normalised_lines)

In [4]:
def _read_lines(path):
    """Return the lines of the text file ``path`` without their newline characters."""
    with open(path, 'r') as handle:
        return [line.replace('\n', '') for line in handle]


def _read_label_rows(path):
    """Return one integer NumPy array per line of the text file ``path``.

    Each line is split on single spaces and every field is converted with
    ``int``; a blank line or a doubled space therefore raises ``ValueError``
    rather than silently producing a malformed row.
    """
    return [
        np.array([int(field) for field in line.split(" ")])
        for line in _read_lines(path)
    ]


# Documents: one string per line of the cleaned data files.
corpus_X_train = _read_lines(output_train_data2)
corpus_X_test = _read_lines(output_test_data2)

# Labels: train and test are loaded the same way, one integer array per line.
corpus_y_train = _read_label_rows(output_train_label2)
corpus_y_test = _read_label_rows(output_test_label2)

TfidfVectorizer is used to vectorize the training and test sets.

In [5]:
# Fit the vectorizer on the training corpus only, then reuse the fitted
# vectorizer to transform the test corpus.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(corpus_X_train)
X_test = vectorizer.transform(corpus_X_test)

# Convert the per-document label rows into NumPy arrays.
y_train, y_test = (
    np.array(label_rows) for label_rows in (corpus_y_train, corpus_y_test)
)

## Comparison between the following multi-label models:
<ul>
    <li>Classifier Chain (Logistic Regression)</li>
    <li>Ensemble Method (Random Forest Classifier)</li>
</ul>

In [6]:
# Short display names and the estimators they label, paired by position.
names = ['CC', 'RF']
classifiers = [
    ClassifierChain(LogisticRegression(random_state=0)),
    # Seeded like the logistic-regression base estimator so that the forest's
    # metrics are reproducible from one run of the notebook to the next.
    RandomForestClassifier(n_estimators=200, random_state=0),
]

In [7]:
def _positive_label_scores(model, features):
    """Return ``model.predict_proba(features)`` as a single 2-D score array.

    Some estimators return one array for all labels, which is passed through
    unchanged. Others return a list with one array per label; for those,
    column 1 of every per-label array is stacked column-wise so that the
    result has one column per label.
    """
    proba = model.predict_proba(features)
    if isinstance(proba, list):
        # NOTE(review): column 1 is taken to be the score of the positive
        # class, which assumes every label has both classes in the training
        # data -- confirm for this dataset.
        return np.column_stack([label_proba[:, 1] for label_proba in proba])
    return proba


for name, clf in zip(names, classifiers):
    print(name)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Regression-style errors computed on the label matrices. Assuming both
    # y_test and y_pred hold only 0/1 values, MSE and MAE are the same number
    # (the fraction of wrongly predicted labels).
    print('MSE: %.2f' % (mean_squared_error(y_test, y_pred)))
    print('MAE: %.2f' % (mean_absolute_error(y_test, y_pred)))
    print(' R2: %.2f' % (r2_score(y_test, y_pred)))

    print(classification_report(y_test, y_pred, zero_division='warn'))
    print('subset accuracy   : %.2f' % (1 - zero_one_loss(y_test, y_pred)))

    # The ranking metrics need one score per (sample, label); the layout of
    # predict_proba is detected from its return type instead of from the
    # model's display name.
    y_proba = _positive_label_scores(clf, X_test)

    print('coverage error    : %.2f' % (coverage_error(y_test, y_proba)))
    print('ranking loss      : %.2f' % (label_ranking_loss(y_test, y_proba)))
    print('average precision : %.2f' % (label_ranking_average_precision_score(y_test, y_proba)))
  

CC
MSE: 0.12
MAE: 0.12
 R2: -0.00
              precision    recall  f1-score   support

           0       0.85      0.58      0.69       977
           1       0.94      0.14      0.24       228
           2       0.67      0.39      0.49      1558
           3       0.81      0.47      0.60       372
           4       0.71      0.37      0.48      1050
           5       0.48      0.14      0.22       537
           6       0.61      0.12      0.20       702
           7       0.76      0.26      0.39      1079
           8       0.71      0.30      0.42       803
           9       0.78      0.27      0.40       483
          10       0.73      0.21      0.33       507
          11       0.86      0.13      0.23       478
          12       0.61      0.09      0.15       509
          13       0.70      0.17      0.27       355
          14       0.76      0.24      0.36       392
          15       0.72      0.06      0.12       441
          16       0.66      0.09      0.16    

### Results

Reviewing the results above, the Classifier Chain (Logistic Regression) model performs slightly better than the Random Forest in terms of MSE, MAE, and R2. It also outperforms the Random Forest in subset accuracy, coverage error, ranking loss, and average precision.