In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import json

from sklearn import svm, ensemble
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import sklearn.multiclass

from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT
from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled

In [2]:
path_to_train_data = "data/imclef07d/imclef07d_train.csv"
path_to_test_data = "data/imclef07d/imclef07d_test.csv"

path_to_data_hf = "data/imclef07d/imclef07d.hf"

path_to_class_hierarchy = 'data/imclef07d/class_hierarchy.json'

In [3]:
df_train = pd.read_csv(path_to_train_data, index_col=0)
df_train['labels'] = df_train['labels'].apply(lambda x: x.split(','))
df_train.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,72,73,74,75,76,77,78,79,80,labels
0,0.596529,-0.284171,0.254657,-0.1629,-0.007383,0.009371,-0.024586,-0.132,-0.487257,-0.0217,...,0.210829,0.523171,0.275814,0.708057,0.174729,0.0073,0.440557,-0.407543,0.166243,"[15, D, 2B]"
1,0.0251,0.001543,0.397514,0.122814,-0.007383,-0.133486,0.403986,0.010857,0.084171,0.144967,...,0.353686,0.380314,0.275814,0.422343,0.317586,-0.135557,0.154843,0.592457,0.166243,"[15, D, 2B]"
2,-0.260614,0.001543,0.540371,0.408529,-0.007383,0.009371,0.118271,0.010857,0.084171,0.144967,...,0.210829,0.237457,0.418671,0.422343,0.174729,0.0073,0.583414,0.306743,0.023386,"[15, D, 2B]"
3,-0.403471,-0.284171,0.1118,-0.1629,0.159283,-0.133486,-0.024586,-0.132,0.227029,-0.188367,...,0.210829,0.237457,-0.295614,-0.149086,-0.3967,0.0073,0.726271,-0.121829,0.023386,"[15, D, 2B]"
4,0.167957,-0.141314,0.254657,0.122814,-0.007383,-0.4192,0.118271,-0.560571,-0.3444,-0.188367,...,-0.074886,0.380314,0.418671,-0.006229,-0.539557,-0.135557,0.011986,-0.264686,-0.119471,"[15, D, 2B]"


In [4]:
df_test = pd.read_csv(path_to_test_data, index_col=0)
df_test['labels'] = df_test['labels'].apply(lambda x: x.split(','))
df_test.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,72,73,74,75,76,77,78,79,80,labels
0,-0.208179,-0.260721,0.531241,-0.314825,-0.012757,-0.261573,0.290116,0.445754,-0.359273,0.177734,...,0.643283,0.374751,0.428287,0.332671,-0.245385,0.297785,0.720676,0.451008,0.433598,"[15, D, 2B]"
1,0.077535,-0.117864,-0.040187,-0.171968,-0.179423,0.166998,0.147259,0.445754,-0.073559,0.177734,...,0.214712,0.231894,0.571145,0.332671,-0.388242,0.01207,0.434962,-0.406135,0.433598,"[15, D, 2B]"
2,0.077535,0.16785,-0.040187,-0.029111,-0.012757,-0.261573,0.147259,-0.411389,-0.073559,-0.222266,...,0.357569,0.517609,0.428287,0.666004,0.183187,0.01207,0.149247,0.165294,0.233598,"[15, D, 2B]"
3,-0.065322,-0.117864,-0.183045,0.542318,0.15391,0.024141,0.004402,0.017183,-0.216416,0.377734,...,0.071855,0.231894,0.571145,0.499337,-0.531099,0.01207,0.149247,-0.406135,0.033598,"[15, D, 2B]"
4,0.648963,0.024993,-0.040187,-0.029111,0.320577,0.309855,-0.281312,-0.268532,-0.50213,-0.022266,...,-0.071003,-0.05382,-0.000284,-0.000663,-0.388242,0.01207,-0.279324,-0.406135,-0.166402,"[15, D, 2B]"


In [5]:
with open(path_to_class_hierarchy, 'r') as f:
    class_hierarchy = json.load(f)

class_hierarchy[ROOT] = class_hierarchy['19'] 
class_hierarchy.pop('19')
pass

In [6]:
class_hierarchy

{'15': ['16', 'D'],
 '16': ['1D', '1F', '22', '23', '2A'],
 'D': ['E', 'F', '13', '2B'],
 '1A': ['1B', '1C'],
 '1B': ['1E'],
 '1C': ['2D'],
 '2': ['3', '7', 'A', '5', 'B'],
 '3': ['4'],
 '7': ['8'],
 'A': ['26', '27'],
 '5': ['6', '9', 'C'],
 'B': ['2C'],
 '10': ['11', '12', '14', '17', '18', '0', '24'],
 '11': ['20'],
 '12': ['2E'],
 '14': ['29'],
 '17': ['28'],
 '18': ['21'],
 '0': ['1'],
 '24': ['25'],
 '<ROOT>': ['15', '1A', '2', '10']}

In [7]:
y_train = df_train['labels']
X_train = df_train.drop(columns=['labels'])

y_test = df_test['labels']
X_test = df_test.drop(columns=['labels'])

In [8]:
# y_train.head(), X_train.head()

# Hierarchical classification - LCPN

LCPN - Local Classifier per Parent Node

In [9]:
base_estimator = make_pipeline(
    TruncatedSVD(n_components=30),
    svm.SVC(
        gamma='scale',
        kernel="sigmoid",
        probability=True
    ),
)
clf_lcpn = HierarchicalClassifier(
    base_estimator=base_estimator,
    class_hierarchy=class_hierarchy,
    algorithm='lcpn'
)

In [11]:
%%time

clf_lcpn.fit(X_train, y_train.apply(lambda x: x[-1]))

CPU times: user 47.4 s, sys: 3.5 s, total: 50.9 s
Wall time: 46.3 s


HierarchicalClassifier(algorithm='lcpn',
                       base_estimator=Pipeline(memory=None,
                                               steps=[('truncatedsvd',
                                                       TruncatedSVD(algorithm='randomized',
                                                                    n_components=30,
                                                                    n_iter=5,
                                                                    random_state=None,
                                                                    tol=0.0)),
                                                      ('svc',
                                                       SVC(C=1.0,
                                                           break_ties=False,
                                                           cache_size=200,
                                                           class_weight=None,
                                                 

In [12]:
y_pred = clf_lcpn.predict(X_test)

In [13]:
print("Classification Report:\n", classification_report(y_test.apply(lambda x: x[-1]), y_pred))

Classification Report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00        13
          13       0.00      0.00      0.00         5
          1D       0.00      0.00      0.00         1
          1E       0.00      0.00      0.00        12
          1F       0.62      0.94      0.75       198
          20       0.00      0.00      0.00        13
          21       0.00      0.00      0.00         4
          22       0.37      0.85      0.51       255
          23       0.25      0.50      0.33         8
          25       0.00      0.00      0.00         2
          26       0.07      0.02      0.03        59
          27       0.93      0.15      0.26        87
          28       0.00      0.00      0.00        15
          29       0.00      0.00      0.00        10
          2A       0.00      0.00      0.00         3
          2B       0.31      0.03      0.05       154
          2C       0.09      0.05      0.07        57
   

  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
with multi_labeled(y_test.apply(lambda x: x[-1]), y_pred, clf_lcpn.graph_) as (y_test_, y_pred_, graph_):
    h_fbeta = h_fbeta_score(
        y_test_,
        y_pred_,
        graph_,
    )
    print("h_fbeta_score: ", h_fbeta)

h_fbeta_score:  0.6571479564519008


# Hierarchical classification - LCN

LCN - Local Classifier per Node - Binary classification for each node in class hierarchy

In [15]:
mlb = sklearn.preprocessing.MultiLabelBinarizer()


In [16]:
clf_lcn = sklearn.multiclass.OneVsRestClassifier(estimator=base_estimator)

In [17]:
%%time

clf_lcn.fit(X_train, mlb.fit_transform(y_train))

CPU times: user 4min 4s, sys: 11.5 s, total: 4min 15s
Wall time: 3min 57s


OneVsRestClassifier(estimator=Pipeline(memory=None,
                                       steps=[('truncatedsvd',
                                               TruncatedSVD(algorithm='randomized',
                                                            n_components=30,
                                                            n_iter=5,
                                                            random_state=None,
                                                            tol=0.0)),
                                              ('svc',
                                               SVC(C=1.0, break_ties=False,
                                                   cache_size=200,
                                                   class_weight=None, coef0=0.0,
                                                   decision_function_shape='ovr',
                                                   degree=3, gamma='scale',
                                                   kernel='sigmoid',

In [18]:
y_pred = clf_lcn.predict(X_test)
# y_pred = mlb.inverse_transform(y_pred)

In [19]:
print("Classification Report:\n", classification_report(mlb.transform(y_test), y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        13
           1       0.00      0.00      0.00        13
           2       0.07      0.05      0.05        66
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00         9
           5       0.00      0.00      0.00         5
           6       0.33      0.10      0.15        10
           7       0.72      0.71      0.72       642
           8       0.43      0.41      0.42       465
           9       0.00      0.00      0.00        15
          10       0.00      0.00      0.00         4
          11       0.24      0.29      0.26        21
          12       0.00      0.00      0.00        12
          13       0.86      0.67      0.75         9
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00        12
          16       0.59      0.60      0.59       198
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
with multi_labeled(y_test, mlb.inverse_transform(y_pred), clf_lcpn.graph_) as (y_test_, y_pred_, graph_):
    h_fbeta = h_fbeta_score(
        y_test_,
        y_pred_,
        graph_,
    )
    print("h_fbeta_score: ", h_fbeta)

h_fbeta_score:  0.47195357833655704


# Hierarchical classification - LCL

In [21]:
clf_lcl1 = sklearn.base.clone(base_estimator)
clf_lcl2 = sklearn.base.clone(base_estimator)
clf_lcl3 = sklearn.base.clone(base_estimator)

In [22]:
%%time

clf_lcl1.fit(X_train, y_train.apply(lambda x: x[0]))
clf_lcl2.fit(X_train, y_train.apply(lambda x: x[1]))
clf_lcl3.fit(X_train, y_train.apply(lambda x: x[2]))

CPU times: user 1min 15s, sys: 797 ms, total: 1min 16s
Wall time: 1min 15s


Pipeline(memory=None,
         steps=[('truncatedsvd',
                 TruncatedSVD(algorithm='randomized', n_components=30, n_iter=5,
                              random_state=None, tol=0.0)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='sigmoid', max_iter=-1,
                     probability=True, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [23]:
def predict_lcl(clf1, clf2, clf3, X_test):
    y_pred1 = clf1.predict(X_test)
    y_pred2 = clf2.predict(X_test)
    y_pred3 = clf3.predict(X_test)
    return list(zip(y_pred1, y_pred2, y_pred3))

y_pred = predict_lcl(clf_lcl1, clf_lcl2, clf_lcl3, X_test)

In [24]:
# print("Classification Report:\n", classification_report(y_test, y_pred))

In [25]:
with multi_labeled(y_test, y_pred, clf_lcpn.graph_) as (y_test_, y_pred_, graph_):
    h_fbeta = h_fbeta_score(
        y_test_,
        y_pred_,
        graph_,
    )
    print("h_fbeta_score: ", h_fbeta)

h_fbeta_score:  0.5886095122690093


# Big Bang classification

In [26]:
%%time

clf_big_bang = make_pipeline(
            TruncatedSVD(n_components=30),
            sklearn.ensemble.RandomForestClassifier()
        )
mlb = sklearn.preprocessing.MultiLabelBinarizer()
clf_big_bang.fit(X_train, mlb.fit_transform(y_train))
y_pred = clf_big_bang.predict(X_test)
y_pred = mlb.inverse_transform(y_pred)

CPU times: user 59.5 s, sys: 478 ms, total: 60 s
Wall time: 1min 1s


In [27]:
with multi_labeled(y_test, y_pred, clf_lcpn.graph_) as (y_test_, y_pred_, graph_):
    h_fbeta = h_fbeta_score(
        y_test_,
        y_pred_,
        graph_,
    )
    print("h_fbeta_score: ", h_fbeta)

h_fbeta_score:  0.7341425215348474


# Flat classification

In [28]:
clf_flat = base_estimator

In [29]:
%%time

clf_flat = clf_flat.fit(X_train, y_train.apply(lambda x: x[-1]))

CPU times: user 38.2 s, sys: 232 ms, total: 38.4 s
Wall time: 39.6 s


In [31]:
y_pred = clf_flat.predict(X_test)

In [32]:
print("Classification Report:\n", classification_report(y_test.apply(lambda x: x[-1]), y_pred))

Classification Report:
               precision    recall  f1-score   support

           1       0.25      0.08      0.12        13
          13       0.00      0.00      0.00         5
          1D       0.00      0.00      0.00         1
          1E       0.18      0.33      0.24        12
          1F       0.76      0.84      0.80       198
          20       0.12      0.23      0.15        13
          21       0.75      0.75      0.75         4
          22       0.47      0.71      0.56       255
          23       0.60      0.38      0.46         8
          25       0.00      0.00      0.00         2
          26       0.50      0.29      0.37        59
          27       0.92      0.89      0.90        87
          28       0.67      0.13      0.22        15
          29       0.80      0.40      0.53        10
          2A       0.00      0.00      0.00         3
          2B       0.51      0.42      0.46       154
          2C       0.22      0.11      0.14        57
   

  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
with multi_labeled(y_test.apply(lambda x: x[-1]), y_pred, clf_lcpn.graph_) as (y_test_, y_pred_, graph_):
    h_fbeta = h_fbeta_score(
        y_test_,
        y_pred_,
        graph_,
    )
    print("h_fbeta_score: ", h_fbeta)

h_fbeta_score:  0.7125176803394627
