In [None]:
from copy import deepcopy
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.linear_model import ElasticNetCV, LinearRegression, RidgeCV, LassoCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import check_array
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y
from sklearn.utils.validation import _check_sample_weight
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm import tqdm
from collections import defaultdict
import dvu
import pandas as pd
import matplotlib.pyplot as plt
import json
from matplotlib.colors import TwoSlopeNorm
from matplotlib.colors import Normalize
import joblib
import viz
from interpret import show

import imodels
from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor

from sklearn.base import RegressorMixin, ClassifierMixin
from imodels.algebraic.gam_multitask import MultiTaskGAMRegressor

In [None]:
# Scrape the tables from the KDIS multi-label resources page; the result is
# indexed positionally by the following cell (`html[0]`).
html = pd.read_html(
    io='https://www.uco.es/kdis/mllresources/#EnronDesc',
)

In [63]:
# Keep the first scraped table and drop its last column.
df = html[0].iloc[:, :-1]

# Flatten a multi-level header down to its top level. Only do this when the
# header really is a MultiIndex: on flat string labels, `col[0]` would
# silently keep just the first character of every column name.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = list(df.columns.get_level_values(0))

# Persist the flattened table (the row index is written as the first column).
df.to_csv('multitask.csv')

In [56]:
from scipy.io import arff
import xml.etree.ElementTree as ET
import xmltodict

# Load a Mulan-format multi-label dataset (ARFF records + XML label list)
# into a DataFrame whose label columns are tagged with a `__target` suffix.
mulan_arff = '../data/multitask/Birds_Mulan/Birds.arff'
mulan_xml = '../data/multitask/Birds_Mulan/Birds.xml'  # contains target labels

# Read the ARFF records together with their attribute metadata.
data, meta = arff.loadarff(mulan_arff)

# Parse the label definitions. The file is read as bytes so the XML parser
# applies the encoding declared in the document rather than the platform's
# default text encoding. `force_list` keeps `label` a list even when the file
# declares a single label; without it xmltodict returns a bare dict there and
# the comprehension below would iterate over its keys and fail.
with open(mulan_xml, 'rb') as xml_file:
    label_spec = xmltodict.parse(xml_file.read(), force_list=('label',))
targets = [label['@name'] for label in label_spec['labels']['label']]

# NOTE(review): the preview output further down shows the target columns
# holding bytes values (b'0' / b'1'); cast or decode downstream if numeric
# labels are needed.
df = pd.DataFrame(data)

# Every label declared in the XML must exist as a column of the ARFF data.
missing_targets = [target for target in targets if target not in df.columns]
assert not missing_targets, f'labels missing from ARFF data: {missing_targets}'

# append __target to each target column
df.columns = [
    f'{col}__target' if col in targets else col for col in df.columns]

In [57]:
# Preview the converted frame: label columns now carry the `__target` suffix.
df

Unnamed: 0,audio-ssd1,audio-ssd2,audio-ssd3,audio-ssd4,audio-ssd5,audio-ssd6,audio-ssd7,audio-ssd8,audio-ssd9,audio-ssd10,...,HermitWarbler__target,SwainsonsThrush__target,HammondsFlycatcher__target,WesternTanager__target,BlackHeadedGrosbeak__target,GoldenCrownedKinglet__target,WarblingVireo__target,MacGillivraysWarbler__target,StellarsJay__target,CommonNighthawk__target
0,0.132445,0.143931,0.227729,0.298556,0.385907,0.378363,0.354708,0.384165,0.360092,0.347465,...,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0'
1,0.101617,0.130342,0.228117,0.281017,0.365804,0.370122,0.359235,0.388608,0.362013,0.348229,...,b'0',b'0',b'1',b'0',b'0',b'0',b'0',b'0',b'0',b'0'
2,0.005148,0.017877,0.042137,0.062124,0.097340,0.088305,0.084337,0.083204,0.074532,0.071497,...,b'0',b'1',b'1',b'1',b'0',b'0',b'0',b'0',b'0',b'0'
3,0.018792,0.012898,0.027330,0.039521,0.064671,0.068329,0.065799,0.059891,0.048287,0.047820,...,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0'
4,0.007008,0.014610,0.033637,0.042604,0.065649,0.065047,0.064553,0.058155,0.048516,0.047021,...,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,0.009148,0.009075,0.015139,0.020908,0.037890,0.036355,0.038220,0.044481,0.041390,0.044327,...,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0'
641,0.025508,0.011626,0.023700,0.030874,0.047864,0.040987,0.041066,0.045088,0.040161,0.044836,...,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0'
642,0.332050,0.053668,0.123005,0.142725,0.178769,0.165174,0.161457,0.162597,0.124231,0.130416,...,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0'
643,0.009871,0.014142,0.030270,0.044325,0.065054,0.060812,0.062368,0.061929,0.055983,0.057395,...,b'0',b'1',b'0',b'0',b'0',b'1',b'0',b'0',b'0',b'0'
