In [17]:
import numpy as np
import pandas as pd

import xml.etree.ElementTree as ET
import os

In [2]:
# Directory containing the Mulan files; '~' is expanded to the user's home.
DATA_PATH = os.path.expanduser('~/Documents/datasets/mulan')

# Base names (without extension) of the CSV files processed below,
# grouped one dataset family per row.
DATASETS = [
    'birds-test', 'birds-train',
    'CAL500',
    'emotions', 'emotions-test', 'emotions-train',
    'mediamill', 'mediamill-test', 'mediamill-train',
    'yeast', 'yeast-test', 'yeast-train',
]

# Dataset families; a '<name>.xml' file is read for each of them below.
CLASSES = ['birds', 'CAL500', 'emotions', 'mediamill', 'yeast']

In [3]:
# Map each dataset family to the list of names found in its XML file.
classes = {}
for cls in CLASSES:
    # The XML file is looked up as DATA_PATH/<family>.xml.
    xmlfile = os.path.join(DATA_PATH, cls + '.xml')
    tree = ET.parse(xmlfile)
    root = tree.getroot()

    # Collect the 'name' attribute of every direct child of the root element.
    label_names = []
    for label_node in root:
        label_names.append(label_node.attrib['name'])
    classes[cls] = label_names

# Display the resulting mapping as the cell output.
classes

{'CAL500': ['Angry-Agressive',
  'NOT-Emotion-Angry-Agressive',
  'Emotion-Arousing-Awakening',
  'NOT-Emotion-Arousing-Awakening',
  'Emotion-Bizarre-Weird',
  'NOT-Emotion-Bizarre-Weird',
  'Emotion-Calming-Soothing',
  'NOT-Emotion-Calming-Soothing',
  'Emotion-Carefree-Lighthearted',
  'NOT-Emotion-Carefree-Lighthearted',
  'Emotion-Cheerful-Festive',
  'NOT-Emotion-Cheerful-Festive',
  'Emotion-Emotional-Passionate',
  'NOT-Emotion-Emotional-Passionate',
  'Emotion-Exciting-Thrilling',
  'NOT-Emotion-Exciting-Thrilling',
  'Emotion-Happy',
  'NOT-Emotion-Happy',
  'Emotion-Laid-back-Mellow',
  'NOT-Emotion-Laid-back-Mellow',
  'Emotion-Light-Playful',
  'NOT-Emotion-Light-Playful',
  'Emotion-Loving-Romantic',
  'NOT-Emotion-Loving-Romantic',
  'Emotion-Pleasant-Comfortable',
  'NOT-Emotion-Pleasant-Comfortable',
  'Emotion-Positive-Optimistic',
  'NOT-Emotion-Positive-Optimistic',
  'Emotion-Powerful-Strong',
  'NOT-Emotion-Powerful-Strong',
  'Emotion-Sad',
  'NOT-Emotion-Sad',


In [4]:
# For every CSV in DATASETS: load it, keep the full frame in `dframes`, and
# write the non-label columns to '<name>_X.csv' and the label columns to
# '<name>_Y.csv' in the same directory.
dframes = {}

for ds in DATASETS:
    csvfile = os.path.join(DATA_PATH, ds + '.csv')
    # Backslash is treated as the escape character while parsing.
    df = pd.read_csv(csvfile, escapechar='\\')
    # Remove single quotes surrounding the column names.
    df.columns = [name.strip("'") for name in df.columns]

    # Reduce e.g. 'birds-train' to its family name 'birds', which is the key
    # used in `classes`.
    actualset = ds.split('.')[0].split('-')[0]
    label_cols = classes[actualset]
    # Every label listed for the family must be a column of this CSV.
    assert set(label_cols).issubset(df.columns)

    dframes[ds] = df
    data = df.drop(label_cols, axis=1)
    labels = df[label_cols]

    features_path = os.path.join(DATA_PATH, ds + '_X.csv')
    labels_path = os.path.join(DATA_PATH, ds + '_Y.csv')
    data.to_csv(features_path, index=False)
    labels.to_csv(labels_path, index=False)

The birds dataset contains only train and test sets, but no concatenated full set, so we build the full set here by concatenating the two.

In [5]:
# Build the full birds set by stacking the train and test CSVs (train rows
# first), then split it into features and labels like the other datasets.
birds = pd.concat(
    [
        pd.read_csv(os.path.join(DATA_PATH, 'birds-train.csv'), escapechar='\\'),
        pd.read_csv(os.path.join(DATA_PATH, 'birds-test.csv'), escapechar='\\'),
    ],
    # Renumber the rows 0..n-1. Without this both halves keep their own
    # 0-based index and the combined frame has duplicate row labels, which
    # makes label-based row access on dframes['birds'] ambiguous.
    ignore_index=True,
)
# Remove single quotes surrounding the column names.
birds.columns = [s.strip("'") for s in birds.columns]

# Same sanity check as for the per-file datasets: every label listed for
# 'birds' must be a column of the combined frame.
missing = [c for c in classes['birds'] if c not in birds.columns]
assert not missing, 'birds: label columns missing from CSV: {}'.format(missing)

data = birds.drop(classes['birds'], axis=1)
labels = birds[classes['birds']]
dframes['birds'] = birds

# The index is not written, so the output files contain only the data columns.
data.to_csv(os.path.join(DATA_PATH, 'birds_X.csv'), index=False)
labels.to_csv(os.path.join(DATA_PATH, 'birds_Y.csv'), index=False)