In [1]:
from openml import datasets, tasks, runs, flows, setups, config
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import sys

config.apikey = '8e2079508005379adb9a60798696134e'
config.server = 'https://www.openml.org/api/v1'
config.set_cache_directory(os.path.expanduser('~/openml-cache'))

In [3]:
# select and filter flows made by scikit learn
flowsJson = flows.list_flows()
flowList = pd.DataFrame.from_dict(flowsJson, orient='index')[['id','name']]

## Pipeline Template
A pipeline template contains information about the created pipelines in the framework. 
It contains an id, name, list of transformations, the model, and the framework the pipeline is desgined in.
The pipeline tempalte does not indicate an actual trained pipeline, it is the architecture of the pipeline.

To Do:
- Extract Model name from the data
- Extract a list of parameter names for the Pipeline Template

In [4]:
pipelines = flowList.loc[flowList.name.str.startswith('sklearn.pipeline')]
pipelines = pipelines.assign(parsed = pipelines.name.map(lambda x: x[x.find('(') + 1:x.find(')') + 1]).map(lambda s : s.split(',')))
pipelines = pipelines.assign(p_length = pipelines.parsed.map(lambda p: len(p)))
pipelines = pipelines.query('p_length > 1')
pipelines.parsed = pipelines.parsed.map(lambda pi : map(lambda p: p[p.find('=') + 1 :], pi))
pipelines.parsed = pipelines.parsed.map(lambda pi : map(lambda p: p.strip('()'), pi))
PipelineTemplate = pipelines[['id','name','parsed']].rename(columns = {'parsed':'transfromations'})
# We are only supporting scikit learn for now
PipelineTemplate['framework']='sklearn'

In [5]:
PipelineTemplate.head()

Unnamed: 0,id,name,transfromations,framework
5432,5432,sklearn.pipeline.Pipeline(sklearn.preprocessin...,"[sklearn.preprocessing.imputation.Imputer, skl...",sklearn
5505,5505,sklearn.pipeline.Pipeline(steps__scale=sklearn...,"[sklearn.preprocessing.data.RobustScaler, skle...",sklearn
5507,5507,sklearn.pipeline.Pipeline(steps__imputer=sklea...,"[sklearn.preprocessing.imputation.Imputer, skl...",sklearn
5520,5520,sklearn.pipeline.Pipeline(steps__Imputer=sklea...,"[sklearn.preprocessing.imputation.Imputer, skl...",sklearn
5591,5591,sklearn.pipeline.Pipeline(Imputer=sklearn.prep...,"[sklearn.preprocessing.imputation.Imputer, skl...",sklearn


## Training Log
Contains a log every execution training.
To simply and reduce size, exact duplicates are marked with a counts parameter at every row.

In [6]:
TRAINING_LOGS_LOC = '../../data/training-logs.csv'
def readOrDownloadLogs():
    if (os.path.exists(TRAINING_LOGS_LOC)):
        training_logs_table = pd.read_csv(TRAINING_LOGS_LOC)
        return (training_logs_table)
    else:
        size = 10000
        offset = 0
        rl = runs.list_runs(flow=flow_ids, size = size, offset = offset)
        experiments = pd.DataFrame.from_dict(rl, orient='index')
        try:
            while(0 < 1):
                offset = offset + size
                rl = runs.list_runs(flow=flow_ids, size = size, offset = offset)
                experiments = appendRun(experiments, rl)
        except Exception:
            print('finished reading')
        training_logs_table = experiments.groupby(['setup_id', 'flow_id','task_id']).size().reset_index(name='counts')
        training_logs_table.to_csv(TRAINING_LOGS_LOC, index=False)
        return (training_logs_table)

TrainingLog = readOrDownloadLogs()

In [7]:
TrainingLog.head()

Unnamed: 0,setup_id,flow_id,task_id,counts
0,29246,5591,59,23
1,29288,5653,145677,1
2,29293,5662,145677,1
3,29294,5663,145677,1
4,30231,5743,145677,1


In [8]:
taskIds = TrainingLog.task_id.unique()
flowIds = TrainingLog.flow_id.unique()
setupIds = TrainingLog.setup_id.unique()

In [9]:
MAPPING_LOC = '../../data/mapping.csv'
def readOrDownloadMapping():
    if (os.path.exists(MAPPING_LOC)):
        mapping_table = pd.read_csv(MAPPING_LOC)
        return (mapping_table)
    else:
        taskList = []
        for t in taskIds:
            #print 'retreiving task: ' + str(t)
            try:
                task = tasks.get_tasks(task_ids=[t])
                taskList.append(task)
            except Exception:
                print 'error in task: ' + str (t)
        datasetIds = []
        mapping = dict()
        for t in taskList:
            mapping[t[0].task_id] = (t[0].dataset_id) 

        # mapping table to change the task id in Pipeline table to dataset
        mapping_table = pd.DataFrame.from_dict(mapping,orient='index')
        mapping_table['task'] = mapping_table.index
        mapping_table = mapping_table.rename(columns = {0:'dataset'})
        mapping_table.to_csv(MAPPING_LOC, index=False)
    return (mapping_table)
    
Mapping = readOrDownloadMapping()

In [10]:
Mapping.head()

Unnamed: 0,dataset,task
0,1,1
1,2,2
2,4,4
3,5,5
4,6,6


## Pipeline Table
A pipeline, as opposed to a pipeline template, is a concrete pipeline that is trained on a specific dataset with a set of hyperparameters (with values) and transformations.
It has an id, id of the pipeline template, a setup id (indicating the list of parameters and their values), and a dataset id.

To Do:
- There still multiple instances of a Pipeline Template and Dataset that have different setup id. We need to investigate whether, different setup ids always indicate different setup of parameter values or it is possible that the parameter values are the same for some of the setup ids. In that case we should remove the duplicate setup ids.

In [11]:
p = TrainingLog[['flow_id','setup_id','task_id']]\
    .merge(Mapping, left_on = 'task_id', right_on ='task')[['flow_id','dataset','setup_id']]\
    .drop_duplicates()
p['id'] = p.apply(lambda i : str(i[0])+str(i[1])+str(i[2]), axis=1)

In [12]:
Pipeline = p.rename(columns={'flow_id':'pipeline_template', 'setup_id':'setup'})[['id','pipeline_template','setup','dataset']]

In [13]:
def pEquals(p1, p2):
    if p1.data_type != p2.data_type or\
    p1.default_value != p2.default_value or\
    p1.flow_id != p2.flow_id or\
    p1.full_name != p2.full_name or\
    p1.id != p2.id or\
    p1.parameter_name != p2.parameter_name or\
    p1.value != p2.value:
        return False
    else:
        return True 
    
def sEquals(s1,s2):
    if len(s1.parameters) != len(s1.parameters):
        return False
    elif s1.parameters.keys() != s2.parameters.keys():
        return False
    else:
        for k in s1.parameters.keys():
            p1 = s1.parameters.get(k)
            p2 = s2.parameters.get(k)
            if not pEquals(p1,p2):
                return False
    
    return True

## Transformation Template Table 
The transformation table includes the information about specific transformations, their parameters, and the framework the belong to. This is table contains the static information about the Transformations.

To Do:
- Extract list of parameters for each Transformation

In [14]:
allTransformations = set()
PipelineTemplate.transfromations.map(lambda ts : [allTransformations.add(a) for a in ts])
Transformation = pd.DataFrame(list(allTransformations), columns=['name'])
Transformation['id'] = Transformation.index
Transformation['framework']='sklearn'
Transformation['parameters'] = ""
Transformation = Transformation[['id','name','parameters','framework']]

In [15]:
Transformation.head()

Unnamed: 0,id,name,parameters,framework
0,0,__main__.LightClassifier,,sklearn
1,1,sklearn.ensemble.voting_classifier.VotingClass...,,sklearn
2,2,sklearn.ensemble.voting_classifier.VotingClass...,,sklearn
3,3,sklearn.neural_network.multilayer_perceptron.M...,,sklearn
4,4,sklearn.ensemble.voting_classifier.VotingClass...,,sklearn


## Parameter Table
The parameter table contains a list of all the parameters, (their type information), and their default values.
The current version only support transformation available in scikit-learn Currently, the table has the following format:

|id            | name     | full_name | default_value | type |
|:------------ |:--------------|:------|:----------------|:----------------|
|id |  name| Full Name| Default value|Type information|

TODO:
- Ensure that all that every setup for a specific flow has the exact same set of parameters


In [16]:
PARAMETER_LOC = '../../data/parameter.csv'
def readOrDownloadParams():
    if (os.path.exists(PARAMETER_LOC)):
        parameter_table = pd.read_csv(PARAMETER_LOC)
        return (parameter_table)
    else:
        # have to check these to make sure that every setup for a flow contains exactly the same set of parameters
        all_setups = Pipeline[['pipeline_template','setup']]
        unique_parameters = all_setups.drop_duplicates(subset=['pipeline_template'])
        all_setups = setups.list_setups(setup=unique_parameters.setup)
        parameter_table = pd.DataFrame(columns=['id','name','full_name','default_value','type'])
        for sk, sv in all_setups.iteritems():
            for pk, pv in sv.parameters.iteritems():
                parameter_table.loc[parameter_table.shape[0]] = [pv.id,pv.parameter_name,pv.full_name,pv.default_value,pv.data_type]
        parameter_table.to_csv(PARAMETER_LOC, index=False)
        return (parameter_table)
    
Parameter = readOrDownloadParams()

In [17]:
Parameter.head()

Unnamed: 0,id,name,full_name,default_value,type
0,53952,iterated_power,sklearn.decomposition.pca.PCA(1)_iterated_power,"""auto""",
1,53953,n_components,sklearn.decomposition.pca.PCA(1)_n_components,,
2,53954,random_state,sklearn.decomposition.pca.PCA(1)_random_state,,
3,53955,svd_solver,sklearn.decomposition.pca.PCA(1)_svd_solver,"""auto""",
4,53956,tol,sklearn.decomposition.pca.PCA(1)_tol,0.0,


## Dataset Table
The Dataset table contains metadata about the existing datasets in the training runs.
First from the pipeline table, the tasks are extracted, then the correspoding dataset is retreived from OpenML.
The table has the following schema:

|id            | name     | NumberOfClasses | NumberOfFeatures | NumberOfInstances |
|:------------ |:--------------|:------|:----------------|:----------------|
|id |  name| Number of classes| Number of Features |Number of training instances|

TODO:
- OpenML contains more information about the Datasets, check if any other information are needed

In [18]:
DATASET_LOC = '../../data/dataset.csv'
def readOrDownloadDatasets():
    if (os.path.exists(DATASET_LOC)):
        dataset_table = pd.read_csv(DATASET_LOC)
        return (dataset_table)
    else:
        ds = datasets.list_datasets()
        datasetTable = pd.DataFrame.from_dict(ds, orient='index')
        existingDatasets = datasetTable[datasetTable.did.isin(Mapping.dataset.unique())]
        dataset_table = existingDatasets.rename(columns = {"did":"id"})[['id','name', 'NumberOfClasses','NumberOfFeatures','NumberOfInstances']]
        dataset_table.to_csv(DATASET_LOC, index=False)
        return (dataset_table)
Dataset = readOrDownloadDatasets()

In [20]:
Dataset.head()

Unnamed: 0,id,name,NumberOfClasses,NumberOfFeatures,NumberOfInstances
0,2,anneal,5,39,898
1,4,labor,2,17,57
2,5,arrhythmia,13,280,452
3,6,letter,26,17,20000
4,10,lymph,4,19,148


## Setup Table
An intermediary table that comes between the Pipeline and the Parameter table. Each entry in the Setup Id is mapped to on entry in the Pipeline table and includes a list of all the parameters and their (run time) values for that specific instance of the Pipeline exeuction.

In [None]:
SETUP_LOC = '../../data/setups.csv'
def readOrDownloadSetups():
    if (os.path.exists(SETUP_LOC)):
        setup_table = pd.read_csv(SETUP_LOC)
        return (setup_table)
    setup_table = pd.DataFrame(columns=['id','pipeline','parameters'])
    size = 100
    offset = 0
    setup_dict = setups.list_setups(setup = setupIds[offset:(offset+size)])
    for s in setup_dict.values():
        params = dict()
        for k,v in s.parameters.iteritems():
            params[k] = v.value
        setup_table.loc[setup_table.shape[0]] = [s.setup_id, s.flow_id, params]
    try:
        while(0 < 1):
            offset = offset + size
            setup_dict = setups.list_setups(setup = setupIds[offset:(offset+size)])
            for s in setup_dict.values():
                params = dict()
                for k,v in s.parameters.iteritems():
                    params[k] = v.value
                setup_table.loc[setup_table.shape[0]] = [s.setup_id, s.flow_id, params]
    except Exception:
        print('finished reading')
    setup_table.to_csv(SETUP_LOC, index=False)
    return (setup_table)

Setup = readOrDownloadSetups()

In [None]:
Setup.head()

## Deduplication of Setup ids
Here we attempt to deduplicate any configuration setup (for same pipeline) that has exactly the same parameter values.

In [226]:
# A setup configuration cannot belong to multiple pipelines
assert Setup.shape[0] == len(Setup.id.unique())

In [227]:
# Make sure every pipeline has exactly the same set of parameters keys(values are obviously are not always the same)
a = Setup.groupby('pipeline').apply(lambda x: (len(x.parameters.keys()) - len(set(x.parameters.keys())))).reset_index(name = 'size')
assert a['size'].max() == 0