In [1]:
from openml import datasets, tasks, runs, flows, setups, config
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import sys

# OpenML client configuration.
# Prefer the OPENML_APIKEY environment variable so the credential does not have
# to live in the notebook. The literal is kept only as a fallback so existing
# runs keep working.
# NOTE(review): this key is committed in plain text — rotate it and drop the
# fallback once every user exports OPENML_APIKEY.
config.apikey = os.environ.get('OPENML_APIKEY', '8e2079508005379adb9a60798696134e')
config.server = 'https://www.openml.org/api/v1'
# Downloads are cached under the user's home directory.
config.set_cache_directory(os.path.expanduser('~/openml-cache'))

In [2]:
# List every flow known to the server and keep only the scikit-learn pipelines,
# i.e. the flows whose name starts with 'sklearn.pipeline'.
flowsJson = flows.list_flows()
flowList = pd.DataFrame.from_dict(flowsJson, orient='index')[['id','name']]
is_sklearn_pipeline = flowList['name'].str.startswith('sklearn.pipeline')
pipelines = flowList.loc[is_sklearn_pipeline]

In [3]:
# Parse each pipeline name into its raw steps and keep multi-step pipelines only.
def _split_steps(flow_name):
    """Return the comma-separated pieces between the first '(' and the first ')'.

    The closing ')' is kept on the last piece; the following cell strips it.
    """
    start = flow_name.find('(') + 1
    stop = flow_name.find(')') + 1
    return flow_name[start:stop].split(',')

pipelines = (
    pipelines
    .assign(parsed=pipelines.name.map(_split_steps))
    .assign(p_length=lambda frame: frame.parsed.map(len))
    .query('p_length > 1')
)

In [4]:
# Remove step names and extra punctuation.
# For every raw step keep the text after the first '=' (the whole text when
# there is no '='), then strip leading/trailing parentheses.
# A list comprehension is used instead of nested map() so the column holds real
# lists on Python 3 as well, and assign() avoids writing into the frame returned
# by the earlier .query() (which triggers SettingWithCopyWarning).
pipelines = pipelines.assign(
    parsed=pipelines.parsed.map(
        lambda steps: [step[step.find('=') + 1:].strip('()') for step in steps]
    )
)
# Flow ids of the remaining pipelines; used to query their runs.
flow_ids = pipelines.id

In [5]:
def appendRun(df, nextBatch):
    """Return ``df`` with the records of ``nextBatch`` appended as new rows.

    ``nextBatch`` is a dict of records; it is converted with
    ``DataFrame.from_dict(..., orient='index')`` so every key becomes a row
    label. ``df`` itself is not modified.
    """
    batch_frame = pd.DataFrame.from_dict(nextBatch, orient='index')
    frames = [df, batch_frame]
    return pd.concat(frames)

In [6]:
TRAINING_RUNS_LOC = '../../data/training-runs.csv'
def readOrDownloadRuns():
    """Return the number of runs per (setup_id, flow_id, task_id) combination.

    When TRAINING_RUNS_LOC exists the table is read from that CSV. Otherwise
    the runs of the flows in ``flow_ids`` are fetched page by page with
    ``runs.list_runs``, aggregated, written to TRAINING_RUNS_LOC and returned.

    Returns:
        DataFrame with the columns setup_id, flow_id, task_id and counts.

    Raises:
        ValueError: if the server returns no runs at all.
        Exception: whatever ``runs.list_runs`` raises for the first page.
    """
    if os.path.exists(TRAINING_RUNS_LOC):
        return pd.read_csv(TRAINING_RUNS_LOC)

    size = 10000
    offset = 0
    batches = []
    while True:
        try:
            rl = runs.list_runs(flow=flow_ids, size=size, offset=offset)
        except Exception as err:
            # A failure on the very first page is a real error, as before.
            if not batches:
                raise
            # The original relied on an exception to detect the last page.
            # NOTE(review): a transient failure is indistinguishable from
            # "no more results" here, so a partial download may get cached;
            # the error is printed to make that visible.
            print('finished reading: ' + str(err))
            break
        # Stop on an empty page instead of looping forever when the server
        # answers with an empty result rather than an error.
        if len(rl) == 0:
            break
        batches.append(pd.DataFrame.from_dict(rl, orient='index'))
        offset = offset + size

    if not batches:
        raise ValueError('no runs were returned for the requested flows')

    # Concatenate once instead of re-copying the growing frame for every page.
    experiments = pd.concat(batches)
    trainingRuns = experiments.groupby(['setup_id', 'flow_id','task_id']).size().reset_index(name='counts')
    trainingRuns.to_csv(TRAINING_RUNS_LOC, index=False)
    return trainingRuns

trainingRuns = readOrDownloadRuns()


In [58]:
# Preview the first rows of the aggregated training-run table.
# NOTE(review): the output saved with this cell is a Series named 'id' with 293
# entries, not a preview of trainingRuns — it looks stale; re-run the cell.
trainingRuns.head()

5432    5432
5505    5505
5507    5507
5520    5520
5591    5591
5648    5648
5653    5653
5659    5659
5662    5662
5663    5663
5668    5668
5703    5703
5730    5730
5732    5732
5733    5733
5736    5736
5738    5738
5739    5739
5740    5740
5741    5741
5742    5742
5743    5743
5744    5744
5745    5745
5747    5747
5749    5749
5752    5752
5753    5753
5755    5755
5757    5757
        ... 
7659    7659
7665    7665
7668    7668
7671    7671
7682    7682
7688    7688
7694    7694
7695    7695
7699    7699
7701    7701
7703    7703
7705    7705
7706    7706
7707    7707
7711    7711
7717    7717
7720    7720
7723    7723
7726    7726
7728    7728
7730    7730
7732    7732
7734    7734
7736    7736
7738    7738
7740    7740
7742    7742
7744    7744
7746    7746
7748    7748
Name: id, Length: 293, dtype: int64

In [8]:
# Distinct task, flow and setup ids that occur in the training runs.
taskIds, flowIds, setupIds = (
    trainingRuns[column].unique()
    for column in ('task_id', 'flow_id', 'setup_id')
)
# Renumber the pipeline rows from 0, discarding the previous index.
pipelines = pipelines.reset_index(drop=True)

In [24]:
MAPPING_LOC = '../../data/mapping.csv'
def readOrDownloadMapping():
    """Return the task -> dataset mapping for the tasks in ``taskIds``.

    When MAPPING_LOC exists the table is read from that CSV. Otherwise every
    task is fetched individually with ``tasks.get_tasks``; tasks that cannot be
    fetched are reported and skipped. The result is written to MAPPING_LOC.

    Returns:
        DataFrame with the columns dataset and task and a default integer
        index (the same layout as the table read back from the CSV).
    """
    if os.path.exists(MAPPING_LOC):
        return pd.read_csv(MAPPING_LOC)

    mapping = dict()
    for t in taskIds:
        try:
            # get_tasks is called with a single id; the first element of its
            # result is the task object.
            task = tasks.get_tasks(task_ids=[t])[0]
            mapping[task.task_id] = task.dataset_id
        except Exception:
            # Best effort: report the task and continue with the rest.
            print('error in task: ' + str(t))

    # mapping table to change the task id in Pipeline table to dataset
    mapping_table = pd.DataFrame(
        {'dataset': list(mapping.values()), 'task': list(mapping.keys())},
        columns=['dataset', 'task'])
    mapping_table.to_csv(MAPPING_LOC, index=False)
    return mapping_table

Mapping = readOrDownloadMapping()

error in task: 3
error in task: 20
error in task: 21
error in task: 26
error in task: 45
error in task: 49
error in task: 206
error in task: 233
error in task: 250
error in task: 251
error in task: 256
error in task: 24
error in task: 41
error in task: 3492
error in task: 3493
error in task: 3494
error in task: 3560
error in task: 34536
error in task: 34537
error in task: 34539
error in task: 7
error in task: 146195


In [25]:
# Preview the first rows of the task -> dataset mapping.
Mapping.head()

Unnamed: 0,dataset,task
1,1,1
2,2,2
4,4,4
5,5,5
6,6,6


## Transformation Table
The Transformation table includes the list of all the available transformations.
The current version only supports transformations available in scikit-learn.
Currently, the table has the following format:

|id            | full_name     |
|:------------ |:--------------|
|id of the Transformation| Full name of the Transformation|

TODO:
- For each transformation, extract the required parameters as well

In [26]:
# Extract all different transformations from the trainingRuns table.
# TODO: investigate why the training runs and the pipeline table do not have
# matching flow (pipeline) ids even though the training runs are queried from
# the pipeline id column.
currentPipelines = pipelines[pipelines.id.isin(flowIds)].reset_index(drop=True)
# Flatten the per-pipeline step lists into one list (duplicates included).
allTrasformations = [step for steps in currentPipelines.parsed for step in steps]
transformations = pd.DataFrame(allTrasformations, columns=['full_name'])
# The full name doubles as the id of a transformation.
unique_names = transformations.full_name.unique()
Transformation = pd.DataFrame({'id': unique_names, 'full_name': unique_names}, columns = ['id','full_name'])

In [27]:
# Preview the first rows of the Transformation table.
Transformation.head()

Unnamed: 0,id,full_name
0,sklearn.preprocessing.imputation.Imputer,sklearn.preprocessing.imputation.Imputer
1,sklearn.preprocessing.data.OneHotEncoder,sklearn.preprocessing.data.OneHotEncoder
2,sklearn.ensemble.forest.RandomForestClassifier,sklearn.ensemble.forest.RandomForestClassifier
3,openml.utils.preprocessing.ConditionalImputer,openml.utils.preprocessing.ConditionalImputer
4,sklearn.feature_selection.variance_threshold.V...,sklearn.feature_selection.variance_threshold.V...


## Pipeline Table
The Pipeline table contains information about each pipeline: the model, the transformations, the dataset the pipeline is trained on, and the parameters.
The current version only supports transformations available in scikit-learn. Currently, the table has the following format:

|id            | full_name     | model | transformations | hyperparameters | dataset |
|:------------ |:--------------|:------|:----------------|:----------------|---------|
|id | Full name| learned model| List of Transformations|hyper parameters of the transformations and the model| dataset|

TODO:
- Extract model
- Extract hyperparameters
- There are multiple setup ids for some combinations of <flow,task>. Make sure that only the parameter values are different and not the actual parameter names

In [28]:
# Build the Pipeline table from the flow (pipeline) and run tables.
currentPipelines = pipelines.loc[pipelines.id.isin(flowIds)].reset_index(drop=True)
# setup_id is ignored for now (to be checked later): one row is kept per
# (flow id, task id) pair.
before = pd.merge(currentPipelines, trainingRuns, left_on='id', right_on='flow_id')
before = before[['id','name','task_id','parsed']]
dedulicated = before.drop_duplicates(subset=['id','task_id'])
pipeline_table = (
    dedulicated[['id','name','parsed','task_id']]
    .rename(columns = {"name":"full_name", "parsed":"transformations", "task_id":"task"})
)
# Replace the task by its dataset through the task -> dataset mapping.
Pipeline = pd.merge(pipeline_table, Mapping, on='task', how='inner')
Pipeline = Pipeline[['id','full_name','transformations','dataset']]

In [29]:
# Preview the first rows of the Pipeline table.
Pipeline.head()

Unnamed: 0,id,full_name,transformations,dataset
0,5591,sklearn.pipeline.Pipeline(Imputer=sklearn.prep...,"[sklearn.preprocessing.imputation.Imputer, skl...",61
1,5804,sklearn.pipeline.Pipeline(pca=sklearn.decompos...,"[sklearn.decomposition.pca.PCA, sklearn.ensemb...",61
2,5873,sklearn.pipeline.Pipeline(Imputer=openml.utils...,[openml.utils.preprocessing.ConditionalImputer...,61
3,5983,sklearn.pipeline.Pipeline(dualimputer=helper.d...,"[helper.dual_imputer.DualImputer, sklearn.prep...",61
4,6038,sklearn.pipeline.Pipeline(dualimputer=helper.d...,"[helper.dual_imputer.DualImputer, sklearn.neig...",61


## Parameter Table
The Parameter table contains a list of all the parameters (with their type information) and their default values.
The current version only supports transformations available in scikit-learn. Currently, the table has the following format:

|id            | name     | full_name | default_value | type |
|:------------ |:--------------|:------|:----------------|:----------------|
|id |  name| Full Name| Default value|Type information|

TODO:
- Ensure that every setup for a specific flow has the exact same set of parameters


In [49]:
PARAMETER_LOC = '../../data/parameter.csv'
def readOrDownloadParams():
    """Return the Parameter table (id, name, full_name, default_value, type).

    When PARAMETER_LOC exists the table is read from that CSV. Otherwise one
    setup per flow in ``Pipeline`` is looked up with ``setups.list_setups`` and
    every parameter of those setups becomes one row; the result is written to
    PARAMETER_LOC.

    Returns:
        DataFrame with the columns id, name, full_name, default_value, type.
    """
    if os.path.exists(PARAMETER_LOC):
        return pd.read_csv(PARAMETER_LOC)

    flow_setups = Pipeline.merge(trainingRuns, left_on='id', right_on='flow_id')[['id','setup_id']]
    # have to check these to make sure that every setup for a flow contains exactly the same set of parameters
    # For now the first setup found for each flow stands in for the flow.
    unique_parameters = flow_setups.drop_duplicates(subset=['id'])
    setup_details = setups.list_setups(setup=unique_parameters.setup_id)
    # Collect all rows first and build the frame once instead of growing it
    # row by row; .values() works on both Python 2 and Python 3.
    # NOTE(review): `or {}` guards against a setup without parameters —
    # confirm whether the client can actually return None here.
    records = [
        (pv.id, pv.parameter_name, pv.full_name, pv.default_value, pv.data_type)
        for sv in setup_details.values()
        for pv in (sv.parameters or {}).values()
    ]
    parameter_table = pd.DataFrame(records, columns=['id','name','full_name','default_value','type'])
    parameter_table.to_csv(PARAMETER_LOC, index=False)
    return parameter_table
Parameter = readOrDownloadParams()

In [52]:
# Preview the first rows of the Parameter table.
Parameter.head()

Unnamed: 0,id,name,full_name,default_value,type
0,53952,iterated_power,sklearn.decomposition.pca.PCA(1)_iterated_power,"""auto""",
1,53953,n_components,sklearn.decomposition.pca.PCA(1)_n_components,,
2,53954,random_state,sklearn.decomposition.pca.PCA(1)_random_state,,
3,53955,svd_solver,sklearn.decomposition.pca.PCA(1)_svd_solver,"""auto""",
4,53956,tol,sklearn.decomposition.pca.PCA(1)_tol,0.0,


## Dataset Table
The Dataset table contains metadata about the existing datasets in the training runs.
First, the tasks are extracted from the pipeline table; then the corresponding datasets are retrieved from OpenML.
The table has the following schema:

|id            | name     | NumberOfClasses | NumberOfFeatures | NumberOfInstances |
|:------------ |:--------------|:------|:----------------|:----------------|
|id |  name| Number of classes| Number of Features |Number of training instances|

TODO:
- OpenML contains more information about the Datasets; check if any other information is needed

In [54]:
DATASET_LOC = '../../data/dataset.csv'
def readOrDownloadDatasets():
    """Return the Dataset table for the datasets referenced by ``Pipeline``.

    The table is read from DATASET_LOC when that file exists. Otherwise the
    dataset listing is downloaded, restricted to the dataset ids that occur in
    ``Pipeline.dataset``, reduced to the id, name and size columns, written to
    DATASET_LOC and returned.
    """
    if not os.path.exists(DATASET_LOC):
        listing = pd.DataFrame.from_dict(datasets.list_datasets(), orient='index')
        used_ids = Pipeline.dataset.unique()
        used = listing[listing.did.isin(used_ids)]
        wanted = ['id','name', 'NumberOfClasses','NumberOfFeatures','NumberOfInstances']
        fresh_table = used.rename(columns = {"did":"id"})[wanted]
        fresh_table.to_csv(DATASET_LOC, index=False)
        return fresh_table
    return pd.read_csv(DATASET_LOC)
Dataset = readOrDownloadDatasets()

In [57]:
# Preview the first rows of the Dataset table.
Dataset.head()

Unnamed: 0,id,name,NumberOfClasses,NumberOfFeatures,NumberOfInstances
2,2,anneal,5,39,898
4,4,labor,2,17,57
5,5,arrhythmia,13,280,452
6,6,letter,26,17,20000
10,10,lymph,4,19,148


## Training Run Table
The Training Run table contains information about every single execution of a pipeline on a dataset, and the parameter setup used for it. Currently, we do not have information about the run time of each execution.
