In [1]:
from openml import datasets, tasks, runs, flows, setups, config
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import sys

# Credentials must not live in the notebook source: take the OpenML API key from
# the OPENML_APIKEY environment variable and fall back to the key that
# openml.config already holds (e.g. from the user's openml config file).
config.apikey = os.environ.get('OPENML_APIKEY', config.apikey)
config.server = 'https://www.openml.org/api/v1'
# Cache downloaded OpenML artefacts under the user's home directory.
config.set_cache_directory(os.path.expanduser('~/openml-cache'))

In [19]:
# List the flows via openml and keep only those whose name starts with
# 'sklearn.pipeline' (i.e. scikit-learn pipeline flows).
flowsJson = flows.list_flows()
flowList = pd.DataFrame.from_dict(flowsJson, orient='index')
flowList = flowList[['id', 'name']]
is_sklearn_pipeline = flowList.name.str.startswith('sklearn.pipeline')
pipelines = flowList.loc[is_sklearn_pipeline]

In [20]:
# Parse the step list out of every pipeline flow name and keep only the
# pipelines that have more than one step.
def _split_steps(flow_name):
    """Split the text between the first '(' and the first ')' (inclusive) on commas."""
    start = flow_name.find('(') + 1
    stop = flow_name.find(')') + 1
    return flow_name[start:stop].split(',')

pipelines = pipelines.assign(parsed=pipelines.name.map(_split_steps))
pipelines = pipelines.assign(p_length=pipelines.parsed.map(len))
pipelines = pipelines[pipelines.p_length > 1]

In [21]:
# Remove step names and extra punctuation from every parsed step:
#   'imputer=sklearn...Imputer)' -> 'sklearn...Imputer'
# A list comprehension is used instead of map() so that each row holds a real
# list: on Python 3 map() returns a one-shot iterator that would be empty after
# its first traversal. assign() is used instead of attribute assignment because
# `pipelines` is a filtered frame and in-place column assignment on it triggers
# pandas' SettingWithCopy warning.
pipelines = pipelines.assign(
    parsed=pipelines.parsed.map(
        lambda steps: [step[step.find('=') + 1:].strip('()') for step in steps]
    )
)
# Flow ids of the multi-step pipelines; used to query their runs.
flow_ids = pipelines.id

In [22]:
def appendRun(df, nextBatch):
    """Return a new frame made of `df` followed by the rows of `nextBatch`.

    `nextBatch` is a dict of dicts; each outer key becomes a row label
    (orient='index'). `df` itself is not modified.
    """
    batch_frame = pd.DataFrame.from_dict(nextBatch, orient='index')
    combined = pd.concat([df, batch_frame])
    return combined

In [23]:
TRAINING_RUNS_LOC = '../../data/training-runs.csv'
def readOrDownloadRuns():
    """Return the number of runs per (setup_id, flow_id, task_id).

    If TRAINING_RUNS_LOC exists it is read and returned as-is. Otherwise the
    runs of the flows in `flow_ids` are fetched from OpenML in pages of
    `size` rows, aggregated, written to TRAINING_RUNS_LOC and returned.

    Raises:
        ValueError: if no runs at all were returned.
        Exception: whatever `runs.list_runs` raises on the very first page.
    """
    if os.path.exists(TRAINING_RUNS_LOC):
        return pd.read_csv(TRAINING_RUNS_LOC)

    size = 10000
    offset = 0
    batches = []
    while True:
        try:
            rl = runs.list_runs(flow=flow_ids, size=size, offset=offset)
        except Exception as err:
            # A failure on the first page is a real error. On later pages an
            # exception has so far been treated as "no more results"; keep that
            # best-effort behaviour but report the reason instead of hiding it.
            if not batches:
                raise
            print('finished reading ({})'.format(err))
            break
        if len(rl) == 0:
            # An empty page also ends the listing; without this check the loop
            # never terminates when the client returns empty results instead
            # of raising.
            print('finished reading')
            break
        batches.append(pd.DataFrame.from_dict(rl, orient='index'))
        offset = offset + size

    if not batches:
        raise ValueError('no runs were returned for the selected flows')

    # Concatenate once instead of growing the frame page by page.
    experiments = pd.concat(batches)
    trainingRuns = experiments.groupby(['setup_id', 'flow_id', 'task_id']).size().reset_index(name='counts')

    # Make sure the cache directory exists so the download is not lost at write time.
    cache_dir = os.path.dirname(TRAINING_RUNS_LOC)
    if cache_dir and not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    trainingRuns.to_csv(TRAINING_RUNS_LOC, index=False)
    return trainingRuns

trainingRuns = readOrDownloadRuns()


In [24]:
# Preview the first rows of the run counts per (setup_id, flow_id, task_id).
trainingRuns.head()

Unnamed: 0,setup_id,flow_id,task_id,counts
0,29246,5591,59,23
1,29288,5653,145677,1
2,29293,5662,145677,1
3,29294,5663,145677,1
4,30231,5743,145677,1


In [25]:
# Distinct task, flow and setup ids that occur in the training runs.
taskIds, flowIds, setupIds = (
    trainingRuns[column].unique()
    for column in ('task_id', 'flow_id', 'setup_id')
)

In [26]:
# Replace the row labels left over from filtering with a fresh 0..n-1 index.
pipelines = pipelines.reset_index(drop=True)

## Transformation Table
The Transformation table includes the list of all the available transformations.
The current version only supports transformations available in scikit-learn.
Currently, the table has the following format:

|id            | full_name     |
|:------------ |:--------------|
|id of the Transformation| Full name of the Transformation|

TODO:
- For each transformation, extract the required parameters as well

In [10]:
# Collect every transformation used by a pipeline that has training runs.
# TODO: investigate why the training runs and the pipeline table do not have
# matching flow (pipeline) ids even though the training runs were queried
# using the pipeline id column.
has_runs = pipelines.id.isin(flowIds)
currentPipelines = pipelines[has_runs].reset_index(drop=True)
# Flatten the per-pipeline step lists into one list of transformation names.
allTrasformations = [step for steps in currentPipelines.parsed for step in steps]
transformations = pd.DataFrame(allTrasformations, columns=['full_name'])
# The full name doubles as the id of a transformation for now.
unique_names = transformations.full_name.unique()
Transformation = pd.DataFrame(
    {'id': unique_names, 'full_name': unique_names},
    columns=['id', 'full_name'],
)

In [27]:
# Preview the parsed pipelines (flow id, flow name, step list, step count).
pipelines.head()

Unnamed: 0,id,name,parsed,p_length
0,5432,sklearn.pipeline.Pipeline(sklearn.preprocessin...,"[sklearn.preprocessing.imputation.Imputer, skl...",3
1,5505,sklearn.pipeline.Pipeline(steps__scale=sklearn...,"[sklearn.preprocessing.data.RobustScaler, skle...",2
2,5507,sklearn.pipeline.Pipeline(steps__imputer=sklea...,"[sklearn.preprocessing.imputation.Imputer, skl...",4
3,5520,sklearn.pipeline.Pipeline(steps__Imputer=sklea...,"[sklearn.preprocessing.imputation.Imputer, skl...",3
4,5591,sklearn.pipeline.Pipeline(Imputer=sklearn.prep...,"[sklearn.preprocessing.imputation.Imputer, skl...",3


## Pipeline Table
The Pipeline table contains information about each pipeline: the model, the transformations, the dataset the pipeline is trained on, and the hyperparameters.
The current version only supports transformations available in scikit-learn. Currently, the table has the following format:

|id            | full_name     | model | transformations | hyperparameters | dataset |
|:------------ |:--------------|:------|:----------------|:----------------|---------|
|id | Full name| Learned model| List of transformations| Hyperparameters of the transformations and the model| Dataset|

TODO:
- Extract model
- Extract hyperparameters
- There are multiple setup ids for some combinations of <flow,task>. Make sure that only the parameter values are different and not the actual parameter names

In [44]:
# Extract the pipeline info from the run and flow tables.
# Only pipelines that actually have training runs are kept.
currentPipelines = pipelines[pipelines.id.isin(flowIds)].reset_index(drop=True)
# Check the setup ids later; for now just ignore setup_id.
joined = currentPipelines.merge(trainingRuns, left_on='id', right_on='flow_id')[['id','name','task_id','parsed']]
# One row per (flow, task) pair. This used to read from an undefined name
# (`before`) left over from an earlier kernel session; the input is `joined`.
dedulicated = joined.drop_duplicates(subset=['id','task_id'])
# NOTE: the 'dataset' column holds OpenML task ids (renamed from task_id), not dataset ids.
Pipeline = dedulicated.rename(columns = {"name":"full_name", "parsed":"transformations", "task_id":"dataset"})[['id','full_name','transformations','dataset']]

In [46]:
# Preview the Pipeline table (id, full_name, transformations, dataset).
Pipeline.head()

Unnamed: 0,id,full_name,transformations,dataset
0,5591,sklearn.pipeline.Pipeline(Imputer=sklearn.prep...,"[sklearn.preprocessing.imputation.Imputer, skl...",59
2,5648,sklearn.pipeline.Pipeline(Imputer=openml.utils...,[openml.utils.preprocessing.ConditionalImputer...,29
3,5653,sklearn.pipeline.Pipeline(feature_select=sklea...,[sklearn.feature_selection.univariate_selectio...,145677
4,5662,sklearn.pipeline.Pipeline(scaling=sklearn.prep...,"[sklearn.preprocessing.data.StandardScaler, sk...",145677
5,5663,sklearn.pipeline.Pipeline(scaling=sklearn.prep...,"[sklearn.preprocessing.data.StandardScaler, sk...",145677


## Parameter Table
The Parameter table contains a list of all the parameters, their type information, and their default values.
The current version only supports transformations available in scikit-learn. Currently, the table has the following format:

|id            | name     | full_name | default_value | type |
|:------------ |:--------------|:------|:----------------|:----------------|
|id |  name| Full Name| Default value|Type information|

TODO:
- Ensure that every setup for a specific flow has the exact same set of parameters


In [203]:
# Build the Parameter table from one setup per flow.
flow_setups = Pipeline.merge(trainingRuns, left_on='id', right_on='flow_id')[['id','setup_id']]
# have to check these to make sure that every setup for a flow contains exactly the same set of parameters
unique_parameters = flow_setups.drop_duplicates(subset=['id'])
all_setups = setups.list_setups(setup=unique_parameters.setup_id)
# Collect all rows first and build the frame once. The previous loop wrote to
# an undefined `df` instead of `Parameter` and used the Python 2-only
# iteritems(); values() works on both Python 2 and 3.
parameter_rows = [
    [pv.id, pv.parameter_name, pv.full_name, pv.default_value, pv.data_type]
    for sv in all_setups.values()
    # a setup without parameters is tolerated and simply contributes no rows
    for pv in (sv.parameters or {}).values()
]
Parameter = pd.DataFrame(parameter_rows, columns=['id','name','full_name','default_value','type'])

In [207]:
# Preview the Parameter table (id, name, full_name, default_value, type).
Parameter.head()

Unnamed: 0,id,name,full_name,default_value,type
0,53952,iterated_power,sklearn.decomposition.pca.PCA(1)_iterated_power,"""auto""",
1,53953,n_components,sklearn.decomposition.pca.PCA(1)_n_components,,
2,53954,random_state,sklearn.decomposition.pca.PCA(1)_random_state,,
3,53955,svd_solver,sklearn.decomposition.pca.PCA(1)_svd_solver,"""auto""",
4,53956,tol,sklearn.decomposition.pca.PCA(1)_tol,0.0,


## Dataset Table

In [210]:
# List the datasets via openml and turn the result into a table with one row
# per key of the returned mapping (presumably dataset ids -- TODO confirm).
ds = datasets.list_datasets()
datasetTable = pd.DataFrame.from_dict(ds, orient='index')

In [216]:
# Pipeline.dataset was renamed from task_id, so these are the distinct OpenML
# task ids used by the pipelines (not dataset ids).
relatedTasks = Pipeline.dataset.unique()

In [223]:
# Show the task ids referenced by the Pipeline table.
relatedTasks

array([    59,     29, 145677,   9893,     58,   3512,   3918,  10093,
         9986,      1,      3,      6,     10,     11,     12,     14,
           16,     18,     20,     21,     22,     23,     26,     28,
           30,     31,     32,     36,     37,     38,     39,     40,
           42,     43,     45,     47,     49,     52,     53,     57,
           60,    206,    219,    231,    233,    236,    240,    241,
          242,    244,    246,    248,    250,    251,    252,    256,
         9983, 125923,   3917,   9980,   3902,  34539,      2,     15,
           24,     41,   2074,   2075,   2079,   3021,   3022,   3481,
         3485,   3492,   3493,   3494,   3510,   3543,   3549,   3560,
         3561,   3567,   3889,   3891,   3896,   3899,   3903,   3904,
         3913,   3946,   3948,   3950,   3954,   7592,   9914,   9946,
         9950,   9952,   9954,   9955,   9956,   9957,   9960,   9964,
         9967,   9968,   9970,   9971,   9976,   9977,   9978,   9979,
      

In [230]:
# Fetch a single task object. get_tasks() returns a list of tasks, which has no
# dataset_id attribute; get_task() returns the task itself.
t = tasks.get_task(59)

In [233]:
# Id of the dataset that the task stored in `t` is defined on.
t.dataset_id

61

In [220]:
# Show the task ids referenced by the Pipeline table (repeat of an earlier cell).
relatedTasks

array([    59,     29, 145677,   9893,     58,   3512,   3918,  10093,
         9986,      1,      3,      6,     10,     11,     12,     14,
           16,     18,     20,     21,     22,     23,     26,     28,
           30,     31,     32,     36,     37,     38,     39,     40,
           42,     43,     45,     47,     49,     52,     53,     57,
           60,    206,    219,    231,    233,    236,    240,    241,
          242,    244,    246,    248,    250,    251,    252,    256,
         9983, 125923,   3917,   9980,   3902,  34539,      2,     15,
           24,     41,   2074,   2075,   2079,   3021,   3022,   3481,
         3485,   3492,   3493,   3494,   3510,   3543,   3549,   3560,
         3561,   3567,   3889,   3891,   3896,   3899,   3903,   3904,
         3913,   3946,   3948,   3950,   3954,   7592,   9914,   9946,
         9950,   9952,   9954,   9955,   9956,   9957,   9960,   9964,
         9967,   9968,   9970,   9971,   9976,   9977,   9978,   9979,
      