In [1]:
from openml import datasets, tasks, runs, flows, setups, config
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import sys

config.apikey = '8e2079508005379adb9a60798696134e'
config.server = 'https://www.openml.org/api/v1'
config.set_cache_directory(os.path.expanduser('~/openml-cache'))

In [2]:
# select and filter flows made by scikit learn
flowsJson = flows.list_flows()
flowList = pd.DataFrame.from_dict(flowsJson, orient='index')[['id','name']]
pipelines = flowList.loc[flowList.name.str.startswith('sklearn.pipeline')]

In [3]:
# parse pipelines and find valid pipelines 
pipelines = pipelines.assign(parsed = pipelines.name.map(lambda x: x[x.find('(') + 1:x.find(')') + 1]).map(lambda s : s.split(',')))
pipelines = pipelines.assign(p_length = pipelines.parsed.map(lambda p: len(p)))
pipelines = pipelines.query('p_length > 1')
pipelines.drop(['name'], axis = 1, inplace=True)

In [4]:
# remove step names and extra punctuations
pipelines.parsed = pipelines.parsed.map(lambda pi : map(lambda p: p[p.find('=') + 1 :], pi))
pipelines.parsed = pipelines.parsed.map(lambda pi : map(lambda p: p.strip('()'), pi))
flow_ids = pipelines.id

In [5]:
def appendRun(df, nextBatch):
    nb = pd.DataFrame.from_dict(nextBatch, orient='index')
    return (pd.concat([df,nb]))

In [6]:
TRAINING_RUNS_LOC = '../../data/training-runs.csv'
def readOrDownloadRuns():
    if (os.path.exists(TRAINING_RUNS_LOC)):
        trainingRuns = pd.read_csv(TRAINING_RUNS_LOC)
        return (trainingRuns)
    else:
        size = 10000
        offset = 0
        rl = runs.list_runs(flow=flow_ids, size = size, offset = offset)
        experiments = pd.DataFrame.from_dict(rl, orient='index')
        try:
            while(0 < 1):
                offset = offset + size
                rl = runs.list_runs(flow=flow_ids, size = size, offset = offset)
                experiments = appendRun(experiments, rl)
        except Exception:
            print('finished reading')
        trainingRuns = experiments.groupby(['setup_id', 'flow_id','task_id']).size().reset_index(name='counts')
        trainingRuns.to_csv(TRAINING_RUNS_LOC, index=False)
        return (trainingRuns)

trainingRuns = readOrDownloadRuns()


In [81]:
trainingRuns.head()

Unnamed: 0,setup_id,flow_id,task_id,counts
0,29246,5591,59,23
1,29288,5653,145677,1
2,29293,5662,145677,1
3,29294,5663,145677,1
4,30231,5743,145677,1


In [7]:
taskIds = trainingRuns.task_id.unique()
flowIds = trainingRuns.flow_id.unique()
setupIds = trainingRuns.setup_id.unique()

In [8]:
pipelines = pipelines.reset_index(drop=True)

In [68]:
# Extract all different transformations from the trainingRuns table
# investiage training runs and the pipeline table do not have matching flow (pipeline) ids
# even though the training run is queried from pipeline id column
currentPipelines = pipelines[pipelines.id.isin(flowIds)].reset_index(drop=True)
allTrasformations = []
currentPipelines.parsed.map(lambda ts : [allTrasformations.append(a) for a in ts])
transformations = pd.DataFrame(allTrasformations, columns=['full_name'])
Transformation = pd.DataFrame({'id': transformations.full_name.unique(), 'full_name': transformations.full_name.unique()}, columns = ['id','full_name'])

In [92]:
currentPipelines.head()

Unnamed: 0,id,parsed,p_length
0,5591,"[sklearn.preprocessing.imputation.Imputer, skl...",3
1,5648,[openml.utils.preprocessing.ConditionalImputer...,4
2,5653,[sklearn.feature_selection.univariate_selectio...,2
3,5662,"[sklearn.preprocessing.data.StandardScaler, sk...",3
4,5663,"[sklearn.preprocessing.data.StandardScaler, sk...",3


Unnamed: 0,id,full_name
0,sklearn.preprocessing.imputation.Imputer,sklearn.preprocessing.imputation.Imputer
1,sklearn.preprocessing.data.OneHotEncoder,sklearn.preprocessing.data.OneHotEncoder
2,sklearn.ensemble.forest.RandomForestClassifier,sklearn.ensemble.forest.RandomForestClassifier
3,openml.utils.preprocessing.ConditionalImputer,openml.utils.preprocessing.ConditionalImputer
4,sklearn.feature_selection.variance_threshold.V...,sklearn.feature_selection.variance_threshold.V...


In [113]:
for sk,sv in res.iteritems():
    print 'setup ' + str(sk) + ' :'
    for pk,pv in sv.parameters.iteritems():
        print '\tparam ' + str(pk)  
        #print '\t\tfull name: ' + pv.full_name
        print '\t\tname: ' + pv.parameter_name
        print '\t\tvalue: ' + pv.value

setup 29246 :
	param 50740
		name: bootstrap
		value: True
	param 50741
		name: class_weight
		value: None
	param 50742
		name: criterion
		value: gini
	param 50743
		name: max_depth
		value: None
	param 50744
		name: max_features
		value: auto
	param 50745
		name: max_leaf_nodes
		value: None
	param 50746
		name: min_impurity_split
		value: 1e-07
	param 50747
		name: min_samples_leaf
		value: 1
	param 50748
		name: min_samples_split
		value: 2
	param 50749
		name: min_weight_fraction_leaf
		value: 0.0
	param 50750
		name: n_estimators
		value: 10
	param 50751
		name: n_jobs
		value: 1
	param 50752
		name: oob_score
		value: False
	param 50753
		name: random_state
		value: None
	param 50754
		name: verbose
		value: 0
	param 50755
		name: warm_start
		value: False
	param 51963
		name: steps
		value: [('Imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('OneHotEncoder', OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
     

In [109]:
for sk,sv in res.iteritems():
    print sv.parameters

{50740: <openml.setups.setup.OpenMLParameter object at 0x1118cd290>, 50741: <openml.setups.setup.OpenMLParameter object at 0x1118cd5d0>, 50742: <openml.setups.setup.OpenMLParameter object at 0x1118cd690>, 50743: <openml.setups.setup.OpenMLParameter object at 0x1118cdf90>, 50744: <openml.setups.setup.OpenMLParameter object at 0x1118cdc50>, 50745: <openml.setups.setup.OpenMLParameter object at 0x1118cdc90>, 50746: <openml.setups.setup.OpenMLParameter object at 0x1118cd050>, 50747: <openml.setups.setup.OpenMLParameter object at 0x1118cd210>, 50748: <openml.setups.setup.OpenMLParameter object at 0x1118cd750>, 50749: <openml.setups.setup.OpenMLParameter object at 0x1118cda10>, 50750: <openml.setups.setup.OpenMLParameter object at 0x1118cd810>, 50751: <openml.setups.setup.OpenMLParameter object at 0x1118cdb50>, 50752: <openml.setups.setup.OpenMLParameter object at 0x1118cd9d0>, 50753: <openml.setups.setup.OpenMLParameter object at 0x1118cd650>, 50754: <openml.setups.setup.OpenMLParameter obj

In [110]:
param = params[50740]

In [102]:
param.parameter_name

u'bootstrap'

In [103]:
param[@full_name]

SyntaxError: invalid syntax (<ipython-input-103-a4c0fe8ce6b5>, line 1)

In [95]:
params

<openml.setups.setup.OpenMLParameter at 0x108ad7750>