In [1]:
import openml
import openml as oml
import numpy as np
import pandas as pd
from datetime import datetime
import os
import re

In [2]:
class SearchSpace:
    
    def filter_datasets(self):
        df = openml.tasks.list_tasks(number_instances='100..8000000', number_features='4..100',
                                     number_classes='2..100', number_missing_values='0..0', output_format='dataframe')
        print(df)
        #print(len(df))
        return df

    def filter_tasks_flows(self, df):
        filt = df[(df['task_type'] == 'Supervised Classification') & 
                  (df['evaluation_measures'] == 'predictive_accuracy') & 
                  (df['estimation_procedure'] == '10-fold Crossvalidation')]['tid'].to_numpy()
        task_df = openml.evaluations.list_evaluations(function='predictive_accuracy', task=filt, output_format='dataframe')
        print(task_df)
        task_df = task_df[task_df['flow_name'].str.contains("sklearn")]
        print(len(task_df))
        return task_df

    def filter_dataset_top_5(self, task_df):
        list_array = []
        datasets = task_df.data_name
        datasets.value_counts
        value_data_name = np.unique(np.array(pd.Series(datasets)))    
        aux_value = len(value_data_name)
       
        for i in range(len(value_data_name)):
            pd_data_task_df = task_df[task_df['data_name'].str.contains(value_data_name[i])]
            best_five = pd_data_task_df.nlargest(5, 'value')
            data_task_id = np.unique(np.array(best_five.task_id))
           

            for task_id in data_task_id:
                x_var_task_id = best_five[(best_five['task_id'] == task_id)]
                x_var_flow_id = np.unique(np.array(x_var_task_id['flow_id']))
               

                for flow_id in x_var_flow_id:
                    x_var_run_id = best_five[(best_five['task_id'] == task_id) & (best_five['flow_id'] == flow_id)]
                    x_run_id = np.unique(np.array(x_var_run_id['run_id']))
                    for run_id in x_run_id:
                        print(value_data_name[i], flow_id, run_id)

                        
                        df = openml.evaluations.list_evaluations_setups(
                        function="predictive_accuracy",
                        flow=[flow_id],
                        task=[task_id],
                        run=[run_id],
                        output_format="dataframe",
                        parameters_in_separate_columns=False,
                       )

                        list_array.append(df)            
                        now_date = datetime.now()
                        dt_string = now_date.strftime("%d/%m/%Y %H:%M:%S")
                        print("date and time =", dt_string)
                    print("task_id = ", task_id)
           
        df_ = pd.concat(list_array,join='inner').sort_values(by=['data_name','value'], ascending=False)
        return df_

    '''def best_result(self, group):
        best_results = group.nlargest(1, 'value')
        if best_results['flow_name'].iloc[0] in self.top_5.index:
            return best_results'''
    
    def filter_datasets_flow_top(self, df_, n_flows):
        df_['flow_name'] =  df_['flow_name'].apply(lambda x: re.sub(r'\(\d+\)$', '', x))
        value_counts = df_['flow_name'].value_counts()
        print(value_counts)
        self.top_5 = value_counts.nlargest(n_flows) #number flows
        best_results = []
        for data_name in df_['data_name'].unique():
            df_data = df_[df_['data_name'] == data_name]
            best_result = df_data.nlargest(1, 'value')

            if best_result['flow_name'].iloc[0] in self.top_5.index:
                best_results.append(best_result)

        if len(best_results) > 0:
            best_results = pd.concat(best_results, ignore_index=True)
            best_results.to_csv("best_results.csv", sep=';')
            print(best_results)
        else:
            print("No best results were found.")

In [3]:
search_space = SearchSpace()
df = search_space.filter_datasets()

          tid  ttid    did                   name                  task_type  \
0           3     1      3               kr-vs-kp  Supervised Classification   
1           6     1      6                 letter  Supervised Classification   
2          10     1     10                  lymph  Supervised Classification   
3          11     1     11          balance-scale  Supervised Classification   
4          14     1     14          mfeat-fourier  Supervised Classification   
...       ...   ...    ...                    ...                        ...   
16374  361970     2  45665                  colon      Supervised Regression   
16375  361974     2  45669                 breast      Supervised Regression   
16376  361978     2  45693  simulated_electricity      Supervised Regression   
16377  361983     1  45711               doa_bwin  Supervised Classification   
16378  361984     1  45712      doa_bwin_balanced  Supervised Classification   

       status     estimation_procedure 

In [4]:
task_df = search_space.filter_tasks_flows(df)

          run_id  task_id  setup_id  flow_id  \
0             25       10         9       64   
1             43       10         6       61   
2             74       10         7       62   
3             96       10        18       77   
4            228       10        15       74   
...          ...      ...       ...      ...   
179320  10593805   361113   8305263    19449   
179321  10593806   361114   8305263    19449   
179322  10593807   361115   8305263    19449   
179323  10593808   361116   8305263    19449   
179324  10593809   361127   8305263    19449   

                                                flow_name  data_id  \
0                                      weka.RandomTree(1)       10   
1                                         weka.REPTree(1)       10   
2                                   weka.DecisionStump(1)       10   
3                        weka.LogitBoost_DecisionStump(1)       10   
4                                        weka.Logistic(1)       10   
...

In [5]:
finale_df = search_space.filter_dataset_top_5(task_df)

2dplanes 4834 1838157
date and time = 27/11/2023 15:18:39
task_id =  3593


  pd_data_task_df = task_df[task_df['data_name'].str.contains(value_data_name[i])]


CastMetal1 4835 1838722
date and time = 27/11/2023 15:18:41
task_id =  10096
CastMetal1 7722 9200367
date and time = 27/11/2023 15:18:43
task_id =  10096
CastMetal1 7725 9201552
date and time = 27/11/2023 15:18:45
CastMetal1 7725 9201553
date and time = 27/11/2023 15:18:47
task_id =  10096
CastMetal1 7756 9200955
date and time = 27/11/2023 15:18:48
task_id =  10096
Click_prediction_small 6969 5966372
date and time = 27/11/2023 15:18:50
Click_prediction_small 6969 5983910
date and time = 27/11/2023 15:18:53
Click_prediction_small 6969 5992757
date and time = 27/11/2023 15:18:55
Click_prediction_small 6969 6011408
date and time = 27/11/2023 15:18:57
Click_prediction_small 6969 6011861
date and time = 27/11/2023 15:18:59
task_id =  14971
CostaMadre1 4830 1837506
date and time = 27/11/2023 15:19:00
task_id =  7557
CostaMadre1 4834 1836518
date and time = 27/11/2023 15:19:02
task_id =  7557
CostaMadre1 4835 1838636
date and time = 27/11/2023 15:19:04
task_id =  7557
CovPokElec 4834 1838145


In [6]:
test = search_space.filter_datasets_flow_top(finale_df,5)
test

sklearn.tree.tree.DecisionTreeClassifier                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   308
sklearn.ensemble.forest.RandomForestClassifier                                                                                                                                                                                                                                                                                                                                                           

0
2dplanes 4834 1838157
date and time = 30/05/2023 19:58:06
task_id =  3593
1
BNG(anneal.ORIG) 4834 1837029
date and time = 30/05/2023 19:58:08
task_id =  2136
2
BNG(anneal.ORIG,nominal,1000000) 4834 1836619
date and time = 30/05/2023 19:58:10
task_id =  133
3
BNG(autos) 4834 1837038
date and time = 30/05/2023 19:58:11
task_id =  2139
4
BNG(autos,nominal,1000000) 4834 1836636
date and time = 30/05/2023 19:58:13
task_id =  137
5
BNG(breast-cancer,nominal,1000000) 4834 1836638
date and time = 30/05/2023 19:58:15
task_id =  139
6
BNG(breast-w) 4834 1837054
date and time = 30/05/2023 19:58:17
task_id =  2142
BNG(breast-w) 17477 10437807
date and time = 30/05/2023 19:58:19
task_id =  2142
BNG(breast-w) 17576 10437763
date and time = 30/05/2023 19:58:20
task_id =  2142
BNG(breast-w) 17590 10437793
date and time = 30/05/2023 19:58:22
task_id =  2142
BNG(breast-w) 17593 10437831
date and time = 30/05/2023 19:58:24
task_id =  2142
7
BNG(bridges_version1) 4834 1837425
date and time = 30/05/2023 

Unnamed: 0,run_id,task_id,setup_id,flow_id,flow_name,data_id,data_name,function,upload_time,uploader,uploader_name,value,values,array_data,parameters
0,1837651,2262,24107,4834,sklearn.tree.tree.DecisionTreeClassifier(3),267,BNG(heart-statlog),predictive_accuracy,2016-10-28 16:10:49,1526,Josep Pon Farreny,0.851855,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
0,1837647,2261,24107,4834,sklearn.tree.tree.DecisionTreeClassifier(3),266,BNG(heart-c),predictive_accuracy,2016-10-28 16:06:29,1526,Josep Pon Farreny,0.849577,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
0,1836958,202,24107,4834,sklearn.tree.tree.DecisionTreeClassifier(3),133,"BNG(glass,nominal,137781)",predictive_accuracy,2016-10-28 09:55:11,1526,Josep Pon Farreny,0.691852,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
0,1837646,2260,24107,4834,sklearn.tree.tree.DecisionTreeClassifier(3),265,BNG(glass),predictive_accuracy,2016-10-28 16:01:53,1526,Josep Pon Farreny,0.659525,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
0,1837626,2258,24107,4834,sklearn.tree.tree.DecisionTreeClassifier(3),263,BNG(dermatology),predictive_accuracy,2016-10-28 15:31:23,1526,Josep Pon Farreny,0.975339,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
0,1837625,2257,24107,4834,sklearn.tree.tree.DecisionTreeClassifier(3),262,BNG(cylinder-bands),predictive_accuracy,2016-10-28 15:26:28,1526,Josep Pon Farreny,0.773271,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
0,1837620,2255,24107,4834,sklearn.tree.tree.DecisionTreeClassifier(3),260,BNG(credit-g),predictive_accuracy,2016-10-28 15:05:04,1526,Josep Pon Farreny,0.719121,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
0,1837616,2253,24107,4834,sklearn.tree.tree.DecisionTreeClassifier(3),258,BNG(credit-a),predictive_accuracy,2016-10-28 14:58:04,1526,Josep Pon Farreny,0.849888,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
0,1838146,3017,24107,4834,sklearn.tree.tree.DecisionTreeClassifier(3),256,BNG(colic.ORIG),predictive_accuracy,2016-10-31 10:04:47,1526,Josep Pon Farreny,0.790979,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
0,1837611,2147,24107,4834,sklearn.tree.tree.DecisionTreeClassifier(3),256,BNG(colic.ORIG),predictive_accuracy,2016-10-28 14:46:28,1526,Josep Pon Farreny,0.786859,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...


Unnamed: 0,run_id,task_id,setup_id,flow_id,flow_name,data_id,data_name,function,upload_time,uploader,uploader_name,value,values,array_data,parameters
0,1838157,3593,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,727,2dplanes,predictive_accuracy,2016-10-31 11:27:01,1526,Josep Pon Farreny,0.907746,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
1,1837029,2136,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,245,BNG(anneal.ORIG),predictive_accuracy,2016-10-28 12:13:34,1526,Josep Pon Farreny,0.869905,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
2,1836619,133,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,71,"BNG(anneal.ORIG,nominal,1000000)",predictive_accuracy,2016-10-27 19:12:39,1526,Josep Pon Farreny,0.879188,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
3,1837038,2139,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,248,BNG(autos),predictive_accuracy,2016-10-28 12:34:35,1526,Josep Pon Farreny,0.645372,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
4,1836636,137,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,75,"BNG(autos,nominal,1000000)",predictive_accuracy,2016-10-27 19:28:54,1526,Josep Pon Farreny,0.755157,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
5,1836638,139,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,77,"BNG(breast-cancer,nominal,1000000)",predictive_accuracy,2016-10-27 19:34:47,1526,Josep Pon Farreny,0.778924,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
6,10437807,2142,8259125,17477,sklearn.pipeline.Pipeline(model=sklearn.ensemb...,251,BNG(breast-w),predictive_accuracy,2020-03-03 18:22:33,11497,Fares Gaaloul,0.986054,,,{'sklearn.pipeline.Pipeline(model=sklearn.ense...
7,1837425,2144,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,253,BNG(bridges_version1),predictive_accuracy,2016-10-28 14:10:26,1526,Josep Pon Farreny,0.728416,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
8,10437808,2146,8259125,17477,sklearn.pipeline.Pipeline(model=sklearn.ensemb...,255,BNG(cmc),predictive_accuracy,2020-03-03 18:23:45,11497,Fares Gaaloul,0.564652,,,{'sklearn.pipeline.Pipeline(model=sklearn.ense...
9,1836928,145,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,119,"BNG(cmc,nominal,55296)",predictive_accuracy,2016-10-28 08:56:12,1526,Josep Pon Farreny,0.511339,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...


Unnamed: 0,run_id,task_id,setup_id,flow_id,flow_name,data_id,data_name,function,upload_time,uploader,uploader_name,value,values,array_data,parameters
0,1838157,3593,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,727,2dplanes,predictive_accuracy,2016-10-31 11:27:01,1526,Josep Pon Farreny,0.907746,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
1,1837029,2136,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,245,BNG(anneal.ORIG),predictive_accuracy,2016-10-28 12:13:34,1526,Josep Pon Farreny,0.869905,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
2,1836619,133,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,71,"BNG(anneal.ORIG,nominal,1000000)",predictive_accuracy,2016-10-27 19:12:39,1526,Josep Pon Farreny,0.879188,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
3,1837038,2139,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,248,BNG(autos),predictive_accuracy,2016-10-28 12:34:35,1526,Josep Pon Farreny,0.645372,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
4,1836636,137,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,75,"BNG(autos,nominal,1000000)",predictive_accuracy,2016-10-27 19:28:54,1526,Josep Pon Farreny,0.755157,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
5,1836638,139,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,77,"BNG(breast-cancer,nominal,1000000)",predictive_accuracy,2016-10-27 19:34:47,1526,Josep Pon Farreny,0.778924,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
6,10437807,2142,8259125,17477,sklearn.pipeline.Pipeline(model=sklearn.ensemb...,251,BNG(breast-w),predictive_accuracy,2020-03-03 18:22:33,11497,Fares Gaaloul,0.986054,,,{'sklearn.pipeline.Pipeline(model=sklearn.ense...
7,1837425,2144,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,253,BNG(bridges_version1),predictive_accuracy,2016-10-28 14:10:26,1526,Josep Pon Farreny,0.728416,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
8,10437808,2146,8259125,17477,sklearn.pipeline.Pipeline(model=sklearn.ensemb...,255,BNG(cmc),predictive_accuracy,2020-03-03 18:23:45,11497,Fares Gaaloul,0.564652,,,{'sklearn.pipeline.Pipeline(model=sklearn.ense...
9,1836928,145,24107,4834,sklearn.tree.tree.DecisionTreeClassifier,119,"BNG(cmc,nominal,55296)",predictive_accuracy,2016-10-28 08:56:12,1526,Josep Pon Farreny,0.511339,,,{'sklearn.tree.tree.DecisionTreeClassifier(3)_...
