__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/bludwig/blob/main/jupyter/40_Train_All_Datasets/_10_Train_All_Datasets.ipynb)__

In [1]:
# blab init
# Bootstrap cell: make sure the `blab` package can be imported (it is installed on the
# fly when the import fails), then run the startup notebook whose path
# blab.blab_startup() returns.
# NOTE(review): later cells use `environment`, `bgc` and `Stop` without defining them;
# presumably they are provided by this startup notebook -- confirm.
try:
    import blab
except ImportError as e:
    # not installed yet (e.g. fresh Colab runtime): install quietly and retry the import
    !pip install --quiet blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

blab init
environment['in_colab']     = False
environment['dropbox_path'] = /home/me/Data_Linux/Dropbox
environment['lib_path']     = /home/me/Data_Linux/Dropbox/31_Projekte/01_Python/libs
Start Time: 16:39:13


# Train All Datasets
* Continuable if interrupted
* Cleanup / Restart: set `mode = 'C'` in the Settings cell and run the notebook once, then switch back to `mode = 'T'`

In [2]:
# Settings

mode      = 'T'          # mode: C=Cleanup, T=Train
use_cache = True         # for scan_datasets. The cache is scan_datasets.temp.pickle
rows_max  = 99_999_999   # datasets bigger than max rows are ignored

# do_not_scan: datasets that are neither scanned nor trained, grouped by the reason
very_big = [
    'amazon_review_polarity',
    'amazon_reviews',
    'jigsaw_unintended_bias',
    'mercari_price_suggestion',
    'yahoo_answers',
    'synthetic_fraud',
]
very_long = ['melbourne_airbnb']
very_bad = ['goodbooks_books_0']
error_train = [
    'reuters_cmu',
    'ohsumed_cmu',
    'flickr8k',
    'california_house_price',
    'goemotions',
]
other_problems = ['higgs', 'alpaca', 'news_popularity2']
do_not_scan = [*very_big, *very_long, *very_bad, *error_train, *other_problems]

# Base directory (relative path) that the Colab section copies results and logs to
gdrive_base = 'drive/MyDrive/Colab'
bgc('Beige')

In [3]:
# Colab? >> Install the latest version of Ludwig
# Only runs on Colab (environment['in_colab']): removes tensorflow, installs Ludwig
# straight from the GitHub repository, then ptitprince and bludwig.
# NOTE(review): none of the installs is version-pinned, so results depend on the day
# the notebook is run.
if environment['in_colab']:
  !pip uninstall -y tensorflow --quiet
  !python -m pip install git+https://github.com/ludwig-ai/ludwig.git --quiet 
  !pip install ptitprince --quiet 
  !pip install bludwig --quiet 

In [4]:
# cleanup?
if mode == 'C': 
    try:
        !rm -rf results
        !rm *.meta.json
        !rm *.hdf5    
        !rm *.temp.pickle        
        !rm .lock_preprocessing
        !rm train_log*.csv
        !rm datasets.csv        
    except:
        pass
 

if mode == 'C':
    raise Stop
else:
    bgc('WhiteSmoke')

Stop Time:  16:39:15
Elapsed:    2 secs


In [None]:
# import
# Third-party packages used by the rest of the notebook.
import pandas as pd
import pandasklar as pak
# shorthand; the cells below call grid(...) on DataFrames
# (presumably an interactive table view -- confirm)
grid = pak.grid
import ludwig
import bludwig

## Define Datasets to Train

In [None]:
# get all datasets available
# Every dataset name Ludwig reports, minus the ones excluded via do_not_scan
dataset_names = [
    name
    for name in ludwig.datasets.list_datasets()
    if name not in do_not_scan
]
print(len(dataset_names),'datasets found')

In [None]:
# scan_datasets
# Build the overview of all candidate datasets; use_cache (see Settings) allows the
# scan to reuse its cache file.
datasets = bludwig.scan_datasets(
    dataset_names,
    use_cache=use_cache,
)

In [None]:
# sort_values
# Order by status (descending) and, within a status, by row count (ascending);
# the index is then reset via pandasklar.
datasets = pak.reset_index(
    datasets.sort_values(by=['status', 'rows'], ascending=[False, True])
)

In [None]:
# already done?
# Mark every dataset whose experiment already appears in train_log_big.csv as 'done',
# so the main loop skips it when the notebook is continued.
# Best effort: if the log cannot be read (typically because it does not exist yet),
# nothing is marked. Only Exception is caught, so a manual interrupt is not swallowed,
# and the reason is printed instead of being hidden.
try:
    done = set(pd.read_csv('train_log_big.csv').set_index('name').T.experiment_name)
    print('done:',done)
    mask = datasets.dataset_name.isin(done)
    datasets.loc[mask,'status'] = 'done'
except Exception as e:
    print('nothing', '({}: {})'.format(type(e).__name__, e))
    done = set()   # same type as in the success path

In [None]:
# Examine datasets ok so far
# Rows whose status is exactly 'ok'
mask = datasets['status'].eq('ok')
grid(datasets[mask])

In [None]:
# Examine datasets done / errors
# Everything whose status is not 'ok' (already done, errors, ...)
mask = datasets['status'].ne('ok')
grid(datasets[mask])

## Train

In [None]:
# try to continue train_log_all
# Continue with the raw log of an earlier run if there is one, otherwise start empty.
try:
    train_log_all = pd.read_csv('train_log_raw.csv')
except FileNotFoundError:
    # normal on a first run or after a cleanup
    train_log_all = pd.DataFrame()
except Exception as e:
    # The file exists but cannot be read. Still start empty (best effort), but say so:
    # NOTE(review): the main loop writes the log again after each training, which
    # presumably replaces the unreadable file -- confirm and back it up if needed.
    print('WARNING: train_log_raw.csv could not be read ({}: {}), starting with an empty log'
          .format(type(e).__name__, e))
    train_log_all = pd.DataFrame()

In [None]:
# main loop
# Train every eligible dataset, one after the other. The progress of each dataset is
# recorded in datasets.status ('ok load' -> 'ok init' -> 'ok train', or 'Error ...'),
# and datasets.csv is written after every training attempt.
# A failure of one dataset is reported and recorded, then the loop moves on. Only
# Exception is caught: a manual interrupt (KeyboardInterrupt) stops the run instead of
# being booked as an error of the current dataset.

for row_label, dataset in datasets.iterrows():

    experiment_name = dataset.dataset_name

    # skip: no positive row count, already trained, or explicitly excluded
    if dataset.rows <= 0:
        continue

    if dataset.status in ['done']:
        continue

    if experiment_name in do_not_scan:
        continue

    # row(s) of this dataset in the overview; reused for every status update below
    mask = datasets.dataset_name == experiment_name

    if dataset.rows > rows_max:
        datasets.loc[mask, 'status'] = 'too big'
        continue

    print('\n\n\nTrain', experiment_name)

    # load_dataset
    try:
        data_df, dataset_loader = bludwig.load_dataset(experiment_name)
        print(data_df.shape[0], 'records for training, validation and test')
        datasets.loc[mask, 'status'] = 'ok load'

    except Exception as e:
        print('Error load:', type(e).__name__, e)
        datasets.loc[mask, 'status'] = 'Error load'
        continue

    # configs & initialize
    try:
        configs = bludwig.configs(data_df, dataset_loader)
        # own counter name: the original reused the outer loop variable here
        for config_no, config in enumerate(configs):
            print('\n====== config for model {} ======\n'.format( experiment_name + '_' + str(config_no)))
            print(config)
            print('\n\n')
        ludwig_job = bludwig.LudwigJob( configs=configs, experiment_name=experiment_name, verbose=False)
        datasets.loc[mask, 'status'] = 'ok init'

    except Exception as e:
        print('Error init:', type(e).__name__, e)
        datasets.loc[mask, 'status'] = 'Error init'
        continue

    # train & log
    try:
        ludwig_job.experiment(data_df)
        train_log_all = pak.add_rows( train_log_all, ludwig_job.train_log )
        bludwig.train_log_to_csv(train_log_all)

        # datasets: copy selected values of this job's log into the overview
        datasets.loc[mask, 'status'] = 'ok train'
        for field in ['epochs','train_secs','train_time','loss']:
            mask_log = ludwig_job.train_log.name == field
            value = ludwig_job.train_log[mask_log].iloc[0].value
            datasets.loc[mask, field] = value
        datasets.to_csv( 'datasets.csv', index=False)

    except Exception as e:
        print('Error train:', type(e).__name__, e)
        datasets.loc[mask, 'status'] = 'Error train'
        datasets.to_csv( 'datasets.csv', index=False)
        continue

## Small Cleanup

In [None]:
try:
    !rm *.meta.json
    !rm *.hdf5     
    !rm .lock_preprocessing     
except:
    pass

## Examine datasets

In [None]:
# All datasets with their current status (not only the failed ones)
grid(datasets)

In [None]:
# Error train
# Datasets that loaded and initialised but failed during training
mask = datasets['status'].eq('Error train')
datasets[mask]

In [None]:
# Inconsistent
# Datasets that do not appear in the train log although their status does not
# explain why.
mask = train_log_all.name == 'experiment_name'
experiment_names = set(train_log_all[mask].value)
#experiment_names

mask1 = ~datasets.dataset_name.isin(experiment_names)
# The main loop writes 'Error load', 'Error init' and 'Error train'. The original
# filter listed 'Error loading', which the loop never sets, so load/init failures were
# reported as inconsistent. 'Error loading' is kept in case another step sets it.
mask2 = ~datasets.status.isin(['Error load', 'Error init', 'Error train', 'Error loading'])
mask = mask1  &  mask2
datasets[mask]

## Examine logs

In [None]:
# View of the complete training log built by bludwig, shown ordered by
# validation metric and model
train_log = bludwig.train_log(train_log_all, T=True)
train_log.sort_values(by=['validation_metric', 'model'])

In [None]:
# Build the "big" view of the complete training log with bludwig and display it
train_log_big = bludwig.train_log_big( train_log_all )
train_log_big

In [None]:
# NOTE(review): leftover interactive help lookup (shows the documentation of
# pak.add_rows); it contributes nothing to the results and can be deleted.
?pak.add_rows

In [None]:
# Add 'target_value' rows to the raw log: copies of those log rows whose
# (model_name, name) pair matches the (model_name, value) pair of a
# 'validation_metric' row. The matching is done by pak.isin (presumably a
# multi-column membership test -- confirm).
train_log_raw = train_log_all

# the rows naming each model's validation metric
mask = train_log_raw['name'] == 'validation_metric'
validation_metrics = train_log_raw[mask]

mask = pak.isin(
    train_log_raw,
    validation_metrics,
    left_on=['model_name', 'name'],
    right_on=['model_name', 'value'],
)
df = train_log_raw[mask].copy()
df['name'] = 'target_value'

# only_new is passed so that add_rows can skip rows that already exist
# (presumably judged by these three columns -- confirm)
train_log_raw = pak.add_rows(
    train_log_raw,
    df,
    only_new=['model_name', 'name', 'value'],
)

In [None]:
# Display the raw log as left by the previous cell (including any 'target_value' rows)
train_log_raw

## Colab: Copy back

In [None]:
# The remaining cells only apply on Colab. Anywhere else Stop is raised here
# (presumably this ends a "Run All" -- Stop is not defined in this file).
if not environment['in_colab']:
    raise Stop

In [None]:
# mount
# Mount Google Drive at /content/drive (Colab only). The import sits inside the guard
# so it is only attempted when running on Colab.
if environment['in_colab']:
    from google.colab import drive
    drive.mount('/content/drive')  

In [None]:
# mount
# NOTE(review): exact duplicate of the previous cell (mounts the same path again);
# one of the two cells can be deleted.
if environment['in_colab']:
    from google.colab import drive
    drive.mount('/content/drive')  

In [None]:
# Colab only: list the working directory to check what is there to copy back
if environment['in_colab']:
    !ls

In [None]:
# Colab only: list the contents of the results directory
if environment['in_colab']:
    !ls results

In [None]:
# results: copy to GDrive
# Colab only: copy the results directory and all train_log*.csv files below
# gdrive_base (a relative path, so it is resolved against the current directory).
# NOTE(review): if <gdrive_base>/results already exists, `cp -a results <target>`
# places the copy inside it (results/results) instead of replacing it -- verify
# before re-running.
if environment['in_colab']:
  results_path = gdrive_base + '/results'
  !cp -a results $results_path 
  !cp train_log*.csv $gdrive_base 

In [None]:
# Closing message: on Colab a reminder of what has to be copied back by hand,
# elsewhere just a completion notice.
if not environment['in_colab']:
    print('finished')
else:
    print('\n'.join([
        'Copy back manually:',
        '1) This ipynb-File, if changed',
        '2) results directory',
        '3) train_log file',
    ]))

In [None]:
# End of the notebook: raise Stop. Stop is not defined in this file
# (presumably provided by the startup notebook -- confirm).
raise Stop