In [217]:
import mlflow
import pandas as pd
import numpy as np
import argparse
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model
from tensorflow.keras.datasets.mnist import load_data
from ray import tune
import os
import datetime
import uuid
import time
import logging
idx = pd.IndexSlice

# Setting MLFlow

In [245]:
EXPERIMENT_NAME = 'Try_3'
#mlflow.set_tracking_uri(URI) # If you have an instance
mlflow.set_experiment(EXPERIMENT_NAME)
print("Experiment Name: ", EXPERIMENT_NAME)

INFO: 'Try_3' does not exist. Creating a new experiment
Experiment Name:  Try_3


# Building params

In [246]:
# INPUTS
DATA_PATH = '/Users/camilovelasquez/Desktop/Documents/Datasets/WISDM-Smartphones/wisdm-dataset/raw'
ids = np.arange(1600, 1650)
devices = ['phone']
sensors = ['accel']
activities = ['A', 'B']
time_taken = 3000
time_split = 100

train_batch_size = 8
eval_batch_size = 16
epochs = 5

train_size = 2500
valid_size = 220
test_size = 220

""" 
Data : 
    Source of Dataset, Features Added, Structure, etc

Splitting (Adequating tf.datasets) :
        Train size, test size, batch_size, epochs, etc

Preprocessing :
        Sampling, Normalization, Scalers, Imputers, etc
        
Model : 
    Model architecture, characteristics, attributes
Loss : 
    Loss functions, custom losses.
Optimizers : 
    Optimizers and its parameters
Metrics : 
    Metrics and custom metrics
train : 
    Class weights, callbacks, steps per epoch
"""
                
params = {'data': {'value' : DATA_PATH, 
                   'params' : {'ids' : ids, 
                               'devices' : devices, 
                               'sensors' : sensors, 
                               'activities' : activities, 
                               'time_taken' : time_taken, 
                               'time_split' : time_split}},
          'split': {'value' : None, # IF you have a function with X splliting ways
                    'params' : {'train_size' : train_size, 
                                'valid_size' : valid_size, 
                                'test_size' : test_size, 
                                'train_batch_size' : train_batch_size, 
                                'eval_batch_size' : eval_batch_size,
                                'epochs' : epochs}},
          'preprocess' : {'value' : None, 
                          'params' : {}},
          'model' : {'value' : 'base_conv', 
                     'params' : {'layers' : 3,
                                 'filters_0' : 32, 
                                 'filters_1' : 32, 
                                 'kernel_0': 7, 
                                 'kernel_1' : 9}},
          'loss' : {'value' : 'binary_crossentropy', 
                    'params' : {}},
          'optimizer' : {'value' : 'adam',
                         'params' : {'lr' : 0.001}}, 
          'metrics' : {'value' : 'accuracy', 
                       'params': {}}, 
          'train' : {'value' : None, 
                   'params' : {'class_weight' : None, 
                               'callbacks' : ['es', 'cp'], 
                               'steps_per_epoch' : epochs//train_batch_size}}
         }

In [247]:
# Preprocessing data
def read_WISDM_data(DATA_PATH, ids=np.arange(1600, 1650), 
                    devices=['phone'], sensors=['accel']):
    """Read from DATA PATH and create a pandas table from it"""
    i = 0
    for current_id in ids:
        for current_device in devices:
            for current_sensor in sensors:
                file_path = os.path.join(DATA_PATH, current_device, current_sensor, 
                                         'data_{}_{}_{}.txt'.format(current_id, current_sensor, current_device))
                if i==0:
                    table = pd.read_csv(file_path, delimiter=',', 
                                        names=['ID', 'Activity Label', 'Timestamp', 'x', 'y', 'z'], 
                                        lineterminator='\n')
                else:
                    aux = pd.read_csv(file_path, delimiter=',', 
                                      names=['ID', 'Activity Label', 'Timestamp', 'x', 'y', 'z'], 
                                        lineterminator='\n')
                    table = pd.concat([table, aux], axis=0)
                i+=1
    table.loc[:,'z'] = table.z.str.replace(';','').astype(np.float32)
    return table

def transform_data(table, time_taken, time_split):
    """Transform data from raw table into a zip of (features, labels),
        where features has shape (samples, time_steps, features), and labels (samples,)"""
    table = table.set_index(['ID', 'Activity Label'])
    table = table.groupby(['ID', 'Activity Label']).head(time_taken)
    timestamp_edit = np.tile(np.arange(0,time_split), int(table.shape[0]/time_split))
    table['Timestamp'] = timestamp_edit
    table = table.reset_index().set_index(['ID', 'Activity Label', 'Timestamp'], append=True)
    features = table.values.reshape((int(table.shape[0]/time_split), time_split, table.shape[1]))
    labels = table.reset_index()['Activity Label']\
        .values[np.arange(0,int(table.shape[0]/time_split)*time_split, time_split)]
    return features, labels

def preprocessing_data(table, time_taken=3000, time_split=100, activities=['A', 'B']):
    """Preprocess table and convert it into tf dataset"""
    features, labels = transform_data(table, time_taken=time_taken, time_split=time_split)
    dataset = tf.data.Dataset.from_tensor_slices(({'feature': features}, {'label': labels}))
    ds = dataset.filter(lambda x, y: tf.reduce_any(tf.equal(y['label'], activities))==True)
    ds = ds.map(label2prob)
    return ds

def label2prob(feature, label):
    new_label = tf.where(tf.equal(label['label'], 'A'), 1, 0)
    label['label'] = new_label
    return feature, label

In [248]:
#Model params
def build_model(time_split):
    inputs = tf.keras.Input(shape=(time_split, 3), name='feature')
    x = tf.keras.layers.BatchNormalization(axis=2)(inputs)
    x_1 = tf.keras.layers.Conv1D(filters=32, kernel_size=15)(x)
    x_1 = tf.keras.layers.GlobalMaxPool1D()(x_1)
    x_3 = tf.keras.layers.Conv1D(filters=32, kernel_size=31)(x)
    x_3 = tf.keras.layers.GlobalMaxPool1D()(x_3)
    x_5 = tf.keras.layers.Conv1D(filters=32, kernel_size=63)(x)
    x_5 = tf.keras.layers.GlobalMaxPool1D()(x_5)
    x = tf.keras.layers.Concatenate()([x_1, x_3, x_5])
    x = tf.keras.layers.Dense(16, activation='relu')(x)
    output = tf.keras.layers.Dense(1, activation='sigmoid', name='label')(x)
    model = tf.keras.Model(inputs=[inputs], outputs=output)
    return model

def build_optimizer():
    optimizer='adam'
    return optimizer

def build_loss():
    loss='binary_crossentropy'
    return loss

def build_metrics():
    metrics=['accuracy']
    return metrics

def compile_model(model, optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']):
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    return model

def train_model(model, train_ds=None, valid_ds=None):
    model.fit()

def myprint(s, filename='modelsummary.txt'):
    with open(filename,'w+') as f:
        print(s, file=f)

In [249]:
def build_callbacks(params):
    callbacks = []
    if 'es' in params['train']['params']['callbacks']:
        es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                               patience=params['split']['params']['epochs']//2,
                                               mode='max',
                                               restore_best_weights=True)
        callbacks.append(es_callback)
    elif 'cp' in params['train']['params']['callbacks']:
        cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         save_weights_only=True,
                                                         verbose=1)
        callbacks.append(cp_callback)
    return callbacks

In [250]:
def transform_key_value_pair(kv_pairs, key, dictionary):
    if type(dictionary) is dict:
        for new_key, new_value in dictionary.items():
            if key is not None:
                transform_key_value_pair(kv_pairs, key + '_' + str(new_key), new_value)
            else:
                transform_key_value_pair(kv_pairs, new_key, new_value)    
    else:
        kv_pairs[key] = dictionary
        
def example(**args):
    print(args)
    for arg in args:
        print(arg)

In [251]:
date = datetime.datetime.now().strftime('%Y_%m_%d')
summary_filename = 'summary.txt'
run_name = date + str(uuid.uuid4())[:5]
with mlflow.start_run(run_name=run_name) as run:
    start_time = time.time()
    # Params
    params_kvpairs = dict()
    transform_key_value_pair(params_kvpairs, None, params)
    for k, v in params_kvpairs.items():
        if type(v) in [list, dict, str, int, float, bool, tuple]:
            mlflow.log_param(k, v)
    complete_param = time.time()
    
    # Preprocess
    table = read_WISDM_data(DATA_PATH, ids=ids, devices=devices, sensors=sensors)
    ds = preprocessing_data(table, time_taken=time_taken, time_split=time_split, activities=activities)
    complete_preprocess = time.time()
    
    #Model
    model = build_model(time_split=time_split)
    optimizer = build_optimizer()
    loss = build_loss()
    metrics = build_metrics()
    model = compile_model(model=model, optimizer=optimizer, loss=loss, metrics=metrics)
    model.summary(print_fn=lambda x: myprint(x, summary_filename))
    
    #Build dataset
    ds_train = ds.take(train_size)
    ds_train = ds_train.shuffle(buffer_size=TRAIN_BATCH_SIZE)
    ds_train = ds_train.repeat(count=EPOCHS)
    ds_train = ds_train.batch(TRAIN_BATCH_SIZE)
    ds_train = ds_train.prefetch(1)

    ds_valid = ds.skip(train_size).take(valid_size)
    ds_valid = ds_valid.repeat(count=1)
    ds_valid = ds_valid.batch(TRAIN_BATCH_SIZE)
    ds_valid = ds_valid.prefetch(1)

    ds_test = ds.skip(train_size+valid_size)
    ds_test = ds_test.batch(EVAL_BATCH_SIZE)
    
    # Train
    history = model.fit(ds_train, 
                    validation_data=ds_valid,
                    steps_per_epoch=train_size//TRAIN_BATCH_SIZE,  
                    epochs=EPOCHS, 
                    callbacks=callbacks,
                    verbose=1, 
                    shuffle=False)
    complete_train = time.time()
    model.save('model_'+ run_name + '.h5')
    
    #Evaluate
    test_result = model.evaluate(ds_test)
    complete_evaluation = time.time()
    
    # Times
    preprocess_time = complete_preprocess - start_time
    train_time = complete_train - complete_preprocess
    evaluate_time = complete_evaluation - complete_train
    
    #Metrics
    mlflow.log_metric('val_accuracy', max(history.history['val_accuracy']))
    
    #Artifacts
    mlflow.log_artifact(summary_filename)
    
    mlflow.end_run()

Train for 312 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Using params for everything

In [220]:
# INPUTS
DATA_PATH = '/Users/camilovelasquez/Desktop/Documents/Datasets/WISDM-Smartphones/wisdm-dataset/raw'
ids = np.arange(1600, 1650)
devices = ['phone']
sensors = ['accel']
activities = ['A', 'B']
time_taken = 3000
time_split = 100

train_batch_size = 8
eval_batch_size = 16
epochs = 5

train_size = 2500
valid_size = 220
test_size = 220
""" 
Data : 
    Source of Dataset, Features Added, Structure, etc

Splitting (Adequating tf.datasets) :
        Train size, test size, batch_size, epochs, etc

Preprocessing :
        Sampling, Normalization, Scalers, Imputers, etc
        
Model : 
    Model architecture, characteristics, attributes
Loss : 
    Loss functions, custom losses.
Optimizers : 
    Optimizers and its parameters
Metrics : 
    Metrics and custom metrics
train : 
    Class weights, callbacks, steps per epoch
callbacks : 
    Callbacks and its parameters
"""
params = {'data': {'value' : DATA_PATH, 
                   'params' : {'ids' : ids, 
                               'devices' : devices, 
                               'sensors' : sensors, 
                               'activities' : activities, 
                               'time_taken' : time_taken, 
                               'time_split' : time_split}},
          'split': {'value' : None, # IF you have a function with X splliting ways
                    'params' : {'train_size' : train_size, 
                                'valid_size' : valid_size, 
                                'test_size' : test_size, 
                                'train_batch_size' : train_batch_size, 
                                'eval_batch_size' : eval_batch_size,
                                'epochs' : epochs}},
          'preprocess' : {'value' : None, 
                          'params' : {}},
          'model' : {'value' : 'base_conv', 
                     'params' : {'layers' : 3,
                                 'filters_0' : 32, 
                                 'filters_1' : 32, 
                                 'kernel_0': 7, 
                                 'kernel_1' : 9}},
          'loss' : {'value' : 'binary_crossentropy', 
                    'params' : {}},
          'optimizer' : {'value' : 'adam',
                         'params' : {'lr' : 0.001}}, 
          'metrics' : {'value' : 'accuracy', 
                       'params': {}}, 
          'train' : {'value' : None, 
                   'params' : {'class_weight' : None, 
                               'steps_per_epoch' : epochs//train_batch_size}},
          'callbacks' : {'es' : {}, # Early Stopping 
                         'tb' : {}, # TensorBoard
                         'cp' : {}, # Checkpoint 
                         'ls' : {}, # Learning Rate Scheduler
                        }
         }

In [242]:
# Preprocessing data
def read_WISDM_data(data_path, ids=np.arange(1600, 1650), 
                    devices=['phone'], sensors=['accel'], **kwargs):
    """Read from DATA PATH and create a pandas table from it
    Read from params['data']
    """
    i = 0
    for current_id in ids:
        for current_device in devices:
            for current_sensor in sensors:
                file_path = os.path.join(data_path, current_device, current_sensor, 
                                         'data_{}_{}_{}.txt'.format(current_id, current_sensor, current_device))
                if i==0:
                    table = pd.read_csv(file_path, delimiter=',', 
                                        names=['ID', 'Activity Label', 'Timestamp', 'x', 'y', 'z'], 
                                        lineterminator='\n')
                else:
                    aux = pd.read_csv(file_path, delimiter=',', 
                                      names=['ID', 'Activity Label', 'Timestamp', 'x', 'y', 'z'], 
                                        lineterminator='\n')
                    table = pd.concat([table, aux], axis=0)
                i+=1
    table.loc[:,'z'] = table.z.str.replace(';','').astype(np.float32)
    return table

def transform_data(table, time_taken, time_split):
    """Transform data from raw table into a zip of (features, labels),
        where features has shape (samples, time_steps, features), and labels (samples,)"""
    table = table.set_index(['ID', 'Activity Label'])
    table = table.groupby(['ID', 'Activity Label']).head(time_taken)
    timestamp_edit = np.tile(np.arange(0,time_split), int(table.shape[0]/time_split))
    table['Timestamp'] = timestamp_edit
    table = table.reset_index().set_index(['ID', 'Activity Label', 'Timestamp'], append=True)
    features = table.values.reshape((int(table.shape[0]/time_split), time_split, table.shape[1]))
    labels = table.reset_index()['Activity Label']\
        .values[np.arange(0,int(table.shape[0]/time_split)*time_split, time_split)]
    return features, labels

def preprocessing_data(table, time_taken=3000, time_split=100, activities=['A', 'B'], **kwargs):
    """Preprocess table and convert it into tf dataset"""
    features, labels = transform_data(table, time_taken=time_taken, time_split=time_split)
    dataset = tf.data.Dataset.from_tensor_slices(({'feature': features}, {'label': labels}))
    ds = dataset.filter(lambda x, y: tf.reduce_any(tf.equal(y['label'], activities))==True)
    ds = ds.map(label2prob)
    return ds

def label2prob(feature, label):
    new_label = tf.where(tf.equal(label['label'], 'A'), 1, 0)
    label['label'] = new_label
    return feature, label

In [None]:
#Model params
def build_model(time_split, **kwargs):
    inputs = tf.keras.Input(shape=(time_split, 3), name='feature')
    x = tf.keras.layers.BatchNormalization(axis=2)(inputs)
    x_1 = tf.keras.layers.Conv1D(filters=32, kernel_size=15)(x)
    x_1 = tf.keras.layers.GlobalMaxPool1D()(x_1)
    x_3 = tf.keras.layers.Conv1D(filters=32, kernel_size=31)(x)
    x_3 = tf.keras.layers.GlobalMaxPool1D()(x_3)
    x_5 = tf.keras.layers.Conv1D(filters=32, kernel_size=63)(x)
    x_5 = tf.keras.layers.GlobalMaxPool1D()(x_5)
    x = tf.keras.layers.Concatenate()([x_1, x_3, x_5])
    x = tf.keras.layers.Dense(16, activation='relu')(x)
    output = tf.keras.layers.Dense(1, activation='sigmoid', name='label')(x)
    model = tf.keras.Model(inputs=[inputs], outputs=output)
    return model

def build_optimizer():
    optimizer='adam'
    return optimizer

def build_loss():
    loss='binary_crossentropy'
    return loss

def build_metrics():
    metrics=['accuracy']
    return metrics

def compile_model(model, optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']):
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    return model

def train_model(model, train_ds=None, valid_ds=None):
    model.fit()

def myprint(s, filename='modelsummary.txt'):
    with open(filename,'w+') as f:
        print(s, file=f)

In [239]:
table = read_WISDM_data(params['data']['value'], **params['data']['params'])

In [244]:
ds = preprocessing_data(table, **params['data']['params'])

In [None]:
mlflow.set_experiment(EXPERIMENT_NAME)
with mlflow.start_run(run_name=run_name) as run:
    mlflow.log_param(k,v)
    mlflow.log_metric(metric, [1,2,3])
    mlflow.set_tag(k, v)
    mlflow.log_artifact('algo.json')
    mlflow.log_artifact('algo.png')
    
    
"""
pip install mlflow


Directory where mlruns folder is located
>> mlflow ui
"""

def transform_key_value_pair(kv_pairs, key, dictionary):
    """Flatten dicts in sum_keys: value single dict"""
    if type(dictionary) is dict:
        for new_key, new_value in dictionary.items():
            if key is not None:
                transform_key_value_pair(kv_pairs, key + '_' + str(new_key), new_value)
            else:
                transform_key_value_pair(kv_pairs, new_key, new_value)    
    else:
        kv_pairs[key] = dictionary

        
params_kvpairs = dict()
transform_key_value_pair(params_kvpairs, None, params)