## Materialization Correctness
To ensure the materialization code is working correctly, in this notebook, we create a medium size graph with several ml models and manually check the potential, recreation cost, and |pipelines| to compare with the computed values from the materialization code.

In [1]:
# numpy and pandas for data manipulation
import sys
import os
# Suppress warnings
import warnings

import matplotlib.pyplot as plt
import networkx as nx

import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import cPickle as pickle

warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
%matplotlib inline

ROOT = '/Users/bede01/Documents/work/phd-papers/ml-workload-optimization/'
ROOT_PACKAGE_DIRECTORY = '/Users/bede01/Documents/work/phd-papers/ml-workload-optimization/code/collaborative-optimizer'
root_data = ROOT + '/data'

sys.path.append(ROOT_PACKAGE_DIRECTORY)
from experiment_graph.executor import CollaborativeExecutor
from experiment_graph.workload import Workload
from experiment_graph.execution_environment import ExecutionEnvironment
from experiment_graph.benchmark_helper import BenchmarkMetrics

DATABASE_PATH = root_data + '/experiment_graphs/home-credit-default-risk/materialization-test'
N_ESTIMATOR = 100

In [2]:
class MaterializationTest(Workload):
    def run(self, execution_environment, root_data):
        # Load Data
        train = execution_environment.load(root_data +
                                           '/openml/task_id=31/train.csv')
        test = execution_environment.load(root_data +
                                          '/openml/task_id=31/test.csv')

        test_labels = test['class']
        test = test.drop('class')

        train_labels = train['class']
        train = train.drop(columns=['class'])

        train2 = train.drop('checking_status')
        test2 = test.drop('checking_status')
        from experiment_graph.sklearn_helper.preprocessing import MinMaxScaler
        scaler2 = MinMaxScaler(feature_range=(0, 1))
        scaler2.fit(train2)
        train2 = scaler2.transform(train2)
        test2 = scaler2.transform(test2)

        scaler = MinMaxScaler(feature_range=(0, 1))
        scaler.fit(train)
        train = scaler.transform(train)
        test = scaler.transform(test)
        # Random Forest 1 (n_estimator = 10)
        from experiment_graph.sklearn_helper.ensemble import RandomForestClassifier
        random_forest10 = RandomForestClassifier(
            n_estimators=10, random_state=50, verbose=1, n_jobs=-1)
        random_forest10.fit(train, train_labels)

        # Execute
        random_forest10.trained_node.data()

        print 'random_forest10: {}'.format(
            random_forest10.score(test, test_labels).data())

        # Random Forest 2 (n_estimator = 100)
        from experiment_graph.sklearn_helper.ensemble import RandomForestClassifier
        random_forest100 = RandomForestClassifier(
            n_estimators=100, random_state=50, verbose=1, n_jobs=-1)
        random_forest100.fit(train, train_labels)

        # Execute
        random_forest100.trained_node.data()

        print 'random_forest100: {}'.format(
            random_forest100.score(test, test_labels).data())

        # Logistic Regression
        from experiment_graph.sklearn_helper.linear_model import LogisticRegression

        # Make the model with the specified regularization parameter
        log_reg = LogisticRegression(C=0.0001)

        # Train on the training data
        log_reg.fit(train, train_labels)

        print 'log_reg: {}'.format(log_reg.score(test, test_labels).data())

        # Gradient Boosted 1 (n_estimator = 60)
        from experiment_graph.sklearn_helper.sklearn_wrappers import LGBMClassifier

        model60 = LGBMClassifier(
            n_estimators=60,
            objective='binary',
            class_weight='balanced',
            learning_rate=0.05,
            reg_alpha=0.1,
            reg_lambda=0.1,
            subsample=0.8,
            n_jobs=-1,
            random_state=50)

        # Train the model
        model60.fit(
            train,
            train_labels,
            custom_args={
                'eval_metric': 'accuracy',
                'verbose': 200
            })

        # Make predictions
        print 'model60: '.format(model60.score(test, test_labels).data())

        # Gradient Boosted 2 (n_estimator = 100)
        from experiment_graph.sklearn_helper.sklearn_wrappers import LGBMClassifier

        model100 = LGBMClassifier(
            n_estimators=100,
            objective='binary',
            class_weight='balanced',
            learning_rate=0.05,
            reg_alpha=0.1,
            reg_lambda=0.1,
            subsample=0.8,
            n_jobs=-1,
            random_state=50)

        # Train the model
        model100.fit(
            train,
            train_labels,
            custom_args={
                'eval_metric': 'accuracy',
                'verbose': 200
            })

        # Make predictions
        print 'model100: {}'.format(model100.score(test, test_labels).data())

        # Gradient Boosted 3 (n_estimator = 60) modified train
        from experiment_graph.sklearn_helper.sklearn_wrappers import LGBMClassifier

        model60_2 = LGBMClassifier(
            n_estimators=60,
            objective='binary',
            class_weight='balanced',
            learning_rate=0.05,
            reg_alpha=0.1,
            reg_lambda=0.1,
            subsample=0.8,
            n_jobs=-1,
            random_state=50)

        # Train the model
        model60_2.fit(
            train2,
            train_labels,
            custom_args={
                'eval_metric': 'accuracy',
                'verbose': 200
            })

        # Make predictions
        print 'model60_2: {}'.format(
            model60_2.score(test2, test_labels).data())

In [3]:
# Prepare the Data
# Create Execution Environment with AllMaterializer
ee = ExecutionEnvironment('dedup')
workload = MaterializationTest()
executor = CollaborativeExecutor(ee)
executor.end_to_end_run(workload=workload, root_data=root_data)

creating a new root node
creating a new root node


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.0s finished


random_forest10: {'accuracy': 0.79000000000000004}


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


random_forest100: {'accuracy': 0.76000000000000001}
log_reg: {'accuracy': 0.69999999999999996}
model60: 
model100: {'accuracy': 0.81000000000000005}
model60_2: {'accuracy': 0.78000000000000003}
details of test.csv{}
test.csv{}:0
details of 1F22A9E7687B4B9E5ED728910FC93F1A
1F22A9E7687B4B9E5ED728910FC93F1A:0.085
details of C42F2CDE6885202579E291D6BC09DAA6
C42F2CDE6885202579E291D6BC09DAA6:1.039
details of 0FC9E6AC48D4FCE5147BEB34DDDDFD50
0FC9E6AC48D4FCE5147BEB34DDDDFD50:2.294
details of train.csv{}
train.csv{}:0
details of DBDC4A474E8B084B17D4A95D82CBA082
DBDC4A474E8B084B17D4A95D82CBA082:0.201
details of 991F3690053BFE13CEDF335CFB510266
991F3690053BFE13CEDF335CFB510266:1.021
details of B6D925223AFD41F381EEEE2967E26DA5
B6D925223AFD41F381EEEE2967E26DA5:2.185
details of 1D4D55E9CF2EB86188615EFCAD918B8C
1D4D55E9CF2EB86188615EFCAD918B8C:3.309
details of 5C9271621A96AD66CB0D341714883C85
5C9271621A96AD66CB0D341714883C85:5.494
details of 596B0B62E30B223C703490CF5D545DAF
596B0B62E30B223C703490CF5D

AttributeError: 'str' object has no attribute 'put'

In [None]:
executor.execution_environment.experiment_graph.plot_graph(
    plt,
    figsize=(14, 20),
    labels_for_vertex=['size'],
    labels_for_edges=['name'],
    vertex_size=1200)

## Simple Based Materialization

In [None]:
from experiment_graph.materialization_algorithms.materialization_methods import HeuristicsMaterializer
budget = 1024
heuristicsMat = HeuristicsMaterializer(storage_budget=budget)

In [None]:
# Prepare the Data
# Create Execution Environment with AllMaterializer
ee = ExecutionEnvironment('dedup')
workload = MaterializationTest()
executor = CollaborativeExecutor(ee, heuristicsMat)
executor.end_to_end_run(workload=workload, root_data=root_data)

In [None]:
executor.execution_environment.experiment_graph.plot_graph(
    plt,
    figsize=(14, 20),
    labels_for_vertex=['size'],
    labels_for_edges=['name'],
    vertex_size=1200)

In [None]:
print 'Total size of all the artifacts: {}'.format(
    executor.execution_environment.experiment_graph.get_total_size())

print 'Total size of all the materialized artifacts: {}'.format(
    executor.execution_environment.experiment_graph.get_real_size())

print 'Sum of size of all the materialized artifacts: {}'.format(
    executor.execution_environment.experiment_graph.
    get_total_materialized_size())

## Storage Aware Materialization

In [None]:
from experiment_graph.materialization_algorithms.materialization_methods import StorageAwareMaterializer
budget = 1024
heuristicsMat = StorageAwareMaterializer(storage_budget=budget)

In [None]:
# Prepare the Data
# Create Execution Environment with AllMaterializer
ee = ExecutionEnvironment('dedup')
workload = MaterializationTest()
executor = CollaborativeExecutor(ee, heuristicsMat)
executor.end_to_end_run(workload=workload, root_data=root_data)

In [None]:
executor.execution_environment.experiment_graph.plot_graph(
    plt,
    figsize=(14, 20),
    labels_for_vertex=['size'],
    labels_for_edges=['name'],
    vertex_size=1200)

In [None]:
print 'Total size of all the artifacts: {}'.format(
    executor.execution_environment.experiment_graph.get_total_size())

print 'Total size of all the materialized artifacts: {}'.format(
    executor.execution_environment.experiment_graph.get_real_size())

print 'Real size of all the materialized artifacts: {}'.format(
    executor.execution_environment.experiment_graph.
    get_total_materialized_size())

In [None]:
graph = executor.execution_environment.experiment_graph.graph