In [1]:
import os
# On "local" and "dev" images (second dash-separated field of
# HYPERDRIVE_IMAGE) this cell builds the `hyperdrive` client itself.
# NOTE(review): on other images nothing is created here; presumably the
# platform pre-injects `hyperdrive` -- confirm.
if os.environ["HYPERDRIVE_IMAGE"].split("-")[1] in ("local", "dev"):
    cur_dir = os.getcwd()
    # Import and construct the client with /home/jovyan as the working directory.
    os.chdir("/home/jovyan")
    try:
        from hdsdk import Hyperdrive
        hyperdrive = Hyperdrive()
    finally:
        # Always return to the original directory, even when the import or
        # the constructor raises, so later cells resolve relative paths
        # against the directory the notebook was started in.
        os.chdir(cur_dir)

In [2]:
import os 
# HYPERDRIVE_IMAGE is a dash-separated tag: the first field is the compute
# name, the second the environment, and the last remaining field the flavor.
compute, environment, *flavor = os.environ["HYPERDRIVE_IMAGE"].split("-")
# The "scipy" image is exercised with the sklearn code path.
flavor = "sklearn" if flavor[-1] == "scipy" else flavor[-1]

print(os.environ["HDSDK_VERSION"])

if environment == "test":
    from importlib.metadata import version
    # Report the installed SDK version, then treat "test" as "qa" from here on.
    print("Installed SDK version:", version("hdsdk"))
    environment = "qa"

print(compute, environment, flavor)

In [3]:
# Names this notebook expects to find on the platform; later cells assert
# against both of them.
project_name = "Customer Segmentation"
datarepo_name = "Health Byte - Dev"

In [4]:
from uuid import uuid4
from sklearn.model_selection import train_test_split
import ipynbname
# Six-character random suffix (the tail of a fresh UUID4) that is embedded in
# the model and experiment names created further down.
unique = uuid4().hex[-6:]
# Bare expression so the notebook displays the project the client is bound to.
hyperdrive.project_name

'Customer Segmentation'

In [5]:
# The client must be bound to the project this notebook targets; the message
# reports both names so a failing run can be diagnosed from the output alone.
assert hyperdrive.project_name == project_name, (
    f"Expected project '{project_name}', got '{hyperdrive.project_name}'."
)
print("Project Correctly Assigned")

In [6]:
# List the DataRepos returned by the client and require at least one whose
# name matches the DataRepo this notebook relies on.
datarepos = hyperdrive.list_datarepos()
assert len(datarepos[datarepos.name == datarepo_name]) > 0, (
    f"DataRepo '{datarepo_name}' was not returned by list_datarepos()."
)
print(f"{datarepo_name} DataRepo available.")

In [7]:
# Make the target DataRepo the default, then verify the client reports it.
hyperdrive.set_default_datarepo(datarepo_name=datarepo_name)
assert hyperdrive.datarepo.name == datarepo_name, (
    f"Default DataRepo is '{hyperdrive.datarepo.name}', expected '{datarepo_name}'."
)
print(f"Default DataRepo '{datarepo_name}' Assigned.")

In [8]:
# Both source CSVs must already be present: exactly two listed datasets may
# carry one of the two expected base names.
datasets = hyperdrive.list_datasets()
required_rows = datasets[datasets.base_name.isin(["ht_agg.csv", "user_data.csv"])]
assert required_rows.shape[0] == 2, (
    "Datasets not uploaded via UI Successfully. The datasets ht_agg.csv and user_data.csv are available at https://github.com/gohypergiant/hyperdrive-sdk/tree/main/tests/data."
)
print("Datasets available.")

In [9]:
# Delete the merged dataset first -- presumably so the write below starts
# from a clean state on repeated runs (TODO confirm delete semantics).
hyperdrive.delete_dataset("merged_anonymous.parquet")

# Load both source CSVs and index each by "_id" so they can be joined on it.
ht_agg_df = hyperdrive.load_dataset("ht_agg.csv", "csv")
user_data_df = hyperdrive.load_dataset("user_data.csv", "csv")
user_data_df = user_data_df.set_index("_id")
ht_agg_df = ht_agg_df.set_index("_id")

# Join on the shared index, then drop the name columns before writing.
merged_df = ht_agg_df.merge(user_data_df, left_index=True, right_index=True)
merged_df = merged_df.drop(["first_name", "last_name"], axis=1)
hyperdrive.write_dataset(merged_df, "merged_anonymous")

# Refresh the listing and read the written dataset back for verification.
datasets = hyperdrive.list_datasets()
df = hyperdrive.load_dataset("merged_anonymous.parquet")

# Round-trip check on a known class count. `.get(..., 0)` turns a missing
# class into a descriptive assertion failure instead of a bare KeyError.
lifestyle_counts = df.value_counts("lifestyle").to_dict()
cardio_trainer_count = lifestyle_counts.get("cardio trainer", 0)
assert cardio_trainer_count == 75, (
    f"Expected 75 'cardio trainer' rows in merged_anonymous.parquet, found {cardio_trainer_count}."
)
print("Merged Anonymous Data Successfully Written and Read.")

In [10]:
import pandas as pd
# Split the merged frame into features and target. The one-hot encoded target
# is what the tensorflow and pytorch branches train against; the sklearn and
# xgboost branches use the raw label column.
X, y = df.drop('lifestyle',axis=1), df['lifestyle']
y_dummies = pd.get_dummies(y, prefix="lifestyle")

# Each branch registers a flavor-specific model with the platform, creates a
# 90/10 train/test split with a fixed seed, and builds an untrained candidate.
if flavor == "sklearn":
    from sklearn.ensemble import RandomForestClassifier
    model = hyperdrive.get_or_create_model(model_name=f"Customer Segmentation - Sklearn {unique}")
    X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=42, test_size=0.1)
    candidate_model = RandomForestClassifier()
    scorer = candidate_model.score
    fit_params = {"X": X_train, "y": y_train}

elif flavor == "xgboost":
    from xgboost import XGBClassifier
    X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=42, test_size=0.1)
    model = hyperdrive.get_or_create_model(model_name=f"Customer Segmentation - XGBoost {unique}")
    candidate_model = XGBClassifier()
    scorer = candidate_model.score
    fit_params = {"X": X_train, "y": y_train}

elif flavor == "tensorflow":
    import tensorflow as tf
    from tensorflow import keras

    model = hyperdrive.get_or_create_model(model_name=f"Customer Segmentation - Tensorflow {unique}")
    X_train, X_test, y_train, y_test = train_test_split(X.values, y_dummies.values, random_state=42, test_size=0.1)
    # NOTE(review): the input width (4) and output width (3) are hard-coded;
    # this assumes X has four feature columns and y has three classes -- confirm.
    candidate_model = tf.keras.models.Sequential(
            [
                keras.layers.Dense(
                    7, activation="relu", input_shape=(4,)
                ),
                keras.layers.Dense(5, activation="relu"),
                keras.layers.Dense(3, activation="softmax"),
            ]
        )
    candidate_model.compile(
            optimizer="adam",
            loss="categorical_crossentropy",
            metrics=["accuracy"],
        )
    scorer = candidate_model.evaluate
    fit_params = {"x": X_train, "y": y_train, "epochs": 10, "verbose": 0}

elif flavor == "pytorch":
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import Dataset, DataLoader
    model = hyperdrive.get_or_create_model(model_name=f"Customer Segmentation - Pytorch {unique}")
    X_train, X_test, y_train, y_test = train_test_split(X.values, y_dummies.values, random_state=42, test_size=0.1)
    # Width of the first hidden layer; read by create_torch_model below.
    max_depth = 4

    class Data(Dataset):
        """Expose the training arrays as tensors, one (features, target) pair per index."""

        def __init__(self, X_train, y_train):
            self.x=torch.from_numpy(X_train).float()
            self.y=torch.from_numpy(y_train)
            self.len=self.x.shape[0]
        def __getitem__(self,index):
            return self.x[index], self.y[index]
        def __len__(self):
            return self.len

    data_set=Data(X_train, y_train)
    trainloader=DataLoader(dataset=data_set, batch_size=32)

    class create_torch_model(nn.Module):
        """Two Linear -> BatchNorm1d -> ReLU stages followed by a linear output layer.

        Args:
            num_feature: size of the input feature vector.
            num_class: number of output units.
        """

        def __init__(self, num_feature, num_class):
            super().__init__()

            self.layer_1 = nn.Linear(num_feature, max_depth)
            self.layer_2 = nn.Linear(max_depth, 5)
            self.layer_out = nn.Linear(5, num_class)

            self.relu = nn.ReLU()
            self.batchnorm1 = nn.BatchNorm1d(max_depth)
            self.batchnorm2 = nn.BatchNorm1d(5)

        def forward(self, inputs):
            """Return the output-layer activations for a batch of inputs."""
            x = self.layer_1(inputs)
            x = self.batchnorm1(x)
            x = self.relu(x)

            x = self.layer_2(x)
            x = self.batchnorm2(x)
            x = self.relu(x)

            x = self.layer_out(x)

            return x

    # NOTE(review): feature and class counts are hard-coded; assumes X has
    # four feature columns and y has three classes -- confirm.
    candidate_model = create_torch_model(num_feature = 4, num_class=3)

    # NOTE(review): L1 loss against one-hot targets is unusual for a
    # classifier -- confirm this is intended.
    criterion = nn.L1Loss()
    optimizer = optim.Adam(candidate_model.parameters(), lr=0.1)

else:
    # Fail here rather than with a NameError (or stale objects from an earlier
    # run) in later cells that expect `model`, `X_train` and `y_train`.
    raise ValueError(
        f"Unsupported flavor '{flavor}'; expected one of: sklearn, xgboost, tensorflow, pytorch."
    )

In [11]:
# Gather the experiment settings up front: the run-specific name, drift
# monitoring fed with the training split, and the notebooks to attach.
experiment_settings = {
    "experiment_name": f"lifestyle_predictor_RF_{unique}",
    "enable_drift_monitoring": True,
    "train_features": X_train,
    "train_target": y_train,
    "feature_names": list(X.columns),
    "data_exploration_file": "/home/jovyan/regression-tests/00-none-smoke-test-eda.ipynb",
    "data_preparation_file": "/home/jovyan/regression-tests/00-none-smoke-test-preparation.ipynb",
    "model_training_file": "/home/jovyan/regression-tests/00-all-smoke-test.ipynb",
}
experiment = model.get_or_create_experiment(**experiment_settings)