In [None]:
# Environment setup for the notebook kernel: TensorFlow 2.14 (CUDA extras),
# Ray Train, and the Snowpark / pandas tooling used by the cells below.
# Earlier, disabled attempt with TensorFlow/Keras 2.12:
#!pip install tensorflow #==2.12 keras==2.12
!pip install tensorflow[and-cuda]==2.14
# NOTE(review): ray[train] is not version-pinned, so a re-run may resolve a different Ray release.
!pip install ray[train]

# NOTE(review): snowflake.ml (feature store, registry, DataConnector) is imported later but
# no snowflake-ml-python install appears here -- presumably preinstalled in the runtime; confirm.
!pip install snowflake-snowpark-python==1.25.0 pandas notebook scikit-learn cachetools pyarrow==10.0.1 python-dotenv

In [None]:
import tensorflow as tf
# Sanity check: show which TensorFlow version the kernel actually imported.
print(tf.__version__)

In [None]:
import ray
from ray import train
from ray.train.tensorflow import TensorflowTrainer
from ray.train import Checkpoint, ScalingConfig
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from snowflake.ml.data.data_connector import DataConnector


# Initialize Snowflake session: reuse the currently active Snowpark session
# instead of building a new connection (no credentials are handled in this notebook).
from snowflake.snowpark.context import get_active_session
session = get_active_session()

In [None]:
-- Bootstrap, executed as ACCOUNTADMIN: make sure the FEATURE_STORE schema
-- exists and grant SYSADMIN the privileges it needs on the database, both
-- schemas and the PENGUINS source table.
USE ROLE accountadmin;

CREATE SCHEMA IF NOT EXISTS tensorflow_data.feature_store;

-- Visibility of the database and its two schemas.
GRANT USAGE ON DATABASE tensorflow_data TO sysadmin;
GRANT USAGE ON SCHEMA tensorflow_data.public TO sysadmin;
GRANT USAGE ON SCHEMA tensorflow_data.feature_store TO sysadmin;

-- Object-creation rights inside the feature-store schema.
GRANT CREATE DYNAMIC TABLE,
      CREATE TAG,
      CREATE STAGE,
      CREATE VIEW
   ON SCHEMA tensorflow_data.feature_store
   TO sysadmin;

-- Read access to the raw data.
GRANT SELECT ON TABLE tensorflow_data.public.PENGUINS TO sysadmin;


In [None]:
-- Give every PENGUINS row a surrogate key in column ID (the feature-store
-- entity below joins on it).
--
-- The cell is written so that it can be re-run safely:
--   * the column is only added when it is missing (a plain ADD COLUMN fails
--     on the second run because the column already exists);
--   * only rows that do not have an id yet are numbered, so ids that were
--     already handed out stay stable across re-runs;
--   * the sequence is referenced by its fully qualified name, so the UPDATE
--     does not depend on the session's current database/schema.
CREATE SEQUENCE IF NOT EXISTS tensorflow_data.public.my_table_seq START = 1 INCREMENT = 1;

ALTER TABLE tensorflow_data.public.PENGUINS ADD COLUMN IF NOT EXISTS id INT;

UPDATE tensorflow_data.public.PENGUINS
   SET id = tensorflow_data.public.my_table_seq.nextval
 WHERE id IS NULL;


In [None]:
from snowflake.ml.feature_store import FeatureStore, CreationMode, Entity, FeatureView

# Feature store client bound to database 'tensorflow_data' under the name
# 'feature_store', using warehouse QUICKSTART_WH by default.
# CREATE_IF_NOT_EXIST lets the same cell serve both the first run and re-runs.
fs = FeatureStore(
    session=session,
    database='tensorflow_data',
    name='feature_store',
    default_warehouse='QUICKSTART_WH',
    creation_mode=CreationMode.CREATE_IF_NOT_EXIST,
)

In [None]:
# Entity that identifies one penguin; feature views are joined to a spine on
# its "id" key.
entity = Entity(name="PENGUIN_ID", join_keys=["id"], desc="Penguin ID")
fs.register_entity(entity)

# Preview the entities currently registered in the feature store
# (list_entities() yields a Snowpark DataFrame, hence .show()).
fs.list_entities().show()

In [None]:
-- List the species labels present in the source table.
-- No SEX filter is applied here (the optional filter on 'male'/'female' stays disabled).
SELECT DISTINCT SPECIES
  FROM TENSORFLOW_DATA.PUBLIC.PENGUINS;

In [None]:
-- Stage used as the storage location of the permanent UDF registered by the
-- penguin_species_to_int cell. CREATE OR REPLACE rebuilds the stage on every run.
CREATE OR REPLACE STAGE TENSORFLOW_DATA.FEATURE_STORE.UDF_STAGE
    DIRECTORY = ( ENABLE = TRUE )
    COMMENT = 'Stage for UDF functions';

In [None]:
from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import IntegerType

@udf(
    return_type=IntegerType(),  # the UDF returns an integer class id
    name="penguin_species_to_int",
    is_permanent=True,
    replace=True,
    stage_location="@TENSORFLOW_DATA.FEATURE_STORE.UDF_STAGE",
)
def penguin_species_to_int(species: str):
    """Translate a penguin species name into its integer class id.

    Returns 0 for "Adelie", 1 for "Gentoo" and 2 for "Chinstrap". Every
    other value, including a NULL (None) input, maps to -1.
    """
    class_ids = {"Adelie": 0, "Gentoo": 1, "Chinstrap": 2}
    # -1 is the sentinel for unknown species and missing values.
    return class_ids.get(species, -1)

In [None]:
from snowflake.snowpark import functions as F

# Source rows: only penguins whose SEX is 'male' or 'female'.
table_name = 'TENSORFLOW_DATA.PUBLIC.PENGUINS'
snowpark_df = session.table(table_name).filter(F.col("SEX").in_('male', 'female'))

# Feature projection. The four measurements are cast to float; flipper length
# is additionally divided by 10 and body mass by 100 so that all four inputs
# are on a similar numeric scale. SPECIES is kept as text (it is one-hot
# encoded in Python by one_hot_encode further down), so the
# PENGUIN_SPECIES_TO_INT UDF is not applied here.
feature_df = snowpark_df.select(
    [
        F.col("ID"),
        F.col("BILL_LENGTH_MM").cast("float").alias("BILL_LENGTH_MM"),
        F.col("BILL_DEPTH_MM").cast("float").alias("BILL_DEPTH_MM"),
        (F.col("FLIPPER_LENGTH_MM").cast("float") / 10).alias("FLIPPER_LENGTH_MM"),
        (F.col("BODY_MASS_G").cast("float") / 100).alias("BODY_MASS_G"),
        F.col("SPECIES"),
    ]
)

feature_df.show()

In [None]:
import snowflake.snowpark.functions as F
from snowflake.snowpark.window import Window


# Spread of (scaled) body mass within each species.
#
# The window is partitioned by SPECIES. The previous window was partitioned by
# ID, but ID is assigned from a sequence (one value per row), so each
# partition held a single row: the sample standard deviation of one value is
# always NULL and the population standard deviation is always 0.
# NOTE(review): per-species spread is the assumed intent -- confirm.
window_spec = Window.partition_by("SPECIES")

body_mass = F.col("BODY_MASS_G").cast("float")

# Sample standard deviation (n - 1 denominator), attached to every row.
result_df = feature_df.with_column(
    "STDDEV_BODY_MASS_G",
    F.stddev(body_mass).over(window_spec),
)
result_df.show()

# Narrow view: just the id and the standard deviation.
final_result_df = result_df.select("ID", "STDDEV_BODY_MASS_G")
final_result_df.show()

# Population standard deviation (n denominator): stddev_pop instead of stddev.
result_pop_df = feature_df.with_column(
    "STDDEV_POP_BODY_MASS_G",
    F.stddev_pop(body_mass).over(window_spec),
)
result_pop_df.show()

final_pop_result_df = result_pop_df.select("ID", "STDDEV_POP_BODY_MASS_G")
final_pop_result_df.show()

In [None]:
# Wrap the feature query in a feature view keyed by the penguin entity ...
pen_fv = FeatureView(name="Penguin_Data", entities=[entity], feature_df=feature_df)

# ... and register it as version "1"; overwrite=True replaces an existing
# registration of that version so the cell can be re-run.
pen_nn_fv = fs.register_feature_view(
    pen_fv,
    version="1",
    overwrite=True,
)

In [None]:
# Spine: the ID of every penguin that passes the same SEX filter used when
# the feature query was built.
spine_df = (
    session.table("TENSORFLOW_DATA.PUBLIC.PENGUINS")
    .filter(F.col("SEX").in_('male', 'female'))
    .select(F.col("ID"))
)

# Join the registered feature view onto the spine and store the result as a
# named dataset.
training_dataset = fs.generate_dataset(
    name="PENGUIN_TRAINING_DATASET",
    spine_df=spine_df,
    features=[pen_nn_fv],
)

In [None]:
# Load the generated training dataset into a local pandas DataFrame.
penguins = DataConnector.from_dataset(training_dataset).to_pandas()

# Convert the pandas DataFrame into a Ray Dataset for the preprocessing and
# training cells that follow.
penguins_ray_ds = ray.data.from_pandas(penguins)

In [None]:
# Numeric input columns, listed in the same order in which one_hot_encode
# packs them into each row's 'features' list.
# NOTE: one_hot_encode spells the column names out itself; it does not read
# these two variables.
features = ["BILL_LENGTH_MM", "BILL_DEPTH_MM", "FLIPPER_LENGTH_MM" , "BODY_MASS_G"]
# Source column holding the species name.
label = 'SPECIES'

def one_hot_encode(row):
    """Add a model-ready ``features`` vector and one-hot ``label`` to a penguin row.

    Args:
        row: Mapping with the numeric columns ``BILL_LENGTH_MM``,
            ``BILL_DEPTH_MM``, ``FLIPPER_LENGTH_MM`` and ``BODY_MASS_G`` and
            the species name in ``SPECIES``. The mapping is updated in place.

    Returns:
        The same ``row`` object, extended with ``features`` (list of four
        floats in the column order given above) and ``label`` (one-hot list
        of ints ordered Adelie, Gentoo, Chinstrap).

    Raises:
        ValueError: If ``SPECIES`` is not one of the three known species.
            Such rows used to be returned without a ``label`` key, which
            only surfaced later as a confusing failure when the ``label``
            column was selected.
    """
    feature_columns = ('BILL_LENGTH_MM', 'BILL_DEPTH_MM', 'FLIPPER_LENGTH_MM', 'BODY_MASS_G')
    # Position in this tuple == position of the 1 in the one-hot label.
    species_order = ('Adelie', 'Gentoo', 'Chinstrap')

    row['features'] = [float(row[column]) for column in feature_columns]

    species = row['SPECIES']
    if species not in species_order:
        raise ValueError(
            f"Unknown penguin species {species!r}; expected one of {list(species_order)}"
        )

    row['label'] = [int(species == known) for known in species_order]
    return row

# Encode every row, then keep only the two array columns the model consumes.
# NOTE: this rebinds penguins_ray_ds, so the cell expects the un-encoded
# dataset produced by the loading cell as its input.
penguins_ray_ds = (
    penguins_ray_ds
    .map(one_hot_encode)
    .select_columns(['features', 'label'])
)

In [None]:
# Ask Ray which resources the cluster advertises and pull out the GPU total;
# a missing "GPU" entry counts as zero GPUs.
known_to_ray_gpus = ray.cluster_resources()
gpu_count = int(known_to_ray_gpus.get("GPU", 0))

print(f"Total GPUS known to Ray: {gpu_count}")

In [None]:
import ray
import pandas as pd
import numpy as np


# Hold out 20% of the encoded rows for evaluation. Rows are shuffled before
# the split, with a fixed seed so the split is repeatable.
train_rs, test_rs = penguins_ray_ds.train_test_split(
    test_size=0.2,
    shuffle=True,
    seed=42,
)

In [None]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Set random seed for reproducibility
tensorflow.random.set_seed(0)

print("Libraries imported.")
print('TensorFlow version:',tensorflow.__version__)

from matplotlib import pyplot as plt

# Loss curves for a Keras History object named `history`, when one exists.
#
# Nothing in this notebook defines `history` (or `num_epochs`) at notebook
# level -- the only `history` is a local variable inside
# train_loop_per_worker -- so on a fresh kernel the unguarded plotting code
# raised NameError and stopped "Run All". The plot is now drawn only when a
# History object is actually in scope; the epoch axis is derived from the
# recorded losses, and the validation curve is optional because it only
# exists when the fit was given validation data.
_history = globals().get("history")
_curves = getattr(_history, "history", None) or {}

if "loss" in _curves:
    training_loss = _curves["loss"]
    epoch_nums = range(1, len(training_loss) + 1)
    plt.plot(epoch_nums, training_loss)
    legend_labels = ['training']

    validation_loss = _curves.get("val_loss")
    if validation_loss is not None:
        plt.plot(epoch_nums, validation_loss)
        legend_labels.append('validation')

    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(legend_labels, loc='upper right')
    plt.show()
else:
    print("No Keras History object named 'history' is defined; skipping the loss plot.")

In [None]:
import os
import tempfile, json
import tensorflow as tf
from tensorflow.keras import optimizers

from ray import train
from ray.train import Checkpoint, ScalingConfig
from ray.train.tensorflow import TensorflowTrainer

# Class names in the same order as the one-hot label vectors built by
# one_hot_encode ([1,0,0] = Adelie, [0,1,0] = Gentoo, [0,0,1] = Chinstrap),
# so an arg-max over the model output indexes directly into this list.
penguin_classes = ['Adelie', 'Gentoo', 'Chinstrap']
# Numeric input columns; len(features) is the input width of the model.
features = ["BILL_LENGTH_MM", "BILL_DEPTH_MM", "FLIPPER_LENGTH_MM" , "BODY_MASS_G"]
# Source label column. This used to say 'SPECIES_INT', a column that is never
# materialised (the UDF call that would create it is disabled); 'SPECIES'
# matches the value used in the preprocessing cell.
label = 'SPECIES'

def build_model():
    """Build the (uncompiled) penguin species classifier.

    The network is a small fully connected stack: two ReLU hidden layers of
    ``hl`` units each, followed by a softmax layer with one unit per entry of
    ``penguin_classes``. The input width is ``len(features)``.

    Returns:
        A Keras ``Sequential`` model. Compiling it is left to the caller.
    """
    hl = 10  # Number of hidden layer nodes

    # Only the first layer needs to be told the input size; every later layer
    # takes its input width from the layer before it, so the `input_dim=hl`
    # arguments that used to be passed to the 2nd and 3rd layers were redundant.
    return Sequential([
        Dense(hl, input_dim=len(features), activation='relu'),
        Dense(hl, activation='relu'),
        Dense(len(penguin_classes), activation='softmax'),
    ])

def train_loop_per_worker(config):
    """Ray Train worker entry point: fit the penguin classifier on this worker's shard.

    Args:
        config: Training configuration dict.
            ``num_epochs`` (required): number of passes over the shard.
            ``learning_rate`` (optional, default 0.001): Adam learning rate.
            ``batch_size`` (optional, default 10): batch size of the
            tf.data pipeline.

    Side effects:
        After every epoch the model is saved as ``model.keras`` next to a
        ``checkpoint.json`` holding the epoch number, and the epoch's loss
        and accuracy are reported to Ray Train together with that checkpoint.
    """
    print(config)
    dataset_shard = train.get_dataset_shard("train")

    # Stable name of the strategy; the `experimental` alias is deprecated.
    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    # Model and optimizer must be created inside the strategy scope.
    with strategy.scope():
        model = build_model()

        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        # that newer Keras optimizers warn about or reject.
        learning_rate = config.get("learning_rate", 0.001)
        opt = optimizers.Adam(learning_rate=learning_rate)

        model.compile(
            loss="categorical_crossentropy",
            optimizer=opt,
            metrics=["accuracy"],
        )

        # summary() prints the table itself and returns None, so wrapping it
        # in print() only added a stray "None" line.
        model.summary()

    tf_dataset = dataset_shard.to_tf(
        feature_columns='features',
        label_columns='label',
        batch_size=config.get("batch_size", 10),
    )

    for epoch in range(config["num_epochs"]):
        # One fit() call == one epoch, so metrics and a checkpoint can be
        # reported after each pass.
        history = model.fit(tf_dataset)

        # Save the model for later loading. The report happens inside the
        # `with` block, i.e. while the temporary directory still exists.
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            model.save(os.path.join(temp_checkpoint_dir, "model.keras"))
            checkpoint_dict = os.path.join(temp_checkpoint_dir, "checkpoint.json")
            with open(checkpoint_dict, "w") as f:
                json.dump({"epoch": epoch}, f)
            checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)

            train.report(
                {
                    "loss": history.history["loss"][0],
                    "accuracy": history.history["accuracy"][0],
                },
                checkpoint=checkpoint,
            )

# One training worker per GPU known to Ray. When Ray reports no GPU,
# `gpu_count` is 0 and asking for zero GPU workers would leave nothing to run
# the training loop, so fall back to a single CPU worker in that case.
num_train_workers = max(1, gpu_count)

trainer = TensorflowTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=num_train_workers, use_gpu=gpu_count > 0),
    datasets={"train": train_rs},
    train_loop_config={"num_epochs": 50},
)

# Blocks until training has finished; `result` carries the reported metrics
# and the checkpoint used by the next cell.
result = trainer.fit()

In [None]:
# Restore the Keras model that the training loop stored as "model.keras"
# inside the checkpoint attached to the training result.
checkpoint_model_path = f"{result.checkpoint.path}/model.keras"
trained_model = tf.keras.models.load_model(checkpoint_model_path)

In [None]:
# Keep NumPy from switching to scientific notation when arrays are printed.
np.set_printoptions(suppress=True)

# Held-out split as a tf.data pipeline built from the 'features' and 'label' columns.
sample_data = test_rs.to_tf(label_columns="label", feature_columns="features")

# Model output for the held-out rows.
res = trained_model.predict(x=sample_data)

# Wrap the predictions in a tf.data.Dataset so that each prediction row can
# be reduced to a class index further down.
dataset = tf.data.Dataset.from_tensor_slices(res)

def find_max_index(array):
    """Return the position of the largest entry of a 1-D tensor."""
    # axis=0 is the axis tf.argmax falls back to when none is given.
    return tf.math.argmax(array, axis=0)
    
# Reduce every prediction row to the index of its largest entry.
dataset = dataset.map(find_max_index)

# Print the class list once, followed by the predicted class name of each
# held-out row.
print(penguin_classes)
for index in dataset:
    predicted_class = penguin_classes[index.numpy()]
    print(predicted_class)

In [None]:
from snowflake.ml.registry import Registry

# Model registry handle for database TENSORFLOW_DATA, schema FEATURE_STORE,
# using the notebook's active session.
reg = Registry(session=session, database_name="TENSORFLOW_DATA", schema_name="FEATURE_STORE")

In [None]:

# One example input row and one example output row; they are only used to
# infer the model signature below.
# The variables are called sample_* because a variable named `input` would
# shadow Python's built-in input() for the rest of the notebook.
sample_input = {'features': [[1., 2., 3., 4.]]}
# The classifier ends in a softmax layer, so it returns float scores per
# class. The example output therefore has to contain floats as well; an
# integer example would make the inferred signature declare an integer output.
sample_output = {'label': [[1., 0., 0.]]}

input_df = pd.DataFrame(sample_input)
output_df = pd.DataFrame(sample_output)

from snowflake.ml.model import model_signature

#df = test_rs.to_pandas()


# Derive the signature of predict() from the example frames and display it.
predict_signature = model_signature.infer_signature(
    input_data=input_df,
    output_data=output_df,
)
print(predict_signature)

# Register the trained Keras model, targeting Snowpark Container Services.
# NOTE(review): 'tf_pengiun_model' looks like a misspelling of "penguin"; it is
# kept as-is because the name is what consumers of the registry look up.
# NOTE(review): the version name is fixed to 'v1', so presumably a second run
# collides with the existing version -- confirm how re-runs should behave.
mv = reg.log_model(
    trained_model,
    model_name='tf_pengiun_model',
    version_name='v1',
    signatures={"predict": predict_signature},
    pip_requirements=["tensorflow[and-cuda]==2.14"],
    target_platforms=['SNOWPARK_CONTAINER_SERVICES'],
)
 

In [None]:
import os
import tempfile
import tensorflow as tf

import ray
from ray import train
from ray.train import Checkpoint, ScalingConfig
from ray.train.tensorflow import TensorflowTrainer

def build_model():
    """Return a toy network: one linear unit that takes a single input value.

    NOTE(review): this re-uses the name of the penguin classifier's
    build_model defined in an earlier cell; once this cell has run, the
    notebook-level name refers to this toy model.
    """
    linear_unit = tf.keras.layers.Dense(1, activation="linear", input_shape=(1,))
    return tf.keras.Sequential([linear_unit])

def train_loop_per_worker(config):
    """Ray Train worker entry point for the toy regression example.

    Args:
        config: Dict with ``num_epochs``, the number of passes over this
            worker's shard of the "train" dataset.

    Side effects:
        After every epoch the model weights are written to a checkpoint that
        is reported to Ray Train (with an empty metrics dict).

    NOTE(review): this re-uses the name of the penguin training loop defined
    in an earlier cell; once this cell has run, the notebook-level name
    refers to this toy loop.
    """
    dataset_shard = train.get_dataset_shard("train")

    # Stable name of the strategy; the `experimental` alias is deprecated.
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        model = build_model()
        model.compile(
            optimizer="Adam", loss="mean_squared_error", metrics=["mse"])

    tf_dataset = dataset_shard.to_tf(
        feature_columns="x",
        label_columns="y",
        batch_size=1
    )
    for epoch in range(config["num_epochs"]):
        model.fit(tf_dataset)

        # Create checkpoint. A TemporaryDirectory is used instead of
        # tempfile.mkdtemp() so that the per-epoch directory is removed again;
        # mkdtemp() left one directory behind for every epoch. The report is
        # issued inside the `with` block, while the directory still exists.
        with tempfile.TemporaryDirectory() as checkpoint_dir:
            model.save_weights(
                os.path.join(checkpoint_dir, "my_checkpoint")
            )
            checkpoint = Checkpoint.from_directory(checkpoint_dir)

            train.report(
                {},
                checkpoint=checkpoint,
            )

# Toy regression data: 32 rows with y = x + 1.
toy_rows = [{"x": value, "y": value + 1} for value in range(32)]
train_dataset = ray.data.from_items(toy_rows)

print(train_dataset)

# Trainer for the toy example: three GPU workers, four epochs.
# NOTE(review): this rebinds `trainer`, which an earlier cell uses for the
# penguin model. The fit call below stays disabled.
trainer = TensorflowTrainer(
    train_loop_per_worker=train_loop_per_worker,
    datasets={"train": train_dataset},
    train_loop_config={"num_epochs": 4},
    scaling_config=ScalingConfig(num_workers=3, use_gpu=True),
)
#result = trainer.fit()

In [None]:
-- Show the locator of the account this session is connected to.
-- NOTE(review): confirm that CURRENT_ACCOUNT_LOCATOR() is available in this
-- account; CURRENT_ACCOUNT() is presumably the equivalent context function.
SELECT CURRENT_ACCOUNT_LOCATOR();