In [None]:
# This integration was created to run the Kaggle diamonds dataset machine learning example (dataset on Kaggle):
# https://www.kaggle.com/datasets/shivam2503/diamonds?resource=download
# This notebook is based on a training session that was run
# by Snowflake SME (subject matter expert) Martin Thorup,
# who was teaching the capabilities of Snowpark.
from snowflake.ml.registry import Registry
# Snowpark for Python
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import DecimalType

import numpy as np
# Override np.float_ with np.float64
# NOTE(review): presumably a compatibility shim for code that still references
# np.float_ (an alias removed in NumPy 2.0) — confirm it is still needed for the
# library versions installed in this environment.
np.float_ = np.float64

# Snowpark ML
import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.metrics.correlation import correlation

# Data science libs
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib

#warning suppression
import warnings; warnings.simplefilter('ignore')





In [None]:
# Pick up the active (current) Snowflake session
from snowflake.snowpark.context import get_active_session
session = get_active_session()

# Attach a query tag to the session
notebook_query_tag = {
    "origin": "sf_sit-is",
    "name": "e2e_ml_snowparkpython",
    "version": {"major": 1, "minor": 0},
}
session.query_tag = notebook_query_tag

# Set the session context (role)
session.use_role("ACCOUNTADMIN")

# Solution prefix taken from the warehouse name. The prefix extraction
# (.strip("_").split("_DS_WH")[0]) is disabled, so this currently holds the
# full warehouse name as returned by the session.
solution_prefix = session.get_current_warehouse()

# Report the current role, warehouse, and database/schema
print(f"Current role: {session.get_current_role()} | Current warehouse: {session.get_current_warehouse()} | DB SCHEMA: {session.sql('select current_database(), current_schema()').collect()}")



In [None]:
# Data Loading
# Reference the DIAMONDS table through the active session; the bare expression
# on the last line makes the DataFrame the cell's displayed output.
diamonds_df = session.table('DIAMONDS')
diamonds_df

In [None]:
#strip double quotes from column names

# Function to strip double quotes from column names
def strip_double_quotes_from_column_names(df):
    """Return ``df`` with every double quote removed from its column names.

    Args:
        df: An object exposing ``columns`` (an iterable of column-name strings)
            and ``to_df(*names)``, which is used to build the renamed result.

    Returns:
        Whatever ``df.to_df`` returns when called with the cleaned names, in the
        same order as ``df.columns``.
    """
    unquoted_names = []
    for name in df.columns:
        # Splitting on the quote character and re-joining drops every occurrence,
        # not only the leading and trailing ones.
        unquoted_names.append("".join(name.split('"')))
    return df.to_df(*unquoted_names)

# Apply the function to the DataFrame. The result is assigned back to
# diamonds_df, so every later cell works with the unquoted column names.
diamonds_df = strip_double_quotes_from_column_names(diamonds_df)

In [None]:
# Normalize the CARAT column

# Scale CARAT with a min-max scaler and write the result to a new CARAT_NORM column
snowml_mms = snowml.MinMaxScaler(input_cols=['CARAT'], output_cols=['CARAT_NORM'])
normalized_diamonds_df = snowml_mms.fit(diamonds_df).transform(diamonds_df)

# Cast CARAT_NORM to a fixed-precision decimal to reduce the number of decimals.
# withColumn does not modify the DataFrame it is called on; it returns a new one,
# so the result has to be assigned back or the cast is silently lost.
new_col = normalized_diamonds_df.col("CARAT_NORM").cast(DecimalType(7,6))
normalized_diamonds_df = normalized_diamonds_df.withColumn("CARAT_NORM", new_col)
normalized_diamonds_df

In [None]:
# Ordinal-encode CUT and CLARITY so that their ordinal importance is preserved.
# Each array lists a column's categories in the order they should be numbered;
# note that 0 is the lowest number in the sequence.
categories = dict(
    CUT=np.array(["Ideal", "Premium", "Very Good", "Good", "Fair"]),
    CLARITY=np.array(["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3"]),
)

snowml_oe = snowml.OrdinalEncoder(
    input_cols=["CUT", "CLARITY"],
    output_cols=["CUT_OE", "CLARITY_OE"],
    categories=categories,
)

# Fit the encoder on the normalized DataFrame and generate the output columns
ord_encoded_diamonds_df = (
    snowml_oe
    .fit(normalized_diamonds_df)
    .transform(normalized_diamonds_df)
)

print("Result Dataframe:\n")
ord_encoded_diamonds_df

In [None]:

# Encode categoricals to numeric columns.
# The one-hot encoder pivots the categories of each input column into individual columns.
ohe_input_cols = ["CUT", "COLOR", "CLARITY"]
ohe_output_cols = ["CUT_OHE", "COLOR_OHE", "CLARITY_OHE"]
snowml_ohe = snowml.OneHotEncoder(input_cols=ohe_input_cols, output_cols=ohe_output_cols)

# NOTE: the pipeline cell further down reassigns transformed_diamonds_df, so this
# one-hot encoded result only feeds the display at the end of this cell.
transformed_diamonds_df = (
    snowml_ohe
    .fit(ord_encoded_diamonds_df)
    .transform(ord_encoded_diamonds_df)
)
transformed_diamonds_df


In [None]:
# Group the feature columns by how they are processed
CATEGORICAL_COLUMNS = ["CUT", "COLOR", "CLARITY"]
# Names of the columns that store the ordinal-encoded values (one per categorical column)
CATEGORICAL_COLUMNS_OE = [f"{column}_OE" for column in CATEGORICAL_COLUMNS]
NUMERICAL_COLUMNS = ["CARAT", "DEPTH", "TABLE_", "X", "Y", "Z"]

# Category order for each categorical column, keyed by column name
_CATEGORY_LEVELS = (
    ("CUT", ["Ideal", "Premium", "Very Good", "Good", "Fair"]),
    ("CLARITY", ["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3"]),
    ("COLOR", ["D", "E", "F", "G", "H", "I", "J"]),
)
categories = {column: np.array(levels) for column, levels in _CATEGORY_LEVELS}


In [None]:
# Build the pipeline.
# The ordinal encoder and the min-max scaler are both declared as steps of a
# single pre-processing pipeline.
ordinal_encoding_step = (
    "OE",
    snowml.OrdinalEncoder(
        input_cols=CATEGORICAL_COLUMNS,
        output_cols=CATEGORICAL_COLUMNS_OE,
        categories=categories,
    ),
)
scaling_step = (
    "MMS",
    snowml.MinMaxScaler(
        clip=True,
        input_cols=NUMERICAL_COLUMNS,
        # Input and output names are the same list; presumably the scaled values
        # overwrite the original columns — TODO confirm.
        output_cols=NUMERICAL_COLUMNS,
    ),
)
preprocessing_pipeline = Pipeline(steps=[ordinal_encoding_step, scaling_step])

PIPELINE_FILE = '/tmp/preprocessing_pipeline.joblib'

# Pickle the pipeline to a local file first.
# NOTE(review): the dump runs before fit() is called below, so the file appears
# to hold the unfitted pipeline definition — confirm that whoever loads the file
# re-fits it.
joblib.dump(preprocessing_pipeline, PIPELINE_FILE)

transformed_diamonds_df = (
    preprocessing_pipeline
    .fit(diamonds_df)
    .transform(diamonds_df)
)
transformed_diamonds_df



In [None]:
# Create the 'models' stage if it does not exist.
# This code is adapted from this notebook:
#https://github.com/sfc-gh-jgriffith/snowpark-end-to-end-ML-with-hyperparameter-tuning/blob/main/02_snowpark_end_to_end_ml.ipynb
#
# "create stage if not exists" is used instead of "create or replace" so that
# re-running this cell keeps an existing stage and the files already uploaded to it.
query = """create stage if not exists models
           directory = (enable = true)
           copy_options = (on_error='skip_file')"""
session.sql(query).collect()

# Get the current database name (surrounding double quotes removed) and store it in a variable.
# NOTE(review): db is not used by the stage path below — confirm whether the path
# was meant to be qualified with the database name.
db=session.get_current_database().strip('"')

# Stage location to save to; nothing is interpolated, so a plain string is enough
file_path = "models"
print(file_path)

# Upload the pickled pipeline from its temporary local location to the models
# stage so that it persists; an existing copy of the file is overwritten.
session.file.put(PIPELINE_FILE, file_path, overwrite=True)



In [None]:
#declare registry
##registry = Registry(session, database_name="DATASCIENCE", schema_name="PUBLIC")
# Log the model in the registry
# Log the model with metadata - make sure the essential metadata is present including 
# sample data
##registry.log_model(model=preprocessing_pipeline, model_name="pre_process_diamond", version_name="v1",comment="My awesome ML model",metrics={"score": 96},sample_input_data=transformed_diamonds_df)

In [None]:
# Compute the correlation of the pipeline-transformed diamonds data; the result
# is the input of the heatmap drawn in the next cell.
corr_diamonds_df = correlation(df=transformed_diamonds_df)
corr_diamonds_df


In [None]:
# Boolean mask covering the upper triangle of the correlation matrix
all_cells = np.ones_like(corr_diamonds_df, dtype=bool)
mask = np.triu(all_cells)

# Draw the feature correlations as a heatmap, with the masked cells left out
plt.figure(figsize=(7, 7))
heatmap = sns.heatmap(
    corr_diamonds_df,
    mask=mask,
    cmap="YlGnBu",
    annot=True,
    vmin=-1,
    vmax=1,
)


In [None]:
# Convert the Snowflake DataFrame to pandas, then count the rows for each
# (PRICE, CARAT) combination
diamonds_pdf = transformed_diamonds_df.to_pandas()
counts = (
    diamonds_pdf
    .groupby(['PRICE', 'CARAT'])
    .size()
    .reset_index(name='Count')
)

# Plotting: one point per (CARAT, PRICE) pair, sized by how many rows share it
fig, ax = plt.subplots(figsize=(10, 6))
scatter = sns.scatterplot(
    data=counts,
    x='CARAT',
    y='PRICE',
    size='Count',
    alpha=0.6,
)

# Customize plot: horizontal grid lines only
ax.grid(axis='y')
#ax.set_xlim([0, 61])

# Move the legend to the upper left and remove the left and bottom spines
scatter.legend(loc='upper left')
sns.despine(left=True, bottom=True)

# Show plot
plt.show()
