In [None]:
# This integration was created to run the Kaggle diamonds dataset machine learning example (dataset available on Kaggle):
# https://www.kaggle.com/datasets/shivam2503/diamonds?resource=download
# This notebook is based on a training session that was run
# by Snowflake SME (subject matter expert) Martin Thorup,
# who was teaching the capabilities of Snowpark.

# Snowpark for Python
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import DecimalType

import numpy as np
# Override np.float_ with np.float64: (re)binds the `float_` attribute on the
# numpy module so that `np.float_` resolves to `np.float64`.
# NOTE(review): presumably a compatibility shim for dependencies that still
# reference `np.float_`, an alias NumPy 2.0 removed - confirm it is still needed.
np.float_ = np.float64

# Snowpark ML
import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.metrics.correlation import correlation

# Data science libs
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib

# Warning suppression: installs a blanket 'ignore' filter, so Python warnings
# of every category are silenced from this point on.
# NOTE(review): this also hides deprecation warnings from the libraries above;
# consider narrowing the filter to specific categories or modules.
import warnings; warnings.simplefilter('ignore')





In [None]:
# Attach to the Snowflake session that is already active for this notebook
from snowflake.snowpark.context import get_active_session
session = get_active_session()

# Set the query tag carried by this session
session.query_tag = {
    "origin": "sf_sit-is",
    "name": "e2e_ml_snowparkpython",
    "version": {"major": 1, "minor": 0},
}

# Session context: switch the session to the ACCOUNTADMIN role
session.use_role("ACCOUNTADMIN")

# The solution prefix is currently the full name of the current warehouse
# (the `.strip("_").split("_DS_WH")[0]` trimming of that name is disabled)
solution_prefix = session.get_current_warehouse()

# Report the current role, warehouse, and database/schema in a single line
print(
    f"Current role: {session.get_current_role()}"
    f" | Current warehouse: {session.get_current_warehouse()}"
    f" | DB SCHEMA: {session.sql('select current_database(), current_schema()').collect()}"
)



In [None]:
# Data loading: reference the DIAMONDS table through the active session
diamonds_df = session.table("DIAMONDS")

# Display the DataFrame as the cell output
diamonds_df

In [None]:
# Normalize the CARAT column
# A MinMaxScaler is fitted on CARAT and its scaled values are written to a new
# CARAT_NORM column; the input DataFrame `diamonds_df` itself is left untouched.
snowml_mms = snowml.MinMaxScaler(input_cols=['CARAT'], output_cols=['CARAT_NORM'])
normalized_diamonds_df = snowml_mms.fit(diamonds_df).transform(diamonds_df)

# Reduce CARAT_NORM to a fixed-precision decimal (7 digits, 6 after the point).
# Snowpark DataFrames are immutable: with_column returns a NEW DataFrame, so the
# result must be assigned back - otherwise the cast is silently discarded.
new_col = normalized_diamonds_df.col("CARAT_NORM").cast(DecimalType(7, 6))
normalized_diamonds_df = normalized_diamonds_df.with_column("CARAT_NORM", new_col)

# Display the normalized DataFrame as the cell output
normalized_diamonds_df

In [None]:
# Ordinal-encode CUT and CLARITY so that their ordinal importance is preserved.
# Each column maps to its categories listed in the order they should be
# numbered; numbering starts at 0, so the first entry is the lowest number.
categories = {
    "CUT": np.array(["Ideal", "Premium", "Very Good", "Good", "Fair"]),
    "CLARITY": np.array(
        ["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3"]
    ),
}

# Input columns are the mapping's keys (CUT, CLARITY); every encoded column
# gets the same name with an "_OE" suffix (CUT_OE, CLARITY_OE).
snowml_oe = snowml.OrdinalEncoder(
    input_cols=list(categories),
    output_cols=[f"{column}_OE" for column in categories],
    categories=categories,
)

# Fit the encoder on the normalized DataFrame, then add the encoded columns
fitted_oe = snowml_oe.fit(normalized_diamonds_df)
ord_encoded_diamonds_df = fitted_oe.transform(normalized_diamonds_df)

print("Result Dataframe:\n")
ord_encoded_diamonds_df