In [None]:
#Snowpark for Python
from snowflake.snowpark.version import VERSION
from snowflake.snowpark.functions import udf
import snowflake.snowpark.functions as F

import numpy as np
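# NumPy 2.0 removed the `np.float_` alias; re-create it as np.float64 (presumably for libraries imported below that still reference it)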
np.float_ = np.float64

from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.registry import Registry
from snowflake.ml._internal.utils import identifier

# data science libs
import pandas as pd  
# need to add numpy code from previous notebook to handle int64 issue for numpy
import matplotlib.pyplot as plt 
import seaborn as sns 

from snowflake.ml.modeling.metrics import mean_absolute_percentage_error 

# other libs 
import json
import joblib 
import cachetools 

# warning suppression 
import warnings; warnings.simplefilter('ignore')


In [None]:
# Get the active session (the Snowflake session this notebook is running in)
from snowflake.snowpark.context import get_active_session
session = get_active_session()

# Tag the session's queries so they can be attributed later.
# `Session.query_tag` is a string property, so the tag is serialized to JSON
# explicitly rather than relying on the dict's Python repr (single-quoted and
# therefore not parseable as JSON).
session.query_tag = json.dumps(
    {"origin": "sf_sit-is", "name": "e2e_ml_snowparkpython", "version": {"major": 1, "minor": 0}}
)

# Set session context
# NOTE(review): ACCOUNTADMIN is a highly privileged role - confirm whether a
# narrower role is sufficient for this notebook.
session.use_role("ACCOUNTADMIN")

# The solution prefix is taken from the current warehouse name
# (the earlier suffix-stripping step is disabled, so this is the full name).
solution_prefix = session.get_current_warehouse()
#.strip("_").split("_DS_WH")[0]

# Report the current role, warehouse, and database/schema
print(f"Current role: {session.get_current_role()} | Current warehouse: {session.get_current_warehouse()} | DB SCHEMA: {session.sql('select current_database(), current_schema()').collect()}")

In [None]:
# Load the source table.
# `session.table` hands back a Snowpark DataFrame (not a pandas one).
SOURCE_TABLE = 'DIAMONDS'
diamonds_df = session.table(SOURCE_TABLE)

# Last expression of the cell: display the frame
diamonds_df

In [None]:
# Helper: remove double quotes from a DataFrame's column names
def strip_double_quotes_from_column_names(df):
    """Return ``df`` re-labelled with every double-quote character removed from its column names.

    Args:
        df: Object exposing ``columns`` (an iterable of strings) and
            ``to_df(*names)``; in this notebook, a Snowpark DataFrame.

    Returns:
        The result of ``df.to_df`` called with the cleaned names, in the
        same order as ``df.columns``.
    """
    unquoted_names = []
    for column_name in df.columns:
        unquoted_names.append(column_name.replace('"', ''))
    return df.to_df(*unquoted_names)

# Apply the function to the DataFrame; the re-labelled frame replaces `diamonds_df`.
# The helper only removes '"' characters, so names without quotes pass through unchanged.
diamonds_df = strip_double_quotes_from_column_names(diamonds_df)


In [None]:
# Feature groups used for processing
CATEGORICAL_COLUMNS = ["CUT", "COLOR", "CLARITY"]
# Column names that store the ordinal-encoded version of each categorical column
CATEGORICAL_COLUMNS_OE = [f"{name}_OE" for name in CATEGORICAL_COLUMNS]
NUMERICAL_COLUMNS = ["CARAT", "DEPTH", "X", "Y", "Z"]

# Label column and the column that receives the model's predictions
LABEL_COLUMNS = ["PRICE"]
OUTPUT_COLUMNS = ["PREDICTED_PRICE"]


In [None]:
# Load the preprocessing model that already exists in the model registry
# model_registry = Registry(session, database_name="DATASCIENCE", schema_name="PUBLIC")
# preprocessing_pipeline = model_registry.get_model('pre_process_diamond')


In [None]:
# Download the serialized preprocessing pipeline from the `models` stage in the
# PUBLIC schema of the session's current database, then deserialize it.

# Local path the downloaded file is loaded from
PIPELINE_FILE = "/tmp/preprocessing_pipeline.joblib.gz"

# Build the stage path from whatever database the session is currently using
db = session.get_current_database()
file_path = f"@{db}.PUBLIC.models/preprocessing_pipeline.joblib.gz"

# Copy the staged file into /tmp; the load below expects it at PIPELINE_FILE
session.file.get(file_path, '/tmp')

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load artifacts from a stage you trust.
preprocessing_pipeline = joblib.load(PIPELINE_FILE)


In [None]:

# Split the data with 0.9 / 0.1 weights (train / test); a fixed seed is passed to the split
diamonds_train_df, diamonds_test_df = diamonds_df.random_split(weights=[0.9, 0.1], seed=0)

# Fit the preprocessing pipeline on the training split, then transform that
# split with the object returned by `fit`.
fitted_pipeline = preprocessing_pipeline.fit(diamonds_train_df)
train_df = fitted_pipeline.transform(diamonds_train_df)

# The test split is only transformed, never fitted on.
# NOTE(review): this relies on `fit` having updated `preprocessing_pipeline` in place - confirm.
test_df = preprocessing_pipeline.transform(diamonds_test_df)



In [None]:
# Model inputs: the ordinal-encoded categorical columns followed by the numeric columns
model_input_cols = CATEGORICAL_COLUMNS_OE + NUMERICAL_COLUMNS

# Create the model - an XGBoost regressor from the snowflake.ml modeling API
regressor = XGBRegressor(
    input_cols=model_input_cols,
    label_cols=LABEL_COLUMNS,
    output_cols=OUTPUT_COLUMNS,
)

# Train on the preprocessed training split
regressor.fit(train_df)

# Predict on the preprocessed test split; `test_df` is the Snowpark frame produced
# by the pipeline transform, and the prediction output is kept in `result`
result = regressor.predict(test_df)


In [None]:
# Run the same model again, this time on pandas input:
# select the feature columns from the test split and convert them to pandas first
test_features_pd = test_df[CATEGORICAL_COLUMNS_OE+NUMERICAL_COLUMNS].to_pandas()

# Last expression of the cell: display the predictions
regressor.predict(test_features_pd)
