# 01 - Load Demo Data into Snowflake

In [1]:
from snowflake.snowpark.session import Session
from snowflake.snowpark import version
import json
# Print the version of the installed Snowpark library, so the notebook output
# records which release the rest of the cells were run against.
print(version.VERSION)

(0, 7, 0)


In [2]:
# Read the Snowflake connection properties from the local session.json file.
# The context manager closes the file handle as soon as parsing is done,
# instead of leaving it open until garbage collection.
with open('session.json') as session_file:
    snowflake_session_prop = json.load(session_file)

In [3]:
# Open a Snowpark session using the connection properties in snowflake_session_prop.
session = Session.builder.configs(snowflake_session_prop).create()

# session.sql() only builds a lazy DataFrame; the statement is sent to
# Snowflake when an action such as .collect() is called. Every context-setting
# statement below therefore ends with .collect().
session.sql("use role accountadmin").collect()
session.sql("create database if not exists  {}".format(snowflake_session_prop['database'])).collect()
session.sql("use database {}".format(snowflake_session_prop['database'])).collect()
session.sql("create schema if not exists {}".format(snowflake_session_prop['schema'])).collect()
session.sql("use schema {}".format(snowflake_session_prop['schema'])).collect()

# Warehouse creation is left disabled; the warehouse named in the session
# properties is expected to exist already.
# session.sql("create or replace warehouse {} with \
#                 WAREHOUSE_SIZE = XSMALL \
#                 AUTO_SUSPEND = 120 \
#                 AUTO_RESUME = TRUE".format(snowflake_session_prop['warehouse'])).collect()

# Fixed: this statement previously lacked .collect() and so was never executed.
session.sql("use warehouse {}".format(snowflake_session_prop['warehouse'])).collect()

# Confirm the warehouse / database / schema the session is now using.
print(session.sql('select current_warehouse(), current_database(), current_schema()').collect())

[Row(CURRENT_WAREHOUSE()='COMPUTE_WH', CURRENT_DATABASE()='SNOWPARK_OKERA_QUICKSTART', CURRENT_SCHEMA()='MEDICAL')]


In [4]:
# Name of the Snowflake stage that receives the upload
stagename = 'MEDICAL_COSTS_STAGE'
# Local CSV file holding the demo data
filename = 'costs-per-customer.csv'

In [5]:
# Create (or replace) the COSTS table that the CSV rows are copied into.
# The statement is assembled from adjacent string literals, one column per
# line; the resulting SQL text is the same as a single long string.
session.sql(
    "create or replace table COSTS ( "
    "        id integer, "
    "        age integer, "
    "        sex string, "
    "        bmi number (10,1), "
    "        children integer, "
    "        smoker string, "
    "        region string, "
    "        costs number (10,2), "
    "        phone string, "
    "        recordnum int, "
    "        optout integer);"
).collect()

[Row(status='Table COSTS successfully created.')]

In [6]:
# Define the named CSV file format (first line of each file is skipped as a
# header); the stage and the copy statement refer to it as mycsvformat.
session.sql(
    "create or replace file format mycsvformat type = 'CSV' skip_header = 1;"
).collect()

[Row(status='File format MYCSVFORMAT successfully created.')]

In [7]:
# (Re)create the stage, bound to the mycsvformat file format, with its
# directory table enabled
session.sql(
    "create or replace stage {} file_format = mycsvformat DIRECTORY = (ENABLE = TRUE);".format(stagename)
).collect()

# Upload the local data file into that stage; the put result is the cell output
session.file.put(filename, stagename)

[PutResult(source='costs-per-customer.csv', target='costs-per-customer.csv.gz', source_size=76151, target_size=29696, source_compression='NONE', target_compression='GZIP', status='UPLOADED', message='')]

In [8]:
# Show what is currently in the stage to confirm the upload
session.sql("list @{};".format(stagename)).collect()

[Row(name='medical_costs_stage/costs-per-customer.csv.gz', size=29696, md5='254b11cc6657eaf34828830be740133e', last_modified='Mon, 4 Jul 2022 22:14:25 GMT')]

In [9]:
# Load the staged, gzip-compressed file into COSTS using the named CSV format.
# on_error = 'skip_file' is passed through to COPY INTO unchanged.
session.sql(
    f"copy into COSTS from @{stagename}/{filename}.gz "
    "file_format = (format_name = mycsvformat) "
    "on_error = 'skip_file';"
).collect()

[Row(file='medical_costs_stage/costs-per-customer.csv.gz', status='LOADED', rows_parsed=1338, rows_loaded=1338, error_limit=1, errors_seen=0, first_error=None, first_error_line=None, first_error_character=None, first_error_column_name=None)]

# 02 - Lightweight Feature Engineering

In [10]:
from snowflake.snowpark.functions import col,array_construct

import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder

# misc
import json

In [11]:
# Keep only the rows of COSTS whose OPTOUT flag is not 1, i.e. drop customers
# who have chosen to opt out.
# NOTE(review): rows with a NULL OPTOUT are presumably excluded as well under
# SQL comparison semantics — confirm that is intended.
snow_df_costs = (
    session.table('COSTS')
    .filter(col('OPTOUT') != 1)
)

# Report how many rows remain, then preview them
print('Row count after filtering opt-out     : ', snow_df_costs.count())
snow_df_costs.show()

Row count after filtering opt-out     :  1256
--------------------------------------------------------------------------------------------------------------------
|"ID"  |"AGE"  |"SEX"   |"BMI"  |"CHILDREN"  |"SMOKER"  |"REGION"  |"COSTS"   |"PHONE"    |"RECORDNUM"  |"OPTOUT"  |
--------------------------------------------------------------------------------------------------------------------
|0     |19     |female  |27.9   |0           |yes       |EMEA      |16884.92  |447686636  |8681199      |0         |
|1     |18     |male    |33.8   |1           |no        |EMEA      |1725.55   |437519962  |4379756      |0         |
|2     |28     |male    |33.0   |3           |no        |EMEA      |4449.46   |381988151  |9786757      |0         |
|3     |33     |male    |22.7   |0           |no        |NA        |21984.47  |556619672  |8164839      |0         |
|4     |32     |male    |28.9   |0           |no        |NA        |3866.86   |497874796  |7341375      |0         |
|5     |31     |fe

In [12]:
# Two clean-up steps, chained in this order:
#   1. discard rows that contain missing values, if any
#   2. remove the columns the model does not need
snow_df_costs = (
    snow_df_costs
    .dropna()
    .drop(['OPTOUT', 'ID', 'RECORDNUM','PHONE'])
)

# Row count and preview of the intermediate dataframe
print('Intermediate dataframe                : ', snow_df_costs.count())
snow_df_costs.show()

Intermediate dataframe                :  1256
------------------------------------------------------------------------
|"AGE"  |"SEX"   |"BMI"  |"CHILDREN"  |"SMOKER"  |"REGION"  |"COSTS"   |
------------------------------------------------------------------------
|19     |female  |27.9   |0           |yes       |EMEA      |16884.92  |
|18     |male    |33.8   |1           |no        |EMEA      |1725.55   |
|28     |male    |33.0   |3           |no        |EMEA      |4449.46   |
|33     |male    |22.7   |0           |no        |NA        |21984.47  |
|32     |male    |28.9   |0           |no        |NA        |3866.86   |
|31     |female  |25.7   |0           |no        |EMEA      |3756.62   |
|46     |female  |33.4   |1           |no        |EMEA      |8240.59   |
|37     |female  |27.7   |3           |no        |NA        |7281.51   |
|37     |male    |29.8   |2           |no        |NA        |6406.41   |
|60     |female  |25.8   |0           |no        |NA        |28923.14  |
-----

In [13]:
# Load the Snowpark dataframe into Pandas for further processing
pd_df = snow_df_costs.to_pandas()

# Use sklearn.preprocessing.LabelEncoder to convert categorical data to numbers
# Example: transforms no | yes to 0 | 1
#
# Each categorical column gets its own fitted encoder, kept in
# `label_encoders`, so the category -> integer mapping of every column stays
# available afterwards (e.g. label_encoders['SEX'].classes_ or
# .inverse_transform). Refitting one shared encoder would leave only the
# mapping of the last column behind.
label_encoders = {}
for column_name in ('SEX', 'SMOKER', 'REGION'):
    le = LabelEncoder()
    pd_df[column_name] = le.fit_transform(pd_df[column_name])
    label_encoders[column_name] = le
# After the loop `le` is the encoder fitted on REGION, the same state the
# name had when a single encoder was reused for all three columns.

# Show final dataframe
print('Dataframe shape -   rows, columns     : ', pd_df.shape)
pd_df.head()

Dataframe shape -   rows, columns     :  (1256, 7)


Unnamed: 0,AGE,SEX,BMI,CHILDREN,SMOKER,REGION,COSTS
0,19,0,27.9,0,1,0,16884.92
1,18,1,33.8,1,0,0,1725.55
2,28,1,33.0,3,0,0,4449.46
3,33,1,22.7,0,0,1,21984.47
4,32,1,28.9,0,0,1,3866.86


In [14]:
# Convert Pandas dataframe to Snowpark dataframe.
# pd_df is already a pandas DataFrame (it is the result of to_pandas()), so it
# is passed directly instead of being re-wrapped in pd.DataFrame(...).
snow_df_costs = session.create_dataframe(pd_df)

In [15]:
# Persist the prepared training data as the COSTS_TRAINING table,
# using the writer's "overwrite" save mode
(
    snow_df_costs.write
    .mode("overwrite")
    .save_as_table("COSTS_TRAINING")
)