Connect to Snowflake

In [1]:
from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

# Resolve the login options for "test_conn" first, then build the Snowpark
# session from them (presumably "test_conn" is a named connection profile —
# confirm against the local Snowflake client configuration).
login_options = SnowflakeLoginOptions("test_conn")
session = Session.builder.configs(login_options).create()

# Stage location passed as `target_stage_loc` to FileSet.make and used by the
# REMOVE clean-up statements in the cells below.
STAGE = "@TEST.PUBLIC.ENC_STAGE"

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


Create a FileSet from a DataFrame

In [2]:
from snowflake.ml.fileset import fileset

# Remove anything already stored under the "housing1" path on the stage
# (presumably so that re-running this cell does not collide with files left
# behind by an earlier run — confirm against FileSet.make's behavior).
housing1_cleanup_sql = f"REMOVE {STAGE}/housing1"
session.sql(housing1_cleanup_sql).collect()

# Materialize the HOUSING table into the stage as a FileSet named "housing1".
housing_df = session.table('TEST.PUBLIC.HOUSING')
fs1 = fileset.FileSet.make(
    target_stage_loc=STAGE,
    name="housing1",
    snowpark_dataframe=housing_df,
    shuffle=True,
)

# Last expression of the cell: display the files that make up the FileSet.
fs1.files()



['sfc://@TEST.PUBLIC.ENC_STAGE/housing1/data_01b3efd4-0002-5aa8-005b-3f070006b7e2_013_3_0.snappy.parquet']

Create a FileSet from a query result

In [3]:
# Remove anything already stored under the "housing2" path on the stage
# before creating the FileSet of the same name.
session.sql(
    f"REMOVE {STAGE}/housing2"
).collect()

# This variant builds the FileSet from a SQL query executed over the
# session's underlying connection instead of from a Snowpark DataFrame.
housing_query = "select * from TEST.PUBLIC.HOUSING"
fs2 = fileset.FileSet.make(
    target_stage_loc=STAGE,
    name="housing2",
    sf_connection=session.connection,
    query=housing_query,
    shuffle=True,
)

# Last expression of the cell: display the files that make up the FileSet.
fs2.files()

['sfc://@TEST.PUBLIC.ENC_STAGE/housing2/data_01b3efd4-0002-5b19-005b-3f070006887a_013_4_0.snappy.parquet']

Feed first FileSet to TensorFlow

In [4]:
# Expose the first FileSet as a TensorFlow dataset of shuffled batches of 4,
# dropping a trailing batch that would be smaller than 4.
ds1 = fs1.to_tf_dataset(batch_size=4, shuffle=True, drop_last_batch=True)

# Preview only the first batch; in the recorded output it is a dict mapping
# each column name to a tensor of shape (4,).
for batch in ds1.take(1):
    print(batch)



{'LONGITUDE': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([-121.96, -118.29, -122.2 , -118.27], dtype=float32)>, 'LATITUDE': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([37.27, 33.91, 37.76, 34.25], dtype=float32)>, 'HOUSING_MEDIAN_AGE': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([31., 41., 37., 37.], dtype=float32)>, 'TOTAL_ROOMS': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([3347., 2475., 2680., 2489.], dtype=float32)>, 'TOTAL_BEDROOMS': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([589., 532., 736., 454.], dtype=float32)>, 'POPULATION': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([1566., 1416., 1925., 1215.], dtype=float32)>, 'HOUSEHOLDS': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([597., 470., 667., 431.], dtype=float32)>, 'MEDIAN_INCOME': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([5.5151, 3.8372, 1.4097, 5.0234], dtype=float32)>, 'MEDIAN_HOUSE_VALUE': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([286800., 156400.,  8

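Feed second FileSet to PyTorch

Note: this step needs the `torchdata` package in the kernel's environment; without it the cell fails with the `ImportError` shown in the output below.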

In [5]:
from torch.utils.data import DataLoader

# Expose the second FileSet as a PyTorch datapipe of shuffled batches of 4,
# dropping a trailing batch that would be smaller than 4.
ds2 = fs2.to_torch_datapipe(batch_size=4, shuffle=True, drop_last_batch=True)

# batch_size=None: the loader passes through the datapipe's items as-is
# rather than batching them a second time.
loader = DataLoader(ds2, batch_size=None, num_workers=0)

# Preview only the first batch; an empty loader prints nothing.
try:
    batch = next(iter(loader))
except StopIteration:
    pass
else:
    print(batch)



ImportError: (0000) Unable to import torchdata.datapipes.iter.IterableWrapper.