In [1]:
import seaborn as sns
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F

In [2]:
# Build the Snowpark session from whatever connection parameters
# SnowflakeLoginOptions() returns, so no credentials appear in the notebook.
login_options = SnowflakeLoginOptions()
session = Session.builder.configs(login_options).getOrCreate()

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


In [9]:
# "if not exists" keeps this cell safe to re-run: a second execution reports
# that the stage already exists instead of failing.
create_stage_sql = "CREATE STAGE if not exists ML_DATA"
session.sql(create_stage_sql).collect()

[Row(status='ML_DATA already exists, statement succeeded.')]

In [4]:
# Fetch seaborn's "titanic" sample dataset, upper-case every column label,
# and write the result to a local CSV without the pandas index column.
titanic = sns.load_dataset("titanic").rename(columns=str.upper)
titanic.to_csv("titanic.csv", index=False)

In [5]:
# Upload the local CSV to the stage; overwrite=True replaces any copy left
# there by an earlier run of this notebook.
local_csv_path = "titanic.csv"
stage_location = "@ml_data"
session.file.put(local_csv_path, stage_location, overwrite=True)

[PutResult(source='titanic.csv', target='titanic.csv.gz', source_size=57018, target_size=6528, source_compression='NONE', target_compression='GZIP', status='UPLOADED', message='')]

In [6]:
# Configure a reader for the staged CSV, then bind it to the file.
# The two options request schema inference and header parsing, so the
# column names and types do not have to be declared by hand.
csv_reader = session.read.option("infer_schema", True).option("PARSE_HEADER", True)
titanic_df = csv_reader.csv("@ml_data/titanic.csv")

# Preview the first rows to confirm the load worked.
titanic_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SURVIVED"  |"PCLASS"  |"SEX"   |"AGE"  |"SIBSP"  |"PARCH"  |"FARE"   |"EMBARKED"  |"CLASS"  |"WHO"  |"ADULT_MALE"  |"DECK"  |"EMBARK_TOWN"  |"ALIVE"  |"ALONE"  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0           |3         |male    |22.00  |1        |0        |7.2500   |S           |Third    |man    |True          |NULL    |Southampton    |False    |False    |
|1           |1         |female  |38.00  |1        |0        |71.2833  |C           |First    |woman  |False         |C       |Cherbourg      |True     |False    |
|1           |3         |female  |26.00  |0        |0        |7.9250   |S           |Third    |woman  |False         |NULL    |Southampton    |True     |True     |
|1           |1 

In [7]:
def fix_values(columnn):
    """Build an expression that normalizes the text values of one column.

    Every run of characters other than ASCII letters and digits is replaced
    by a single underscore, and the result is then upper-cased.

    Args:
        columnn: Name of the column to normalize, passed to ``F.col``.

    Returns:
        The expression produced by ``F.upper`` over the cleaned column.
    """
    source_column = F.col(columnn)
    # The trailing "+" collapses a whole run of disallowed characters into
    # one "_" rather than emitting one underscore per character.
    sanitized = F.regexp_replace(source_column, '[^a-zA-Z0-9]+', '_')
    return F.upper(sanitized)

# Text columns whose values are rewritten by fix_values.
text_columns = ("SEX", "EMBARKED", "CLASS", "WHO", "EMBARK_TOWN")

for column_name in text_columns:
    # Reusing the existing column name replaces that column in the frame.
    normalized_values = fix_values(column_name)
    titanic_df = titanic_df.with_column(column_name, normalized_values)

# Preview the frame after normalization.
titanic_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SURVIVED"  |"PCLASS"  |"AGE"  |"SIBSP"  |"PARCH"  |"FARE"   |"ADULT_MALE"  |"DECK"  |"ALIVE"  |"ALONE"  |"SEX"   |"EMBARKED"  |"CLASS"  |"WHO"  |"EMBARK_TOWN"  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0           |3         |22.00  |1        |0        |7.2500   |True          |NULL    |False    |False    |MALE    |S           |THIRD    |MAN    |SOUTHAMPTON    |
|1           |1         |38.00  |1        |0        |71.2833  |False         |C       |True     |False    |FEMALE  |C           |FIRST    |WOMAN  |CHERBOURG      |
|1           |3         |26.00  |0        |0        |7.9250   |False         |NULL    |True     |True     |FEMALE  |S           |THIRD    |WOMAN  |SOUTHAMPTON    |
|1           |1 

In [8]:
# Persist the cleaned frame as the "titanic" table; "overwrite" mode means
# re-running the notebook replaces the table contents.
table_writer = titanic_df.write.mode("overwrite")
table_writer.save_as_table("titanic")