Connect to Snowflake with Snowpark for Python, using connection parameters read from the SnowSQL config file

In [13]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-scikit-learn/blob/main/2_data_exploration_transformation.ipynb
import os, sys, configparser
import numpy as np
import snowflake.snowpark.functions as F
from snowflake.snowpark import Session

# Read the connection settings from the local SnowSQL config file so that no
# credentials are hardcoded in the notebook.
config_path = os.path.join(os.path.expanduser('~'), ".snowsql/config")
parser = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail early with a clear message instead of
# hitting a less obvious NoSectionError further down.
if not parser.read(config_path):
    raise FileNotFoundError(f"SnowSQL config file not found or unreadable: {config_path}")

# Named connection section of the config file to take the settings from.
section = "connections.test_conn"
# Map Snowpark connection parameter names to the option names used in the config file.
pars = {
    "account": parser.get(section, "accountname"),
    "user": parser.get(section, "username"),
    "password": parser.get(section, "password"),
    "database": parser.get(section, "database"),
    "schema": parser.get(section, "schema")}

session = Session.builder.configs(pars).create()
# Sanity check: print the warehouse, database and schema the session is using.
print(session.sql('select current_warehouse(), current_database(), current_schema()').collect())

[Row(CURRENT_WAREHOUSE()='COMPUTE_WH', CURRENT_DATABASE()='TEST', CURRENT_SCHEMA()='PUBLIC')]


Create a DataFrame for the HOUSING table (no table data is loaded into local memory)

In [14]:
# Create a Snowpark DataFrame that refers to the HOUSING table.
df = session.table('HOUSING')

# Size reported by sys.getsizeof for the DataFrame object, converted from bytes to MB.
bytes_per_mb = 1024.0**2
size = np.round(sys.getsizeof(df) / bytes_per_mb, 2)
print(f'Memory: {size} MB')

# Print a preview of the rows.
df.show()
# Last expression of the cell: displays the SQL generated for this DataFrame.
df.queries

Memory: 0.0 MB
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"LONGITUDE"  |"LATITUDE"  |"HOUSING_MEDIAN_AGE"  |"TOTAL_ROOMS"  |"TOTAL_BEDROOMS"  |"POPULATION"  |"HOUSEHOLDS"  |"MEDIAN_INCOME"  |"MEDIAN_HOUSE_VALUE"  |"OCEAN_PROXIMITY"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|-122.23      |37.88       |41.0                  |880.0          |129.0             |322.0         |126.0         |8.3252           |452600.0              |NEAR BAY           |
|-122.22      |37.86       |21.0                  |7099.0         |1106.0            |2401.0        |1138.0        |8.3014           |358500.0              |NEAR BAY           |
|-122.24      |37.85       |52.0                  |1467.0         |190.0             |496.0    

{'queries': ['SELECT  *  FROM (HOUSING)'], 'post_actions': []}

Add a calculated column and select a subset of columns

In [15]:
# BEDROOM_RATIO = TOTAL_BEDROOMS / TOTAL_ROOMS, then keep only the listed columns.
bedroom_ratio = F.col('TOTAL_BEDROOMS') / F.col('TOTAL_ROOMS')
df = (
    df.with_column('BEDROOM_RATIO', bedroom_ratio)
    .select(
        'HOUSING_MEDIAN_AGE',
        'TOTAL_ROOMS',
        'TOTAL_BEDROOMS',
        'HOUSEHOLDS',
        'OCEAN_PROXIMITY',
        'BEDROOM_RATIO',
    )
)
df.show()

----------------------------------------------------------------------------------------------------------------
|"HOUSING_MEDIAN_AGE"  |"TOTAL_ROOMS"  |"TOTAL_BEDROOMS"  |"HOUSEHOLDS"  |"OCEAN_PROXIMITY"  |"BEDROOM_RATIO"  |
----------------------------------------------------------------------------------------------------------------
|41.0                  |880.0          |129.0             |126.0         |NEAR BAY           |0.1465909        |
|21.0                  |7099.0         |1106.0            |1138.0        |NEAR BAY           |0.1557966        |
|52.0                  |1467.0         |190.0             |177.0         |NEAR BAY           |0.1295160        |
|52.0                  |1274.0         |235.0             |219.0         |NEAR BAY           |0.1844584        |
|52.0                  |1627.0         |280.0             |259.0         |NEAR BAY           |0.1720959        |
|52.0                  |919.0          |213.0             |193.0         |NEAR BAY           |0.

Drop the calculated column

In [16]:
# Remove the BEDROOM_RATIO column again and preview the result.
ratio_column = 'BEDROOM_RATIO'
df = df.drop(ratio_column)
df.show()

----------------------------------------------------------------------------------------------
|"HOUSING_MEDIAN_AGE"  |"TOTAL_ROOMS"  |"TOTAL_BEDROOMS"  |"HOUSEHOLDS"  |"OCEAN_PROXIMITY"  |
----------------------------------------------------------------------------------------------
|41.0                  |880.0          |129.0             |126.0         |NEAR BAY           |
|21.0                  |7099.0         |1106.0            |1138.0        |NEAR BAY           |
|52.0                  |1467.0         |190.0             |177.0         |NEAR BAY           |
|52.0                  |1274.0         |235.0             |219.0         |NEAR BAY           |
|52.0                  |1627.0         |280.0             |259.0         |NEAR BAY           |
|52.0                  |919.0          |213.0             |193.0         |NEAR BAY           |
|52.0                  |2535.0         |489.0             |514.0         |NEAR BAY           |
|52.0                  |3104.0         |687.0     

Filter data

In [17]:
# Keep only the rows whose OCEAN_PROXIMITY is one of the listed categories.
proximity_values = ['INLAND', 'ISLAND', 'NEAR BAY']
df = df.filter(F.col('OCEAN_PROXIMITY').in_(proximity_values))
df.show()
# Last expression of the cell: displays the SQL generated for this DataFrame.
df.queries

----------------------------------------------------------------------------------------------
|"HOUSING_MEDIAN_AGE"  |"TOTAL_ROOMS"  |"TOTAL_BEDROOMS"  |"HOUSEHOLDS"  |"OCEAN_PROXIMITY"  |
----------------------------------------------------------------------------------------------
|41.0                  |880.0          |129.0             |126.0         |NEAR BAY           |
|21.0                  |7099.0         |1106.0            |1138.0        |NEAR BAY           |
|52.0                  |1467.0         |190.0             |177.0         |NEAR BAY           |
|52.0                  |1274.0         |235.0             |219.0         |NEAR BAY           |
|52.0                  |1627.0         |280.0             |259.0         |NEAR BAY           |
|52.0                  |919.0          |213.0             |193.0         |NEAR BAY           |
|52.0                  |2535.0         |489.0             |514.0         |NEAR BAY           |
|52.0                  |3104.0         |687.0     

{'queries': ['SELECT "HOUSING_MEDIAN_AGE", "TOTAL_ROOMS", "TOTAL_BEDROOMS", "HOUSEHOLDS", "OCEAN_PROXIMITY" FROM HOUSING WHERE "OCEAN_PROXIMITY" IN (\'INLAND\', \'ISLAND\', \'NEAR BAY\')'],
 'post_actions': []}

Aggregate & sort data

In [18]:
# Average HOUSEHOLDS per OCEAN_PROXIMITY value, ordered by ascending average.
avg_households = F.avg('HOUSEHOLDS').as_('AVG_HOUSEHOLDS')
df = (
    df.group_by(['OCEAN_PROXIMITY'])
    .agg([avg_households])
    .sort(F.col('AVG_HOUSEHOLDS').asc())
)
df.show()

----------------------------------------
|"OCEAN_PROXIMITY"  |"AVG_HOUSEHOLDS"  |
----------------------------------------
|ISLAND             |276.6000000       |
|INLAND             |477.4475653       |
|NEAR BAY           |488.6161572       |
----------------------------------------



Save the Snowpark DataFrame to a table and convert it to a pandas DataFrame

In [22]:
from IPython.display import display

# Write the DataFrame to the HOUSING_SNOWPARK table using "overwrite" mode.
writer = df.write.mode("overwrite")
writer.save_as_table("HOUSING_SNOWPARK")

# Convert the same DataFrame to pandas and render it in the notebook.
dfp = df.to_pandas()
display(dfp)

Unnamed: 0,OCEAN_PROXIMITY,AVG_HOUSEHOLDS
0,ISLAND,276.6
1,INLAND,477.4475653
2,NEAR BAY,488.6161572
