Connect to Snowflake w/ the Python Connector and SNOWSQL config file data

In [1]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-scikit-learn/blob/main/2_data_exploration_transformation.ipynb

import os, sys, configparser
import numpy as np
from IPython.display import display
import snowflake.connector

parser = configparser.ConfigParser()
parser.read(os.path.join(os.path.expanduser('~'), ".snowsql/config"))
section = "connections.test_conn"
conn = snowflake.connector.connect(
    account=parser.get(section, "accountname"),
    user=parser.get(section, "username"),
    password=parser.get(section, "password"),
    database=parser.get(section, "database"),
    schema=parser.get(section, "schema"))

Load all HOUSING table records in memory

In [2]:
query = "SELECT * FROM test.public.housing"
df = conn.cursor().execute(query).fetch_pandas_all()
size = np.round(sys.getsizeof(df) / (1024.0**2), 2)
print(f'Memory: {size} MB')
display(df)

Memory: 2.7 MB


Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME,MEDIAN_HOUSE_VALUE,OCEAN_PROXIMITY
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


Add calculated column and select some columns

In [3]:
df['BEDROOM_RATIO'] = df['TOTAL_BEDROOMS'] / df['TOTAL_ROOMS']
df = df[['HOUSING_MEDIAN_AGE', 'TOTAL_ROOMS', 'TOTAL_BEDROOMS', 'HOUSEHOLDS', 'OCEAN_PROXIMITY', 'BEDROOM_RATIO']]
display(df)

Unnamed: 0,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,HOUSEHOLDS,OCEAN_PROXIMITY,BEDROOM_RATIO
0,41.0,880.0,129.0,126.0,NEAR BAY,0.146591
1,21.0,7099.0,1106.0,1138.0,NEAR BAY,0.155797
2,52.0,1467.0,190.0,177.0,NEAR BAY,0.129516
3,52.0,1274.0,235.0,219.0,NEAR BAY,0.184458
4,52.0,1627.0,280.0,259.0,NEAR BAY,0.172096
...,...,...,...,...,...,...
20635,25.0,1665.0,374.0,330.0,INLAND,0.224625
20636,18.0,697.0,150.0,114.0,INLAND,0.215208
20637,17.0,2254.0,485.0,433.0,INLAND,0.215173
20638,18.0,1860.0,409.0,349.0,INLAND,0.219892


Drop calculated column

In [4]:
df = df.drop(columns=['BEDROOM_RATIO'])
display(df)

Unnamed: 0,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,HOUSEHOLDS,OCEAN_PROXIMITY
0,41.0,880.0,129.0,126.0,NEAR BAY
1,21.0,7099.0,1106.0,1138.0,NEAR BAY
2,52.0,1467.0,190.0,177.0,NEAR BAY
3,52.0,1274.0,235.0,219.0,NEAR BAY
4,52.0,1627.0,280.0,259.0,NEAR BAY
...,...,...,...,...,...
20635,25.0,1665.0,374.0,330.0,INLAND
20636,18.0,697.0,150.0,114.0,INLAND
20637,17.0,2254.0,485.0,433.0,INLAND
20638,18.0,1860.0,409.0,349.0,INLAND


Filter data

In [5]:
#df = (df[(df['OCEAN_PROXIMITY'] == 'INLAND')
#    | (df['OCEAN_PROXIMITY'] == 'ISLAND')
#    | (df['OCEAN_PROXIMITY'] == 'NEAR BAY')])
filter = df['OCEAN_PROXIMITY'].isin(['INLAND', 'ISLAND', 'NEAR BAY'])
df = df[filter]
display(df)

Unnamed: 0,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,HOUSEHOLDS,OCEAN_PROXIMITY
0,41.0,880.0,129.0,126.0,NEAR BAY
1,21.0,7099.0,1106.0,1138.0,NEAR BAY
2,52.0,1467.0,190.0,177.0,NEAR BAY
3,52.0,1274.0,235.0,219.0,NEAR BAY
4,52.0,1627.0,280.0,259.0,NEAR BAY
...,...,...,...,...,...
20635,25.0,1665.0,374.0,330.0,INLAND
20636,18.0,697.0,150.0,114.0,INLAND
20637,17.0,2254.0,485.0,433.0,INLAND
20638,18.0,1860.0,409.0,349.0,INLAND


Aggregate & sort data

In [6]:
df = df.groupby(['OCEAN_PROXIMITY'])['HOUSEHOLDS'].agg(AVG_HOUSEHOLDS='mean')
df = df.reset_index(drop=True)
df = df.sort_values('AVG_HOUSEHOLDS')
display(df)

Unnamed: 0,AVG_HOUSEHOLDS
1,276.6
0,477.447565
2,488.616157


Dump pandas DataFrame into a Snowflake table

In [7]:
# see https://community.snowflake.com/s/article/How-to-use-Write-Pandas-method-to-create-a-table-when-it-does-not-exist
from snowflake.connector.pandas_tools import write_pandas
write_pandas(conn, df, 'HOUSING_PANDAS', auto_create_table=True, overwrite=True)

  write_pandas(conn, df, 'HOUSING_PANDAS', auto_create_table=True, overwrite=True)


(True,
 1,
 3,
 [('ezpwmrsoar/file0.txt', 'LOADED', 3, 3, 1, 0, None, None, None, None)])