# NOTEBOOK 1: INGEST HOUSING DATA INTO SNOWFLAKE

#### Download the housing dataset

In [None]:
# Download a dataset from the internet; we will then load it into Snowflake
import os
import tarfile
import urllib.request

# Location of the dataset archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Args:
        housing_url: URL of the gzipped tar archive to download.
        housing_path: Directory that receives the downloaded ``housing.tgz``
            and its extracted contents. Created if it does not exist.

    Raises:
        OSError: If the directory cannot be created or the download fails
            (``urllib.error.URLError`` is an ``OSError`` subclass).
        tarfile.TarError: If the downloaded file is not a valid archive or
            an archive member is rejected by the extraction filter.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # The archive comes from the network: when this Python version
        # supports extraction filters, use "data" so members cannot escape
        # housing_path (absolute paths, "..", unsafe links, device files).
        if hasattr(tarfile, "data_filter"):
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

# Download and unpack the archive using the default URL and target directory
# (this performs a network request every time the cell is run).
fetch_housing_data()

In [None]:
# Import Snowpark libraries 
from snowflake.snowpark.session import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import *

# Snowflake connection info is saved in config.py.
# Please change config.py to point to your Snowflake instance before running this step.
# Use a user that has the ACCOUNTADMIN role (or equivalent) so it can create the database, schema, warehouse, etc.

from config import snowflake_conn_prop

import pandas as pd 

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame.

    Args:
        housing_path: Directory containing the extracted ``housing.csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` for that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Load the extracted CSV into a pandas DataFrame; as the last expression in
# the cell, head() displays the first rows as a quick sanity check.
housing = load_housing_data()
housing.head()

Let's configure our Snowpark Session and initialize the database, warehouse, and schema that we will use for the remainder of the quickstart.

In [None]:
# Add the parent directory to the import path so that `utilities.creds` can be
# imported (presumably the package lives one level up — verify repo layout).
import sys
sys.path.append('..')
from utilities.creds import Credentials
from snowflake.snowpark import version
# Show which Snowpark version is installed.
print(version.VERSION)

# Build the session from the instance attributes of Credentials, passed to the
# session builder as a dict of connection parameters.
session = Session.builder.configs(Credentials().__dict__).create()

# Select the role, database, schema and warehouse used for the rest of the
# notebook. NOTE(review): these objects are not created here, so they
# presumably must already exist in the account — confirm.
session.use_role("LEARNINGSNOWPARKROLE")
session.use_database("SCIKIT_LEARN")
session.use_schema("SCIKIT_LEARN.PUBLIC")
session.use_warehouse("LEARNINGSNOWPARKVW")

# Print the active warehouse, database and schema to confirm the context.
print(session.sql('select current_warehouse(), current_database(), current_schema()').collect())

### Write the data into a Snowflake table named `HOUSING_DATA`

Use the **write_pandas** method to create a table from the data in the pandas DataFrame `housing`. The table is created automatically; if it already exists, it is dropped and recreated (the `auto_create_table` and `overwrite` parameters control this).

In [None]:
# Upload the pandas DataFrame `housing` to the HOUSING_DATA table and keep the
# object returned by write_pandas for the check in the next cell.
# NOTE(review): quote_identifiers=False presumably leaves table/column names
# unquoted so Snowflake resolves them case-insensitively — confirm in the
# write_pandas documentation.
snowpark_df = session.write_pandas(housing, "HOUSING_DATA", quote_identifiers=False, auto_create_table=True, overwrite=True)

Check that we have data in the table

In [None]:
# Display rows from the object returned by write_pandas to verify the upload.
snowpark_df.show()

In [None]:
# Close the Snowpark session now that the data has been written.
session.close()