# Ingest Zillow
Demonstrates accessing data stored as a file at a specific URL.  The available datasets can be browsed at https://www.zillow.com/research/data/.  To get the URL of a specific file, right-click its Download button and copy the link.

In [0]:
import requests
from io import BytesIO
import pandas as pd

# This URL returns a monthly time series of the number of sales in the US metro regions.
# The URL can be determined by choosing "Copy link" when right-clicking the Download button
# after setting up your desired Data Type and Geography.
url = "https://files.zillowstatic.com/research/public_csvs/sales_count_now/Metro_sales_count_now_uc_sfrcondo_month.csv?t=1757774212"

# Always pass a timeout: without one, requests can wait indefinitely if the server stops responding.
# The tuple is (connect timeout, read timeout) in seconds.
response = requests.get(url, timeout=(10, 60))

# The function raise_for_status throws an exception if the response was not successful.
response.raise_for_status()

# BytesIO is a file-like object wrapping the downloaded bytes, which lets Pandas read the
# content of the response into a dataframe as if it were a file.
file_content = BytesIO(response.content)
sales_count_pdf = pd.read_csv(file_content)
display(sales_count_pdf)


In [0]:
# Hand the Pandas dataframe to the Spark session to obtain a PySpark dataframe.
sales_count_df = spark.createDataFrame(data=sales_count_pdf)

In [0]:
from pyspark.sql.functions import col, sum

# Build one aggregate expression per column that counts its null values, then evaluate
# them all together in a single select.
null_count_exprs = [
    sum(col(name).isNull().cast("int")).alias(name)
    for name in sales_count_df.columns
]
null_counts = sales_count_df.select(null_count_exprs).collect()[0].asDict()

# A column is entirely null when its null count equals the total number of rows.
row_count = sales_count_df.count()
null_columns = [name for name, nulls in null_counts.items() if nulls == row_count]

# Show which columns are being removed (if any) and drop them.
if null_columns:
    display(pd.DataFrame({"all_null_columns": null_columns}))
    sales_count_df = sales_count_df.drop(*null_columns)

display(sales_count_df)

In [0]:

# Create the destination schema if needed.  The 'bronze' prefix of the schema name aligns to the
# raw ingestion tier of the medallion architecture.
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze_examples")

# Save in the bronze_examples schema, replacing any previous copy of the table.
# The set of columns may differ between runs (all-null columns are dropped conditionally, and the
# source file may gain columns over time).  If the target is a Delta table, a plain overwrite is
# rejected when the schema differs, so overwriteSchema lets the new schema replace the old one.
(
    sales_count_df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("bronze_examples.sales_count")
)
