Refer to Apache Spark documentation on CSV files: https://spark.apache.org/docs/3.5.4/sql-data-sources-csv.html

In [0]:
from pyspark.sql.types import *

## Reading in a CSV file without specifying the schema

In [0]:
# Location of the raw ACS profile CSV inside the bronze ingest volume.
volume_path = "/Volumes/de_with_databricks/bronze/ingest"
acs_profile_csv_path = f"{volume_path}/ACS_Profile_Data_2022.csv"

# No schema is passed here; only the delimiter and the header flag are set.
acs_profile_df = (
    spark.read
    .option("delimiter", ",")
    .option("header", True)
    .csv(acs_profile_csv_path)
)
display(acs_profile_df)

In [0]:
# Show the schema Spark assigned to the DataFrame read without an explicit
# schema. NOTE(review): no inferSchema option was set on that read, so the
# CSV reader is expected to type every column as string - confirm in the output.
acs_profile_df.schema

## Reading in a CSV file using a schema definition

In [0]:
# Explicit schema for the ACS profile CSV. Every column is nullable; the
# (name, type) pairs below are expanded into StructFields in file order.
asc_profile_schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("level", StringType()),
            ("State_ID", StringType()),
            ("state", StringType()),
            ("FIPS", StringType()),
            ("county", StringType()),
            ("measure", StringType()),
            ("year", IntegerType()),
            ("period", StringType()),
            ("numerator", StringType()),
            ("denominator", StringType()),
            ("value", DoubleType()),
            ("label", StringType()),
        )
    ]
)

# Same read as before, but with the column types fixed by the schema above.
acs_profile_df = (
    spark.read
    .options(delimiter=",", header=True)
    .csv(acs_profile_csv_path, schema=asc_profile_schema)
)

# Equivalent form using the generic reader API:
#   acs_profile_df = (
#       spark.read.format("csv")
#       .option("header", "true")
#       .load(acs_profile_csv_path, schema=asc_profile_schema)
#   )

display(acs_profile_df.limit(10))

In [0]:
# Show the schema again: the DataFrame was re-read with asc_profile_schema,
# which declares `year` as integer and `value` as double.
acs_profile_df.schema

In [0]:
%sh
# Print the first four lines of the raw CSV file to see what follows the header line.
head -n 4 /Volumes/de_with_databricks/bronze/ingest/ACS_Profile_Data_2022.csv


## Removing the sub-header row from the ACS Profile CSV file - Method 1

In [0]:
# Method 1: ask the reader to skip the first two rows of the file. No header
# option is set on this read, so those two rows are presumably the header
# line and the sub-header line.
# NOTE(review): `skipRows` is not listed among the Apache Spark CSV options;
# presumably a Databricks reader option - confirm for the target runtime.
acs_profile_skip_rows_df = (
    spark.read.format("csv")
    .option("skipRows", 2)
    .load(acs_profile_csv_path, schema=asc_profile_schema)
)
display(acs_profile_skip_rows_df.limit(10))


## Removing the sub-header row from the ACS Profile CSV file - Method 2

In [0]:
from pyspark.sql.functions import monotonically_increasing_id

# Ids of the rows to discard. Only id 0 is listed.
# NOTE(review): this assumes the sub-header is the row that receives id 0;
# monotonically_increasing_id() yields unique, increasing ids but not
# consecutive ones - confirm the first data row always gets 0.
rows_to_remove = [0]

# Method 2: read with the header consumed, tag every row with a generated id,
# then filter out the unwanted ids.
acs_profile_remove_rows_df = (
    spark.read
    .options(delimiter=",", header=True)
    .csv(acs_profile_csv_path, schema=asc_profile_schema)
    .withColumn("id", monotonically_increasing_id())
)
acs_profile_remove_rows_df = acs_profile_remove_rows_df.filter(
    ~acs_profile_remove_rows_df["id"].isin(rows_to_remove)
)

In [0]:
# Preview the first 10 rows that remain after filtering out the row with id 0.
display(acs_profile_remove_rows_df.limit(10))

In [0]:
# `id` exists only to locate the sub-header row. Drop it so the helper column
# is not persisted into the bronze table, nor exposed through the temp view
# that is later registered from this same DataFrame.
acs_profile_df = acs_profile_remove_rows_df.drop("id")

# Write the cleaned data as a managed table partitioned by state and county,
# replacing any previous contents.
(
    acs_profile_df.write
    .partitionBy("state", "county")
    .mode("overwrite")
    # Let the overwrite also replace the schema of a table left behind by an
    # earlier run that still carried `id` (assumes the default Delta format).
    .option("overwriteSchema", "true")
    .saveAsTable("de_with_databricks.bronze.acs_profile1")
)

In [0]:
%sql
-- Preview 10 rows of the acs_profile1 table.
SELECT * FROM de_with_databricks.bronze.acs_profile1 LIMIT 10

In [0]:
# Register the DataFrame as a temporary view so SQL statements can query it
# by the name `acs_profile_view`.
acs_profile_df.createOrReplaceTempView("acs_profile_view")

In [0]:
%sql
-- Build acs_profile2 from the temp view, partitioned by state and county.
-- OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE fails once the
-- table exists, unlike the DataFrame write above, which uses overwrite mode.
CREATE OR REPLACE TABLE de_with_databricks.bronze.acs_profile2 PARTITIONED BY (state, county) AS
SELECT * FROM acs_profile_view;

In [0]:
%sql
-- Preview 10 rows of the acs_profile2 table.
SELECT * FROM  de_with_databricks.bronze.acs_profile2 LIMIT 10

In [0]:
# Same CTAS as the %sql cell, issued from Python through spark.sql().
# OR REPLACE keeps the cell re-runnable: a plain CREATE TABLE raises once the
# table already exists.
create_table_sql = """
CREATE OR REPLACE TABLE de_with_databricks.bronze.acs_profile3
PARTITIONED BY (state, county) AS
SELECT * FROM acs_profile_view;
"""
spark.sql(create_table_sql)

In [0]:
%sql
-- Preview 10 rows of the acs_profile3 table.
SELECT * FROM de_with_databricks.bronze.acs_profile3 LIMIT 10