In [63]:
from pyspark.sql.functions import col, input_file_name, current_timestamp, regexp_extract
import re

StatementMeta(, 3c003e81-058f-46a9-8acb-dbb089acfbc7, 65, Finished, Available, Finished)

In [64]:
# parameter cell: the top-level folder that is listed for csv sub-folders, and the schema prefix for the tables
# (the full table name is assembled later, in the processing loop, as dbschema + '.' + the cleaned folder name)
topfolder = "Files/jd_sharepoint/MLB"
dbschema = "dbo"

StatementMeta(, 3c003e81-058f-46a9-8acb-dbb089acfbc7, 66, Finished, Available, Finished)

In [65]:
# get list of folders in topfolder; each entry is consumed later through its .name and .path attributes
# NOTE(review): presumably ls() also returns plain files sitting directly in topfolder, in which case
# each of them would be loaded as its own table -- confirm whether a directory-only filter is wanted
foldernames = notebookutils.fs.ls(topfolder)

StatementMeta(, 3c003e81-058f-46a9-8acb-dbb089acfbc7, 67, Finished, Available, Finished)

In [66]:
# clean the column names (replace every non-alphanumeric character with an underscore) using this function...special chars will break parquet columns
def clean_column_name(name):
    """Return ``name`` with every character outside ``[A-Za-z0-9]`` replaced by ``_``.

    The result is used both for DataFrame column names and for the table-name
    part of the SQL statements, so it must not let separators such as
    ``;`` or ``=`` through.

    Args:
        name: Raw column or folder name (str).

    Returns:
        A string of the same length containing only ASCII letters, digits
        and underscores.
    """
    # The previous character class contained a stray "0-True" fragment, which
    # created the range "0-T" and therefore whitelisted ":;<=>?@" by accident.
    return re.sub(r'[^a-zA-Z0-9]', '_', name)

StatementMeta(, 3c003e81-058f-46a9-8acb-dbb089acfbc7, 68, Finished, Available, Finished)

In [67]:
# Rebuild one Delta table per entry returned for the top folder: read the csv
# files under the entry's path, sanitize the column names, stamp metadata
# columns, then overwrite the table named after the entry.
for f in foldernames:
    # table name = schema + sanitized entry name (it is interpolated into the SQL text below)
    tablename = dbschema + '.' + clean_column_name(f.name)
    tablepath = f.path
    print(f"Now processing {tablename}...")

    # drop the table (and its "_bad" companion) if it already exists
    spark.sql(f"drop table if exists {tablename};")
    spark.sql(f"drop table if exists {tablename}_bad;")

    # read data from csv folder, infer the schema, be permissive and add a field indicating whether the data is corrupted
    # NOTE(review): per the Spark CSV reader docs, PERMISSIVE mode keeps corrupt records only when the
    # corrupt-record field is part of a user-defined schema; with inferSchema the "_corrupt_record"
    # column may never be present, which would make the corrupt-row branch below unreachable -- confirm
    df = spark.read.format("csv") \
        .option("mode", "PERMISSIVE") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .option("columnNameOfCorruptRecord", "_corrupt_record") \
        .load(tablepath)

    # apply the cleaning function to all columns
    new_column_names = [clean_column_name(c) for c in df.columns]
    df = df.toDF(*new_column_names)

    # add metadata columns: ingestion timestamp and the "Files/...csv" portion of the source path.
    # The path is extracted straight from input_file_name() instead of going through a temporary
    # "source_file" column, which used to overwrite and then drop any real csv column of that name.
    df = df.withColumn("_date_ingested_utc", current_timestamp()) \
        .withColumn("_source_file", regexp_extract(input_file_name(), r"(Files\/.*\.csv)", 1))

    # safely handle corrupt rows only if the column exists
    if "_corrupt_record" in df.columns:
        bad_df = df.filter(col("_corrupt_record").isNotNull())
        bad_count = bad_df.count()
        print(f"Corrupt rows found: {bad_count}")
        bad_df.show(20, truncate=False)
        # keep the rejected rows next to the main table for later inspection
        bad_df.write.format("delta").mode("overwrite").saveAsTable(f"{tablename}_bad")
    else:
        print("     No malformed rows were captured (or none occurred)")


    # write df to a table using overwrite (we will overwrite each time as new data is added to the sharepoint folder)
    (
        df
        .write.format("delta")
        .mode("overwrite")
        .saveAsTable(tablename)
    )

    print(f"...{tablename} processed successfully!")

StatementMeta(, 3c003e81-058f-46a9-8acb-dbb089acfbc7, 69, Finished, Available, Finished)

Now processing dbo.allstarfull...
     No malformed rows were captured (or none occurred).
...dbo.allstarfull processed successfully!
Now processing dbo.batting...
     No malformed rows were captured (or none occurred).
...dbo.batting processed successfully!
Now processing dbo.fielding...
     No malformed rows were captured (or none occurred).
...dbo.fielding processed successfully!
Now processing dbo.homegames...
     No malformed rows were captured (or none occurred).
...dbo.homegames processed successfully!
Now processing dbo.parks...
     No malformed rows were captured (or none occurred).
...dbo.parks processed successfully!
Now processing dbo.people...
     No malformed rows were captured (or none occurred).
...dbo.people processed successfully!
Now processing dbo.teams...
     No malformed rows were captured (or none occurred).
...dbo.teams processed successfully!
