In [0]:
# Restart the notebook's Python process via Databricks utilities. This is the first
# cell on purpose: nothing defined later in the notebook exists yet, so nothing is lost.
dbutils.library.restartPython() 

#### Connect to storage & load libraries

In [0]:
# Service-principal credentials are read from the "data-bricks" secret scope so that
# no secret value is ever written into the notebook itself.
ServicePrincipalID = dbutils.secrets.get(scope="data-bricks", key="SPID")
ServicePrincipalKey = dbutils.secrets.get(scope="data-bricks", key="SPKey")
DirectoryID = dbutils.secrets.get(scope="data-bricks", key="DirectoryID")

# OAuth token endpoint of the Azure AD tenant identified by DirectoryID.
Directory = f"https://login.microsoftonline.com/{DirectoryID}/oauth2/token"

container_name = 'my-csv'
storage_account_name = 'mainstorageaccountv2'

# ABFS URL of the container, the DBFS path it is mounted at, and the sub-folder
# (relative to the mount) used later for table files.
url=f'abfss://{container_name}@{storage_account_name}.dfs.core.windows.net'
mnt_path = '/mnt/test'
table_path = 'tables'

# Hadoop ABFS settings for OAuth client-credentials authentication.
configs = {
  "fs.azure.account.auth.type": "OAuth",
  "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
  "fs.azure.account.oauth2.client.id": ServicePrincipalID,
  "fs.azure.account.oauth2.client.secret": ServicePrincipalKey,
  "fs.azure.account.oauth2.client.endpoint": Directory,
  "fs.azure.createRemoteFileSystemDuringInitialization": "false"
}

# dbutils.fs.mount raises if the mount point is already in use, which breaks re-runs
# of this notebook whenever a previous run did not reach the final unmount cell.
# Look at the current mounts first: keep a matching mount, replace a stale one.
current_mounts = {m.mountPoint: m.source for m in dbutils.fs.mounts()}

if current_mounts.get(mnt_path) != url:
  if mnt_path in current_mounts:
    # Same mount point but a different source: drop it before remounting.
    dbutils.fs.unmount(mnt_path)
  dbutils.fs.mount(
    source = url,
    mount_point = mnt_path,
    extra_configs = configs
  )

In [0]:
# Import Libraries from pyspark
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import lit, col, date_format, to_date, add_months, coalesce,sum,substring,date_add,expr,datediff
from datetime import *

In [0]:
# List the root of the mount to confirm the storage container is reachable.
dbutils.fs.ls(mnt_path)

#### Create database in Databricks

In [0]:
# Scratch database used by the rest of the notebook (dropped again in the cleanup section).
db_name = 'test'

# Create the database only if it is missing, so the cell can be re-run safely.
create_db_sql = f"""CREATE DATABASE IF NOT EXISTS {db_name} COMMENT 'This is for test purposes' LOCATION '/user';"""
spark.sql(create_db_sql)

# Print the database metadata without truncating long values.
db_details = spark.sql(f'DESCRIBE DATABASE EXTENDED {db_name}')
db_details.show(truncate=False)

#### Drop database (just in case)

In [0]:
#spark.sql(f"""DROP DATABASE IF EXISTS {db_name};""")

#### Read CSV from storage

In [0]:
# source https://sdm.lbl.gov/fastbit/data/samples.html
# Location of the input CSV inside the mounted container.
file_path = f'{mnt_path}/input/star2002-full.csv'
# Fully qualified <database>.<table> name used when registering the Delta table.
# NOTE(review): the file is "star2002" but the table is "star2000" - confirm this is intended.
db_table = f'{db_name}.star2000'

In [0]:
# Explicit schema for the CSV, described as (column name, Spark type, nullable).
# The order here is the order of the fields in the resulting StructType.
_column_specs = [
  ('antiNucleus', IntegerType, False),
  ('eventFile', DoubleType, True),
  ('eventNumber', DoubleType, True),
  ('eventTime', DoubleType, True),
  ('histFile', DoubleType, True),
  ('multiplicity', DoubleType, True),
  ('NaboveLb', DoubleType, True),
  ('NbelowLb', DoubleType, True),
  ('NLb', DoubleType, True),
  ('primaryTracks', DoubleType, True),
  ('prodTime', DoubleType, True),
  ('Pt', FloatType, True),
  ('runNumber', FloatType, True),
  ('vertexX', FloatType, True),
  ('vertexY', FloatType, True),
  ('vertexZ', FloatType, True),
]

schema = StructType([
  StructField(column_name, spark_type(), nullable=is_nullable)
  for column_name, spark_type, is_nullable in _column_specs
])

In [0]:
# Configure a CSV reader for a comma-separated file without a header row.
# The explicit schema defined above is applied instead of inferring column types.
csv_reader = (
  spark.read
  .format("csv")
  .options(header='false', delimiter=',')
  .schema(schema)
)

df_csv = csv_reader.load(file_path)

In [0]:
# Spark DataFrames are immutable: na.drop() returns a NEW DataFrame without the rows
# that contain nulls. The result has to be assigned, otherwise the call has no effect
# on what is written below. Re-running this cell is harmless (dropping twice is a no-op).
df_csv = df_csv.na.drop()

In [0]:
# Print the column names and types of the loaded DataFrame.
df_csv.printSchema()

In [0]:
# Preview the first 5 rows as a sanity check on the parsed values.
df_csv.show(5)

#### Repartition

In [0]:
#df_csv = df_csv.repartition("antiNucleus")

In [0]:
# Column used to partition the Delta files on write and in the table definition below.
partition_by = 'antiNucleus'

In [0]:
#df = sqlContext.createDataFrame(df_csv.collect(), schema=schema)

#### Write the dataframe to storage as a Delta table

In [0]:
# Folder on the mount that holds the Delta files for this table; with the values set
# earlier in the notebook this resolves to '/mnt/test/tables/test.star2000'.
table_file_path = f'{mnt_path}/{table_path}/{db_table}'

In [0]:
# Write the DataFrame as Delta files, partitioned by `partition_by`, replacing whatever
# is already stored at the target path.
delta_writer = (
  df_csv.write
  .format('delta')
  .mode('overwrite')
  .partitionBy(partition_by)
  # overwriteSchema: allows replacing the stored schema if the table already exists
  # and the type of some column has changed.
  .options(mergeSchema='true', overwriteSchema='true')
)

delta_writer.save(table_file_path)

In [0]:
#spark.sql("""drop table if exists {}""".format(db_table))

In [0]:
#df_csv.filter(df_csv.antiNucleus.isNull()).show()

In [0]:
# Create table in Databricks

# DDL template with three positional placeholders: table name, Delta location and
# partition column. The column list mirrors the DataFrame schema defined earlier.
create_table_template = """create table if not exists {} (
antiNucleus	INTEGER ,
eventFile	DOUBLE,
eventNumber	DOUBLE,
eventTime	DOUBLE,
histFile	DOUBLE,
multiplicity	DOUBLE,
NaboveLb	DOUBLE,
NbelowLb	DOUBLE,
NLb		DOUBLE,
primaryTracks	DOUBLE,
prodTime	DOUBLE,
Pt		FLOAT,
runNumber	FLOAT,
vertexX		FLOAT,
vertexY		FLOAT,
vertexZ		FLOAT
) 
using delta location '{}' 
partitioned by ({})"""

# Register the Delta files written above as a table in the database.
create_table_sql = create_table_template.format(db_table, table_file_path, partition_by)
spark.sql(create_table_sql)

In [0]:
# Read the registered table back and keep only the rows where antiNucleus equals 1.
is_antinucleus_one = F.col('antiNucleus') == 1
df = spark.read.table(db_table).filter(is_antinucleus_one)

# Preview 5 rows (long cell values are truncated, which is show()'s default).
df.show(5)

In [0]:
# Summary statistics (count, mean, stddev, min, max) for every column of the
# antiNucleus == 1 subset, rendered with the Databricks display helper.
display(df.describe())

summary,antiNucleus,eventFile,eventNumber,eventTime,histFile,multiplicity,NaboveLb,NbelowLb,NLb,primaryTracks,prodTime,Pt,runNumber,vertexX,vertexY,vertexZ
count,1914455.0,1914455.0,1914455.0,1914455.0,1914455.0,1914455.0,1914455.0,1914455.0,1914455.0,1914455.0,1914455.0,1914455.0,1914455.0,1914455.0,1914455.0,1914455.0
mean,1.0,1398887.5412511656,26193.61024573573,20012577.111241035,1399649.142216453,3406.696779501216,0.747292571515131,0.0192310605368107,501.68830554909886,1188.7437871352422,20020641.974674325,230.3695515849192,2434862.317886813,-0.177630271745753,0.4380967864773584,0.2906302617067617
stddev,0.0,568151.230218667,44757.20877800337,5209.264228815674,567241.0966940541,1978.075752815488,1.6046114571584256,0.1720330424193519,396.2549455636793,671.8925955578408,3648.319784977068,20630.417000709564,472822.0789522565,0.8118345462460678,0.5767854198871102,33.099798574748995
min,1.0,0.0,1.0,20010911.035914,0.0,1.0,0.0,0.0,0.0,1.0,20011028.010128,0.0,2253050.0,-48.705025,-38.519547,-502.69006
max,1.0,2761475.0,586066.0,20030208.180455,2761480.0,10513.0,216.0,8.0,3559.0,2929.0,20030322.004355,24420602.0,4039022.0,39.67056,44.209034,622.54144


#### Clear everything

In [0]:
# Remove the table definition from the database.
# NOTE(review): the table was created with an explicit location, so the data files are
# presumably left in place by this statement; they are deleted by the dbutils.fs.rm cell below.
spark.sql("""drop table if exists {}""".format(db_table))

In [0]:
# Drop the scratch database. There is no CASCADE clause, so this relies on the table
# having been dropped in the previous cell - TODO confirm no other tables live here.
spark.sql(f"""DROP DATABASE IF EXISTS {db_name};""")

In [0]:
# Recursively delete the Delta files that were written to table_file_path.
dbutils.fs.rm(table_file_path, recurse=True)

In [0]:
# List the mount root again to check what is left after the cleanup.
dbutils.fs.ls(mnt_path)

In [0]:
# Detach the storage mount that was created at the top of the notebook.
dbutils.fs.unmount(mnt_path) 

#### End of the notebook