# Databricks Test Notebook for Data Storage and Table Creation

### Finding the connection details
- [Finding the connection details to Databricks](https://docs.databricks.com/en/integrations/compute-details.html)
- [Creating a token](https://docs.databricks.com/en/dev-tools/auth/pat.html)
- [Using OAuth with a service principal](https://docs.databricks.com/en/dev-tools/auth/oauth-m2m.html)




In [None]:
import os
from pathlib import Path

import pandas as pd

from msk_cdm.minio import MinioAPI
from msk_cdm.databricks import DatabricksAPI


## User configurations
- Databricks connection configuration
- MinIO connection configuration
- Location of data on MinIO
- Location of data to be written on Databricks volume
- Table specifications for data written on volume

In [None]:
# Databricks configuration
fname_databricks_config = '/gpfs/mindphidata/cdm_repos/databricks_env_test_group.txt'
catalog = 'cbioportal_test'
schema = 'cdm_test'
volume = 'cdm_write_volume'
overwrite = True  # forwarded as `overwrite=` to the volume write calls below

# MinIO configuration
fname_minio_config = '/gpfs/mindphidata/fongc2/minio_env.txt'
file_minio = 'demographics/ddp_demographics.tsv'  # object path of the example file
sep = '\t'  # field delimiter of the example file

# Derived values.
# The table is named after the file stem ('ddp_demographics'), and the
# destination on the volume mirrors the MinIO object path under
# /Volumes/<catalog>/<schema>/<volume>.
table = Path(file_minio).stem
dir_volume = os.path.join('/Volumes', catalog, schema, volume)
fname_save_databricks = os.path.join(dir_volume, file_minio)



### Create a dictionary describing the Spark SQL table to create from your dataframe

In [None]:
# Table specification handed to the Databricks API calls below
# (`create_table_from_volume` and, optionally, `write_db_obj`).
dict_database_table_info = {
    'catalog': catalog,                    # catalog that will hold the table
    'schema': schema,                      # schema within the catalog
    'volume_path': fname_save_databricks,  # file on the volume backing the table
    'table': table,                        # table name (stem of the source file)
    'sep': sep                             # field delimiter of the file on the volume
}

## Get Data from MinIO as an Example

In [None]:
# Fetch the example object from MinIO and parse it into a dataframe.
obj_minio = MinioAPI(fname_minio_env=fname_minio_config)
obj = obj_minio.load_obj(path_object=file_minio)
# Parse with the configured delimiter (`sep`) rather than a hard-coded tab, so
# that reading stays consistent with the delimiter used when the data is
# written to the volume and registered as a table.
df = pd.read_csv(obj, sep=sep)

In [None]:
# Preview the first rows of the dataframe. The trailing `;` suppresses the
# cell's output in Jupyter -- presumably to keep record-level rows out of the
# saved notebook (confirm); remove it to display the preview.
df.head();

## Write Dataframe into Databricks
### Instantiate Databricks API module

In [None]:
# Create the Databricks client from the environment file set in the user
# configuration cell; all volume and table operations below go through it.
obj_db = DatabricksAPI(fname_databricks_env=fname_databricks_config)




### Write data to volume and create table in two separate steps
This demonstrates that the data can first be written to the volume and then, optionally at a later point, be exposed as a table.

In [None]:
# Step 1: write the dataframe to the volume only. No table specification is
# passed here, so table creation is left to the separate call that follows.
obj_db.write_db_obj(
    df=df,
    volume_path=fname_save_databricks,
    sep=sep,
    overwrite=overwrite
)

In [None]:
# Step 2: now that the object exists on the volume, create a table from it
# (as described by `dict_database_table_info`) for Spark SQL use.
obj_db.create_table_from_volume(
    dict_database_table_info=dict_database_table_info
)

### Do the same, but in one step
This approach is preferable when building a pipeline, ensuring that the table and the object on the volume are always in sync.


In [None]:
# One-step variant: same write as before, but the table specification is passed
# along so the volume object and its table are handled in a single call.
obj_db.write_db_obj(
    df=df,
    volume_path=fname_save_databricks,
    sep=sep,
    overwrite=overwrite,
    dict_database_table_info=dict_database_table_info
)

## Query the data just uploaded to Databricks
### Using SQL 
Analogous to Dremio

In [None]:
# Select everything from the table just created, addressed by its fully
# qualified name <catalog>.<schema>.<table>.
g = f"select  * from {catalog}.{schema}.{table}"
# Echo the final SQL text as the cell output.
g

In [None]:
# Run the SQL statement through the Databricks client and keep the result
# (presumably a pandas DataFrame, given the `.describe()` call that follows -- confirm).
df_demo_sql = obj_db.query_from_sql(sql=g)

In [None]:
# Show summary statistics of the SQL query result as a quick sanity check.
df_demo_sql.describe()

### Using the volume download process 
Analogous to MinIO

In [None]:
# Read/download the file straight from the volume, bypassing the SQL table.
# The configured delimiter (`sep`) is used instead of a hard-coded tab so the
# read always matches how the file was written.
df_demo_vol = obj_db.read_db_obj(volume_path=fname_save_databricks, sep=sep)

In [None]:
# Preview the first rows of the data read back from the volume.
df_demo_vol.head()