# Databricks Project Setup

This notebook is part of the OMOP CDM series available at [GitHub](https://github.com/databricks-industry-solutions/omop-cdm).

For more details, visit [Databricks Blog](https://www.databricks.com/blog/2021/07/19/unlocking-the-power-of-health-data-with-a-modern-data-lakehouse.html).

## Set Project Name

In [None]:
import mlflow

# Project to configure; it is used to look up the default raw-data path
# and to name the MLflow experiment and storage folders.
project_name = "omop-cdm-100K"

In [None]:
# Registry of raw-data locations, keyed by project name.
# A project's entry is used when no explicit data path is supplied.
project_data_paths = {
    "omop-cdm-100K": "s3://hls-eng-data-public/data/rwe/all-states-90K/",
    "omop-cdm-10K": "s3://hls-eng-data-public/data/synthea/",
    "psm": "s3://hls-eng-data-public/data/rwe/dbx-covid-sim/",
}

In [None]:
import mlflow

class SolAccUtil:
    """Resolve and hold the storage paths and MLflow experiment of a project.

    Constructing an instance queries Spark for the current user, derives the
    base, delta and raw-data paths for the project, fetches (or creates) the
    MLflow experiment ``/Users/<user>/<project_name>`` and stores all of it
    in the ``settings`` dict.
    """

    def __init__(self, project_name, data_path=None, base_path=None):
        """Build the ``settings`` dict for ``project_name``.

        Args:
            project_name: Project identifier. Surrounding whitespace is
                stripped and inner spaces are replaced with ``-`` before use.
            data_path: Location of the raw data. When ``None`` the path is
                looked up in the module-level ``project_data_paths`` mapping.
            base_path: Root folder for project output. When ``None`` it
                defaults to ``/home/<user>/health-lakehouse``.

        Raises:
            KeyError: If ``data_path`` is ``None`` and the normalized project
                name has no entry in ``project_data_paths``.
        """
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.appName("OMOP531").getOrCreate()

        user = spark.sql("SELECT current_user()").collect()[0][0]
        project_name = project_name.strip().replace(' ', '-')
        self.settings = {}

        if base_path is None:
            base_path = f'/home/{user}/health-lakehouse'

        # Only consult the registry when the caller gave no explicit path.
        if data_path is None:
            data_path = self._lookup_data_path(project_name, project_data_paths)

        delta_path = f'{base_path}/{project_name}/delta'

        # Reuse the user's experiment for this project, creating it on first use.
        experiment_name = f'/Users/{user}/{project_name}'
        experiment = mlflow.get_experiment_by_name(experiment_name)
        if not experiment:
            experiment_id = mlflow.create_experiment(experiment_name)
            experiment = mlflow.get_experiment(experiment_id)

        self.settings['base_path'] = base_path
        self.settings['delta_path'] = delta_path
        self.settings['data_path'] = data_path
        self.settings['experiment_name'] = experiment.name
        self.settings['experiment_id'] = experiment.experiment_id
        self.settings['artifact_location'] = experiment.artifact_location
        self.settings['tags'] = experiment.tags

    @staticmethod
    def _lookup_data_path(project_name, known_paths):
        """Return the path registered for ``project_name`` in ``known_paths``.

        Raises:
            KeyError: If the project is not registered; the message names the
                project and lists the registered alternatives.
        """
        try:
            return known_paths[project_name]
        except KeyError:
            registered = ', '.join(sorted(map(str, known_paths)))
            raise KeyError(
                f"No default data path for project '{project_name}'; "
                f"pass data_path explicitly or use one of: {registered}"
            ) from None

    def print_info(self):
        """Print every entry of ``settings`` as ``key: value``, one per line."""
        for key, val in self.settings.items():
            print(f"{key}: {val}")

    def display_data(self):
        """List the files under the configured data path, or report that none exist."""
        from pyspark.sql.functions import lit
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.getOrCreate()
        # NOTE(review): relies on the runtime accepting `SHOW FILES IN '<path>'`;
        # confirm this statement is supported on the target cluster.
        files = spark.sql(f"SHOW FILES IN '{self.settings['data_path']}'")
        if files.count() == 0:
            # NOTE(review): the message names load_remote_data, which this class
            # does not define in the visible code - confirm where it lives.
            print("No data available. Please run load_remote_data(<url for the data>)")
        else:
            print("Data available in", self.settings['data_path'])
            # Tag each listed file with the experiment name before showing it.
            files.withColumn("project", lit(self.settings["experiment_name"])).show()


In [None]:
# Resolve paths and the MLflow experiment for the selected project
project_settings = SolAccUtil(project_name)

In [None]:
# Persist the resolved settings as JSON under /tmp for later access
import json

config_file = f'/tmp/{project_name}_configs.json'
with open(config_file, 'w') as out:
    # Serialize to a string first, then write it in a single call.
    out.write(json.dumps(project_settings.settings, indent=4))

print("Configurations saved.")

In [None]:
# Display project settings: prints each key/value pair held in
# project_settings.settings, one per line
project_settings.print_info()