In [0]:
# Purpose: Orchestrate the ingestion of files from the GitHub repo into two /Volumes/ landing paths (batch processing and Auto Loader).
#Step 1: Fetch Latest Files from GitHub

import requests

# Source: base URL for raw file content in the GitHub repo, and the files
# to fetch from it.
github_repo_raw_url = "https://raw.githubusercontent.com/jen-mejarito/medisure_jen/main/data/"
file_names = [
    "claims_batch.csv",
    "claims_stream.json",
    "diagnosis_ref.csv",
    "members.csv",
    "providers.json",
]

# Destinations: one landing path for batch processing and one for Auto Loader.
# NOTE: despite its name, dbfs_landing_path holds a /Volumes/ path.
volume_landing_path = "/Volumes/medisure_jen/bronze/landing_zone/"
dbfs_landing_path = "/Volumes/medisure_jen/bronze/autoloader_landing/"

# Make sure the Auto Loader landing directory is present before any writes.
dbutils.fs.mkdirs(dbfs_landing_path)
print(f"Ensuring DBFS path exists: {dbfs_landing_path}")

#Step 2: Copy Each File from Git to Both Locations
# Every file is downloaded once and written to the batch landing path; the
# streaming file is additionally written to the Auto Loader landing path.
# A failure on one file is reported and recorded, then the loop continues
# with the next file so one bad download does not block the rest.
failed_files = []
for file_name in file_names:
    file_url = f"{github_repo_raw_url}{file_name}"

    print(f"Downloading {file_name} from Git...")
    try:
        # Bound the request so an unresponsive server cannot hang the notebook.
        response = requests.get(file_url, timeout=30)
        response.raise_for_status()  # Raises an exception for HTTP errors
        file_content = response.text

        # 1. Write to Volume (for batch processing)
        # overwrite=True: put() refuses to replace an existing file by
        # default, which would make every re-run of this cell fail.
        volume_target_path = f"{volume_landing_path}{file_name}"
        print(f"Writing {file_name} to Volume: {volume_target_path}")
        dbutils.fs.put(volume_target_path, file_content, overwrite=True)

        # 2. If it's the streaming file, ALSO write it to the Auto Loader path
        if file_name == "claims_stream.json":
            dbfs_target_path = f"{dbfs_landing_path}{file_name}"
            print(f"Writing {file_name} to DBFS for Auto Loader: {dbfs_target_path}")
            dbutils.fs.put(dbfs_target_path, file_content, overwrite=True)

        print(f"Successfully transferred {file_name}\n")

    except requests.exceptions.RequestException as e:
        print(f"Failed to download {file_name}: {e}\n")
        failed_files.append(file_name)
    except Exception as e:
        # Best-effort boundary: report, record, and move on to the next file.
        print(f"An unexpected error occurred with {file_name}: {e}\n")
        failed_files.append(file_name)

print("All files processed from GitHub.")
if failed_files:
    # Surface failures in one place so they are not lost in the per-file log.
    print(f"Files that were NOT transferred: {', '.join(failed_files)}")

#Step 3: Verify Files in Both Locations
# List each landing path so the transfer can be checked by eye. The headers
# print the actual paths being listed instead of a hard-coded description,
# so the output cannot disagree with the configured locations.
print(f"Files in Volume landing zone ({volume_landing_path}):")
display(dbutils.fs.ls(volume_landing_path))

print(f"\nFiles in Auto Loader landing zone ({dbfs_landing_path}):")
display(dbutils.fs.ls(dbfs_landing_path))