In [None]:
# COMMAND ----------

# Databricks job configuration for automated testing
def create_databricks_test_job_config() -> Dict[str, Any]:
    """
    Build the Databricks job payload that runs the CI test-runner notebook.

    The cluster is declared as single-node: the ``singleNode`` cluster
    profile, a ``local[*]`` Spark master, and the ``ResourceClass`` tag all
    say so. A single-node cluster has no workers, so ``num_workers`` is 0;
    asking for workers alongside the single-node settings is contradictory.

    Returns:
        A newly built, JSON-serializable dict on every call. It holds the
        cluster spec, the pinned PyPI libraries, the notebook task with its
        base parameters, the timeout/retry settings and the notification
        targets.
    """
    # Single-node cluster: these three markers and num_workers=0 go together.
    single_node_spark_conf = {
        "spark.sql.adaptive.enabled": "true",
        "spark.sql.adaptive.coalescePartitions.enabled": "true",
        "spark.databricks.cluster.profile": "singleNode",
        "spark.master": "local[*]"
    }

    job_config = {
        "name": "PySpark-Functional-Tests-CI",
        "new_cluster": {
            "spark_version": "12.2.x-scala2.12",
            "node_type_id": "i3.xlarge",
            "num_workers": 0,
            "spark_conf": single_node_spark_conf,
            "custom_tags": {
                "ResourceClass": "SingleNode",
                "Environment": "CI",
                "Project": "PySpark-Functional-Pipeline"
            }
        },
        # Versions are pinned so CI runs are reproducible.
        "libraries": [
            {"pypi": {"package": "pytest==7.2.0"}},
            {"pypi": {"package": "pytest-cov==4.0.0"}},
            {"pypi": {"package": "pyyaml==6.0"}},
            {"pypi": {"package": "great-expectations==0.15.50"}}
        ],
        "notebook_task": {
            "notebook_path": "/Repos/shared/pyspark-pipeline/notebooks/test_runner",
            # Widget values are passed as strings; the notebook converts them.
            "base_parameters": {
                "environment": "ci",
                "test_suite": "full",
                "coverage_threshold": "80"
            }
        },
        "timeout_seconds": 3600,
        "max_retries": 1,
        "email_notifications": {
            "on_failure": ["team@company.com"],
            "on_success": ["team@company.com"]
        },
        # NOTE(review): the webhook URL is a placeholder, and the Jobs API
        # appears to identify webhooks by notification-destination id only;
        # confirm the "url" key is accepted before using this payload.
        "webhook_notifications": {
            "on_failure": [
                {
                    "id": "slack-webhook",
                    "url": "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"
                }
            ]
        }
    }

    return job_config

# Quality gate job configuration
def create_quality_gate_job_config() -> Dict[str, Any]:
    """
    Assemble the job payload for the data-quality gate run.

    The payload describes a one-worker cluster, two pinned PyPI libraries
    and a Python wheel task that invokes the ``quality_check`` entry point
    with staging/strict-mode arguments.

    Returns:
        A newly built dict on every call.
    """
    cluster_spec: Dict[str, Any] = {
        "spark_version": "12.2.x-scala2.12",
        "node_type_id": "i3.xlarge",
        "num_workers": 1,
        "spark_conf": {"spark.sql.adaptive.enabled": "true"},
    }

    # Each requirement string becomes one {"pypi": {"package": ...}} entry.
    pinned_requirements = (
        "great-expectations==0.15.50",
        "delta-lake==2.2.0",
    )

    wheel_task: Dict[str, Any] = {
        "package_name": "pyspark_functional_pipeline",
        "entry_point": "quality_check",
        "parameters": ["--environment", "staging", "--strict-mode"],
    }

    return {
        "name": "Data-Quality-Gates",
        "new_cluster": cluster_spec,
        "libraries": [
            {"pypi": {"package": requirement}}
            for requirement in pinned_requirements
        ],
        "python_wheel_task": wheel_task,
        "timeout_seconds": 1800,
    }

# Test execution notebook template
# Source text of the notebook that the CI job's notebook_task points at.
# It reads its parameters from job widgets, runs pytest with coverage, and
# fails the run by raising when pytest reports a non-zero exit code.
test_runner_notebook = '''
# Databricks notebook source
# MAGIC %md
# MAGIC # Test Runner Notebook
# MAGIC 
# MAGIC This notebook is executed by CI/CD pipelines to run tests in Databricks environment.

# COMMAND ----------

# MAGIC %pip install pytest pytest-cov

# COMMAND ----------

import pytest
import sys
import os
from pyspark.sql import SparkSession

# Get parameters from job
environment = dbutils.widgets.get("environment")
test_suite = dbutils.widgets.get("test_suite")
coverage_threshold = float(dbutils.widgets.get("coverage_threshold"))

print(f"Running tests with environment: {environment}")
print(f"Test suite: {test_suite}")
print(f"Coverage threshold: {coverage_threshold}%")

# COMMAND ----------

# Set up test environment
spark = SparkSession.builder.getOrCreate()

# Add project modules to path
sys.path.append("/Workspace/Repos/shared/pyspark-pipeline/src")

# Repo checkouts can be read-only: do not try to write .pyc files there
sys.dont_write_bytecode = True

# COMMAND ----------

# Run tests based on suite parameter
if test_suite == "unit":
    test_path = "/Workspace/Repos/shared/pyspark-pipeline/tests/unit"
elif test_suite == "integration":
    test_path = "/Workspace/Repos/shared/pyspark-pipeline/tests/integration"
else:  # full
    test_path = "/Workspace/Repos/shared/pyspark-pipeline/tests"

# Execute tests
exit_code = pytest.main([
    test_path,
    "-v",
    "--tb=short",
    "-p", "no:cacheprovider",  # avoid writing .pytest_cache into the repo
    "--cov=/Workspace/Repos/shared/pyspark-pipeline/src",
    f"--cov-fail-under={coverage_threshold}",
    "--cov-report=term-missing",
    "--junitxml=test-results.xml"
])

# COMMAND ----------

# Handle test results
if exit_code == 0:
    print("✅ All tests passed!")
    dbutils.notebook.exit("SUCCESS")
else:
    print("❌ Tests failed!")
    # dbutils.notebook.exit() ends the run successfully whatever value is
    # passed, so raise instead to make the job run (and the CI step) fail.
    raise RuntimeError(f"Tests failed: pytest exit code {int(exit_code)}")
'''

# Build both job payloads, then show them as pretty-printed JSON followed by
# the raw text of the test-runner notebook template.
test_config = create_databricks_test_job_config()
quality_config = create_quality_gate_job_config()

print("📋 Databricks Test Job Configuration:")
print(json.dumps(test_config, indent=2))

print("\n📋 Quality Gate Job Configuration:")
print(json.dumps(quality_config, indent=2))

print("\n📝 Test Runner Notebook Template:")
print("=" * 60)
print(test_runner_notebook)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 3. Databricks Job Configuration for Testing
# MAGIC 
# MAGIC Let's create Databricks job configurations that can be used in CI/CD pipelines for running tests in the actual Databricks environment.

In [None]:
# COMMAND ----------

# GitHub Actions CI workflow, kept as a plain (non-f) string so that the
# ${{ ... }} expressions reach GitHub unchanged.
# Jobs run in sequence: quality-checks -> unit-tests -> integration-tests ->
# databricks-validation (main branch only).
github_actions_ci = """
name: PySpark Functional Tests CI

on:
  push:
    branches: [ main, develop ]
  pull_request:
    branches: [ main, develop ]

env:
  PYTHON_VERSION: '3.8'
  SPARK_VERSION: '3.3.0'
  JAVA_VERSION: '11'

jobs:
  quality-checks:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v3

    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: ${{ env.PYTHON_VERSION }}

    - name: Set up Java
      uses: actions/setup-java@v3
      with:
        java-version: ${{ env.JAVA_VERSION }}
        distribution: 'temurin'

    - name: Install dependencies
      run: |
        pip install -r requirements.txt
        pip install pytest pytest-cov black flake8 mypy

    - name: Code formatting check
      run: black --check src/ tests/

    - name: Linting
      run: flake8 src/ tests/

    - name: Type checking
      run: mypy src/

    - name: Security scan
      run: |
        pip install bandit
        bandit -r src/

  unit-tests:
    runs-on: ubuntu-latest
    needs: quality-checks
    strategy:
      matrix:
        python-version: ['3.8', '3.9', '3.10']

    steps:
    - uses: actions/checkout@v3

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}

    - name: Set up Java
      uses: actions/setup-java@v3
      with:
        java-version: ${{ env.JAVA_VERSION }}
        distribution: 'temurin'

    - name: Install dependencies
      run: |
        pip install -r requirements.txt
        pip install pytest pytest-cov pytest-xdist

    - name: Run unit tests
      run: |
        pytest tests/unit/ -v --cov=src --cov-report=xml --cov-report=term-missing

    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v3
      with:
        file: ./coverage.xml
        flags: unittests

  integration-tests:
    runs-on: ubuntu-latest
    needs: unit-tests

    steps:
    - uses: actions/checkout@v3

    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: ${{ env.PYTHON_VERSION }}

    - name: Set up Java
      uses: actions/setup-java@v3
      with:
        java-version: ${{ env.JAVA_VERSION }}
        distribution: 'temurin'

    - name: Install dependencies
      run: |
        pip install -r requirements.txt
        pip install pytest

    - name: Run integration tests
      run: |
        pytest tests/integration/ -v --tb=short

    - name: Generate test report
      run: |
        pytest tests/ --junitxml=test-results.xml

    # v3 of upload-artifact has been retired; always() keeps the report
    # available even when the test steps above fail.
    - name: Upload test results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: test-results
        path: test-results.xml

  databricks-validation:
    runs-on: ubuntu-latest
    needs: integration-tests
    if: github.ref == 'refs/heads/main'

    steps:
    - uses: actions/checkout@v3

    - name: Install Databricks CLI
      run: |
        pip install databricks-cli

    - name: Configure Databricks CLI
      run: |
        echo "[DEFAULT]" > ~/.databrickscfg
        echo "host = ${{ secrets.DATABRICKS_HOST }}" >> ~/.databrickscfg
        echo "token = ${{ secrets.DATABRICKS_TOKEN }}" >> ~/.databrickscfg

    - name: Upload notebooks to Databricks
      run: |
        databricks workspace import_dir notebooks /Repos/shared/pyspark-pipeline/notebooks --overwrite

    - name: Run Databricks tests
      run: |
        databricks runs submit --json-file .github/workflows/databricks-test-job.json
"""

print("📋 GitHub Actions CI Pipeline Configuration:")
print("=" * 60)
print(github_actions_ci)

# Azure DevOps pipeline definition, kept as a plain string so the $(...)
# macros reach Azure Pipelines unchanged.
# Stages run in sequence: QualityGates -> Testing -> DatabricksValidation
# (the last one only for refs/heads/main).
azure_devops_pipeline = """
# Azure DevOps Pipeline for PySpark Functional Tests
trigger:
  branches:
    include:
      - main
      - develop

pr:
  branches:
    include:
      - main
      - develop

variables:
  pythonVersion: '3.8'
  sparkVersion: '3.3.0'

stages:
- stage: QualityGates
  displayName: 'Code Quality Gates'
  jobs:
  - job: CodeQuality
    displayName: 'Code Quality Checks'
    pool:
      vmImage: 'ubuntu-latest'

    steps:
    - task: UsePythonVersion@0
      inputs:
        versionSpec: $(pythonVersion)
      displayName: 'Use Python $(pythonVersion)'

    - script: |
        pip install -r requirements.txt
        pip install black flake8 mypy bandit pytest
      displayName: 'Install dependencies'

    - script: |
        black --check src/ tests/
      displayName: 'Code formatting check'

    - script: |
        flake8 src/ tests/
      displayName: 'Linting'

    - script: |
        mypy src/
      displayName: 'Type checking'

    - script: |
        bandit -r src/
      displayName: 'Security scan'

- stage: Testing
  displayName: 'Testing'
  dependsOn: QualityGates
  jobs:
  - job: UnitTests
    displayName: 'Unit Tests'
    pool:
      vmImage: 'ubuntu-latest'

    steps:
    - task: UsePythonVersion@0
      inputs:
        versionSpec: $(pythonVersion)

    - script: |
        pip install -r requirements.txt
        pip install pytest pytest-cov pytest-azurepipelines
      displayName: 'Install dependencies'

    - script: |
        pytest tests/unit/ --cov=src --cov-report=xml --cov-report=html --junitxml=test-results.xml
      displayName: 'Run unit tests'

    - task: PublishTestResults@2
      inputs:
        testResultsFiles: 'test-results.xml'
        testRunTitle: 'Unit Tests'

    - task: PublishCodeCoverageResults@1
      inputs:
        codeCoverageTool: 'Cobertura'
        summaryFileLocation: 'coverage.xml'
        reportDirectory: 'htmlcov'

  - job: IntegrationTests
    displayName: 'Integration Tests'
    dependsOn: UnitTests
    pool:
      vmImage: 'ubuntu-latest'

    steps:
    - task: UsePythonVersion@0
      inputs:
        versionSpec: $(pythonVersion)

    - script: |
        pip install -r requirements.txt
        pip install pytest pytest-azurepipelines
      displayName: 'Install dependencies'

    - script: |
        pytest tests/integration/ --junitxml=integration-results.xml
      displayName: 'Run integration tests'

    - task: PublishTestResults@2
      inputs:
        testResultsFiles: 'integration-results.xml'
        testRunTitle: 'Integration Tests'

- stage: DatabricksValidation
  displayName: 'Databricks Validation'
  dependsOn: Testing
  condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main'))
  jobs:
  - job: DatabricksTests
    displayName: 'Databricks Environment Tests'
    pool:
      vmImage: 'ubuntu-latest'

    steps:
    - script: |
        pip install databricks-cli
      displayName: 'Install Databricks CLI'

    - script: |
        echo "[DEFAULT]" > ~/.databrickscfg
        echo "host = $(DATABRICKS_HOST)" >> ~/.databrickscfg
        echo "token = $(DATABRICKS_TOKEN)" >> ~/.databrickscfg
      displayName: 'Configure Databricks CLI'

    - script: |
        databricks workspace import_dir notebooks /Repos/shared/pyspark-pipeline/notebooks --overwrite
      displayName: 'Upload notebooks'

    - script: |
        databricks runs submit --json-file deployment/databricks-test-job.json
      displayName: 'Run Databricks tests'
"""

print("\n📋 Azure DevOps Pipeline Configuration:")
print("=" * 60)
print(azure_devops_pipeline)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 2. CI/CD Pipeline Configuration
# MAGIC 
# MAGIC Let's create CI/CD pipeline configurations that integrate functional testing with Databricks. We'll cover GitHub Actions, Azure DevOps, and other popular CI/CD platforms.

In [None]:
# COMMAND ----------

# Recommended project structure for PySpark functional programming with CI/CD
# Directory layout shown to the reader as a text tree.
project_structure = """
pyspark-data-project/
├── .github/
│   └── workflows/
│       ├── ci.yml              # CI pipeline configuration
│       ├── cd.yml              # CD pipeline configuration
│       └── quality-gates.yml   # Quality gate automation
├── src/
│   ├── __init__.py
│   ├── data_processing/
│   │   ├── __init__.py
│   │   ├── transformations.py  # Pure transformation functions
│   │   ├── validations.py      # Data validation functions
│   │   └── aggregations.py     # Aggregation functions
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── spark_utils.py      # Spark utility functions
│   │   ├── config.py           # Configuration management
│   │   └── logging_utils.py    # Logging utilities
│   └── jobs/
│       ├── __init__.py
│       ├── etl_pipeline.py     # Main ETL pipeline
│       └── data_quality_check.py
├── tests/
│   ├── __init__.py
│   ├── unit/
│   │   ├── __init__.py
│   │   ├── test_transformations.py
│   │   ├── test_validations.py
│   │   └── test_aggregations.py
│   ├── integration/
│   │   ├── __init__.py
│   │   ├── test_etl_pipeline.py
│   │   └── test_data_quality.py
│   ├── fixtures/
│   │   ├── __init__.py
│   │   └── test_data.py        # Test data generators
│   └── conftest.py            # Pytest configuration
├── config/
│   ├── dev.yml                # Development environment config
│   ├── staging.yml            # Staging environment config
│   └── prod.yml               # Production environment config
├── notebooks/
│   ├── exploratory/           # Data exploration notebooks
│   ├── experiments/           # ML experiment notebooks
│   └── demo/                  # Demo and documentation notebooks
├── scripts/
│   ├── run_tests.sh          # Test execution script
│   ├── deploy.sh             # Deployment script
│   └── quality_check.py      # Quality gate validation
├── requirements.txt          # Python dependencies
├── setup.py                  # Package setup configuration
├── pytest.ini               # Pytest configuration
├── .gitignore               # Git ignore patterns
└── README.md                # Project documentation
"""

print("📁 Recommended Project Structure:")
print(project_structure)

# Heading plus the five structural principles, printed one per line.
print("\n".join((
    "\n🎯 Key Structural Principles:",
    "1. ✅ Separation of Concerns - Clear module boundaries",
    "2. ✅ Testable Architecture - Easy unit and integration testing",
    "3. ✅ Environment Agnostic - Configuration externalized",
    "4. ✅ Functional Organization - Pure functions grouped logically",
    "5. ✅ CI/CD Ready - Standard structure for automation",
)))

# Configuration management for different environments
def create_environment_config() -> Dict[str, Dict[str, Any]]:
    """
    Build the per-environment configuration for dev, staging and prod.

    Every environment starts from the same defaults (``spark``, ``data``,
    ``quality``) and layers its own overrides on top, section by section.
    Each section of each environment is a newly created dict, so changing
    one environment's settings never leaks into another. Spreading the
    defaults with ``**`` alone would leave environments sharing the nested
    section dicts they do not override.

    Returns:
        A dict keyed by environment name (``"dev"``, ``"staging"``,
        ``"prod"``) whose values map section name to that section's
        settings.
    """
    base_config: Dict[str, Dict[str, Any]] = {
        "spark": {
            "app_name": "PySpark-Functional-Pipeline",
            "executor_memory": "2g",
            "driver_memory": "1g"
        },
        "data": {
            "input_format": "delta",
            "output_format": "delta"
        },
        "quality": {
            "enable_validation": True,
            "failure_threshold": 0.05
        }
    }

    # Only the settings that differ from (or extend) the defaults.
    overrides: Dict[str, Dict[str, Dict[str, Any]]] = {
        "dev": {
            "data": {
                "input_path": "/tmp/dev/input",
                "output_path": "/tmp/dev/output"
            },
            "spark": {
                "executor_instances": 1
            }
        },
        "staging": {
            "data": {
                "input_path": "/mnt/staging/input",
                "output_path": "/mnt/staging/output"
            },
            "spark": {
                "executor_instances": 3
            }
        },
        "prod": {
            "data": {
                "input_path": "/mnt/prod/input",
                "output_path": "/mnt/prod/output"
            },
            "spark": {
                "executor_instances": 10,
                "executor_memory": "4g"
            },
            "quality": {
                "failure_threshold": 0.01  # Stricter in production
            }
        }
    }

    # Merge section by section; {**a, **b} always creates a new dict, so no
    # section object is shared between environments or with base_config.
    environments = {
        env_name: {
            section: {**defaults, **env_overrides.get(section, {})}
            for section, defaults in base_config.items()
        }
        for env_name, env_overrides in overrides.items()
    }

    return environments

# Test the configuration function
configs = create_environment_config()
print("\n📋 Environment Configurations Created:")
# One summary line per environment: executor count and quality threshold.
for env, config in configs.items():
    executors = config["spark"]["executor_instances"]
    threshold = config["quality"]["failure_threshold"]
    print(f"   {env}: {executors} executors, {threshold} failure threshold")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 1. Project Structure for CI/CD
# MAGIC 
# MAGIC A well-organized project structure is crucial for effective CI/CD integration. Let's explore functional project organization patterns that work well with Databricks Repos and automated testing.

In [None]:
# COMMAND ----------

# Essential imports for CI/CD integration
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
import pyspark.sql.functions as F
from typing import Dict, List, Tuple, Optional, Callable, Any, Union
from functools import reduce, partial
import json
import os
import yaml
from datetime import datetime
import subprocess
import sys

# Obtain the Spark session for this notebook under the "CICDIntegration"
# application name.
spark = (
    SparkSession.builder
    .appName("CICDIntegration")
    .getOrCreate()
)

print("✅ Setup complete - Ready for CI/CD integration patterns!")

# Databricks notebook source
# MAGIC %md
# MAGIC # 3.3 Integrating Tests with Databricks Repos and CI/CD
# MAGIC 
# MAGIC This notebook demonstrates how to integrate functional PySpark tests with Databricks Repos and CI/CD pipelines. We'll explore version control, automated testing, and deployment strategies that maintain functional programming principles.
# MAGIC 
# MAGIC ## Learning Objectives
# MAGIC 
# MAGIC By the end of this notebook, you will understand how to:
# MAGIC - Structure PySpark projects for version control and CI/CD
# MAGIC - Configure Databricks Repos for collaborative development
# MAGIC - Create CI/CD pipelines with functional tests
# MAGIC - Implement deployment strategies with quality gates
# MAGIC - Handle environment-specific configurations functionally
# MAGIC - Monitor test execution and quality metrics
# MAGIC 
# MAGIC ## Prerequisites
# MAGIC 
# MAGIC - Understanding of Git version control
# MAGIC - Knowledge of CI/CD concepts
# MAGIC - Familiarity with Databricks Repos
# MAGIC - Experience with functional testing patterns