databricks · susodapop · Jan 18, 2024 · May 9, 2023 · May 9, 2023 · May 10, 2023
@@ -1,6 +1,8 @@
 
 ## dbt-databricks 1.5.2 TBD
 
+- Fix: Python model submissions now always authenticate with the auth token from `profiles.yml` and ignore any `.netrc` file (issue #337)
+
 ### Features
 - Added support for model contracts ([#336](https://github.com/databricks/dbt-databricks/pull/336))
 

@@ -1,4 +1,4 @@
-from typing import Any, Dict, Tuple, Optional, Callable
+from typing import Any, Dict, Tuple, Optional, Callable, Union
 
 from dbt.adapters.databricks.__version__ import version
 from dbt.adapters.databricks.connections import DatabricksCredentials
@@ -12,7 +12,7 @@
 import dbt.exceptions
 from dbt.adapters.base import PythonJobHelper
 from dbt.adapters.spark import __version__
-from databricks.sdk.core import CredentialsProvider
+from databricks.sdk.core import CredentialsProvider, HeaderFactory
 
 logger = AdapterLogger("Databricks")
 
@@ -22,6 +22,22 @@
 DBT_SPARK_VERSION = __version__.version
 
 
+class BearerAuth(requests.auth.AuthBase):
+    """See issue #337.
+
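+    Passing an instance as `auth=` stops requests from implicitly reading .netrc.
+    The header factory is called once, at construction; its result is reused.
+    Solution taken from the Stack Overflow post in the issue description.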
+    """
+
+    def __init__(self, headers: HeaderFactory):
+        self.headers = headers()
+
+    def __call__(self, r: requests.PreparedRequest) -> requests.PreparedRequest:
+        r.headers.update(**self.headers)
+        return r
+
+
 class BaseDatabricksHelper(PythonJobHelper):
     def __init__(self, parsed_model: Dict, credentials: DatabricksCredentials) -> None:
         self.credentials = credentials
@@ -31,11 +47,12 @@ def __init__(self, parsed_model: Dict, credentials: DatabricksCredentials) -> No
         self.timeout = self.get_timeout()
         self.polling_interval = DEFAULT_POLLING_INTERVAL
         self.check_credentials()
-        self.auth_header = {
-            "Authorization": f"Bearer {self.credentials.token}",
+        self.extra_headers = {
             "User-Agent": f"dbt-labs-dbt-spark/{DBT_SPARK_VERSION} (Databricks)",
         }
 
+        self.auth: Union[BearerAuth, None] = None
+
     @property
     def cluster_id(self) -> str:
         return self.parsed_model["config"].get("cluster_id", self.credentials.cluster_id)
@@ -54,7 +71,8 @@ def check_credentials(self) -> None:
     def _create_work_dir(self, path: str) -> None:
         response = requests.post(
             f"https://{self.credentials.host}/api/2.0/workspace/mkdirs",
-            headers=self.auth_header,
+            auth=self.auth,
+            headers=self.extra_headers,
             json={
                 "path": path,
             },
@@ -68,7 +86,8 @@ def _upload_notebook(self, path: str, compiled_code: str) -> None:
         b64_encoded_content = base64.b64encode(compiled_code.encode()).decode()
         response = requests.post(
             f"https://{self.credentials.host}/api/2.0/workspace/import",
-            headers=self.auth_header,
+            auth=self.auth,
+            headers=self.extra_headers,
             json={
                 "path": path,
                 "content": b64_encoded_content,
@@ -102,7 +121,8 @@ def _submit_job(self, path: str, cluster_spec: dict) -> str:
         job_spec.update({"libraries": libraries})  # type: ignore
         submit_response = requests.post(
             f"https://{self.credentials.host}/api/2.1/jobs/runs/submit",
-            headers=self.auth_header,
+            auth=self.auth,
+            headers=self.extra_headers,
             json=job_spec,
         )
         if submit_response.status_code != 200:
@@ -126,7 +146,8 @@ def _submit_through_notebook(self, compiled_code: str, cluster_spec: dict) -> No
             status_func=requests.get,
             status_func_kwargs={
                 "url": f"https://{self.credentials.host}/api/2.1/jobs/runs/get?run_id={run_id}",
-                "headers": self.auth_header,
+                "auth": self.auth,
+                "headers": self.extra_headers,
             },
             get_state_func=lambda response: response.json()["state"]["life_cycle_state"],
             terminal_states=("TERMINATED", "SKIPPED", "INTERNAL_ERROR"),
@@ -137,7 +158,8 @@ def _submit_through_notebook(self, compiled_code: str, cluster_spec: dict) -> No
         # get end state to return to user
         run_output = requests.get(
             f"https://{self.credentials.host}" f"/api/2.1/jobs/runs/get-output?run_id={run_id}",
-            headers=self.auth_header,
+            auth=self.auth,
+            headers=self.extra_headers,
         )
         json_run_output = run_output.json()
         result_state = json_run_output["metadata"]["state"]["result_state"]
@@ -197,9 +219,14 @@ def submit(self, compiled_code: str) -> None:
 
 class DBContext:
     def __init__(
-        self, credentials: DatabricksCredentials, cluster_id: str, auth_header: dict
+        self,
+        credentials: DatabricksCredentials,
+        cluster_id: str,
+        auth: Union[BearerAuth, None],
+        extra_headers: dict,
     ) -> None:
-        self.auth_header = auth_header
+        self.auth = auth
+        self.extra_headers = extra_headers
         self.cluster_id = cluster_id
         self.host = credentials.host
 
@@ -214,7 +241,8 @@ def create(self) -> str:
 
         response = requests.post(
             f"https://{self.host}/api/1.2/contexts/create",
-            headers=self.auth_header,
+            auth=self.auth,
+            headers=self.extra_headers,
             json={
                 "clusterId": self.cluster_id,
                 "language": SUBMISSION_LANGUAGE,
@@ -230,7 +258,8 @@ def destroy(self, context_id: str) -> str:
         # https://docs.databricks.com/dev-tools/api/1.2/index.html#delete-an-execution-context
         response = requests.post(
             f"https://{self.host}/api/1.2/contexts/destroy",
-            headers=self.auth_header,
+            auth=self.auth,
+            headers=self.extra_headers,
             json={
                 "clusterId": self.cluster_id,
                 "contextId": context_id,
@@ -248,7 +277,8 @@ def get_cluster_status(self) -> Dict:
 
         response = requests.get(
             f"https://{self.host}/api/2.0/clusters/get",
-            headers=self.auth_header,
+            auth=self.auth,
+            headers=self.extra_headers,
             json={"cluster_id": self.cluster_id},
         )
         if response.status_code != 200:
@@ -271,7 +301,8 @@ def start_cluster(self) -> None:
 
         response = requests.post(
             f"https://{self.host}/api/2.0/clusters/start",
-            headers=self.auth_header,
+            auth=self.auth,
+            headers=self.extra_headers,
             json={"cluster_id": self.cluster_id},
         )
         if response.status_code != 200:
@@ -300,17 +331,23 @@ def get_elapsed() -> float:
 
 class DBCommand:
     def __init__(
-        self, credentials: DatabricksCredentials, cluster_id: str, auth_header: dict
+        self,
+        credentials: DatabricksCredentials,
+        cluster_id: str,
+        auth: Union[BearerAuth, None],
+        extra_headers: dict,
     ) -> None:
-        self.auth_header = auth_header
+        self.auth = auth
+        self.extra_headers = extra_headers
         self.cluster_id = cluster_id
         self.host = credentials.host
 
     def execute(self, context_id: str, command: str) -> str:
         # https://docs.databricks.com/dev-tools/api/1.2/index.html#run-a-command
         response = requests.post(
             f"https://{self.host}/api/1.2/commands/execute",
-            headers=self.auth_header,
+            auth=self.auth,
+            headers=self.extra_headers,
             json={
                 "clusterId": self.cluster_id,
                 "contextId": context_id,
@@ -328,7 +365,8 @@ def status(self, context_id: str, command_id: str) -> Dict[str, Any]:
         # https://docs.databricks.com/dev-tools/api/1.2/index.html#get-information-about-a-command
         response = requests.get(
             f"https://{self.host}/api/1.2/commands/status",
-            headers=self.auth_header,
+            auth=self.auth,
+            headers=self.extra_headers,
             params={
                 "clusterId": self.cluster_id,
                 "contextId": context_id,
@@ -354,8 +392,8 @@ def submit(self, compiled_code: str) -> None:
         if self.parsed_model["config"].get("create_notebook", False):
             self._submit_through_notebook(compiled_code, {"existing_cluster_id": self.cluster_id})
         else:
-            context = DBContext(self.credentials, self.cluster_id, self.auth_header)
-            command = DBCommand(self.credentials, self.cluster_id, self.auth_header)
+            context = DBContext(self.credentials, self.cluster_id, self.auth, self.extra_headers)
+            command = DBCommand(self.credentials, self.cluster_id, self.auth, self.extra_headers)
             context_id = context.create()
             try:
                 command_id = command.execute(context_id, compiled_code)
@@ -404,9 +442,9 @@ def __init__(self, parsed_model: Dict, credentials: DatabricksCredentials) -> No
         )
         self._credentials_provider = credentials.authenticate(self._credentials_provider)
         header_factory = self._credentials_provider()
-        headers = header_factory()
+        self.auth = BearerAuth(header_factory)
 
-        self.auth_header.update({"User-Agent": user_agent, **http_headers, **headers})
+        self.extra_headers.update({"User-Agent": user_agent, **http_headers})
 
     @property
     def cluster_id(self) -> Optional[str]:  # type: ignore[override]