From 3fd2c5ecf812c1b2a02ab084136cb3a76c1edc35 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 20:15:59 +0200 Subject: [PATCH] Link to RESTClient docs --- .../docs/tutorial/grouping-resources.md | 46 ++++--------------- .../docs/tutorial/load-data-from-an-api.md | 10 +++- 2 files changed, 18 insertions(+), 38 deletions(-) diff --git a/docs/website/docs/tutorial/grouping-resources.md b/docs/website/docs/tutorial/grouping-resources.md index 1bf95e1ddc..da27c4f95b 100644 --- a/docs/website/docs/tutorial/grouping-resources.md +++ b/docs/website/docs/tutorial/grouping-resources.md @@ -141,13 +141,14 @@ For the next step we'd want to get the [number of repository clones](https://doc Let's handle this by changing our `fetch_github_data()` first: ```py +from dlt.sources.helpers.rest_client.auth import BearerTokenAuth + def fetch_github_data(endpoint, params={}, access_token=None): - headers = {"Authorization": f"Bearer {access_token}"} if access_token else {} url = f"{BASE_GITHUB_URL}/{endpoint}" return paginate( url, params=params, - headers=headers, + auth=BearerTokenAuth(token=access_token) if access_token else None, ) @@ -200,28 +201,7 @@ access_token = "ghp_A...3aRY" Now we can run the script and it will load the data from the `traffic/clones` endpoint: ```py -import dlt -from dlt.sources.helpers import requests - -BASE_GITHUB_URL = "https://api.github.com/repos/dlt-hub/dlt" - - -def fetch_github_data(endpoint, params={}, access_token=None): - """Fetch data from GitHub API based on endpoint and params.""" - headers = {"Authorization": f"Bearer {access_token}"} if access_token else {} - - url = f"{BASE_GITHUB_URL}/{endpoint}" - - while True: - response = requests.get(url, params=params, headers=headers) - response.raise_for_status() - yield response.json() - - # get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] - +... 
@dlt.source def github_source( @@ -258,19 +238,12 @@ BASE_GITHUB_URL = "https://api.github.com/repos/{repo_name}" def fetch_github_data(repo_name, endpoint, params={}, access_token=None): """Fetch data from GitHub API based on repo_name, endpoint, and params.""" - headers = {"Authorization": f"Bearer {access_token}"} if access_token else {} - url = BASE_GITHUB_URL.format(repo_name=repo_name) + f"/{endpoint}" - - while True: - response = requests.get(url, params=params, headers=headers) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + return paginate( + url, + params=params, + auth=BearerTokenAuth(token=access_token) if access_token else None, + ) @dlt.source @@ -318,5 +291,6 @@ Interested in learning more? Here are some suggestions: - [Pass config and credentials into your sources and resources](../general-usage/credentials). - [Run in production: inspecting, tracing, retry policies and cleaning up](../running-in-production/running). - [Run resources in parallel, optimize buffers and local storage](../reference/performance.md) + - [Use REST API client helpers](../general-usage/http/rest-client.md) to simplify working with REST APIs. 3. Check out our [how-to guides](../walkthroughs) to get answers to some common questions. 4. Explore the [Examples](../examples) section to see how dlt can be used in real-world scenarios diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index 4c2827d0cf..ec6136b6d3 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -150,9 +150,9 @@ and `updated_at.last_value` to tell GitHub to return issues updated only **after ## Using pagination helper -In the previous examples, we used the `requests` library to make HTTP requests to the GitHub API and handled pagination manually. 
`dlt` provides a helper function `paginate` that simplifies this process. The `paginate` function takes a URL and optional parameters (quite similar to `requests`) and returns a generator that yields pages of data. +In the previous examples, we used the `requests` library to make HTTP requests to the GitHub API and handled pagination manually. `dlt` has a built-in [REST client](../general-usage/http/rest-client.md) that simplifies API requests. We'll pick the `paginate()` helper from it for the next example. The `paginate` function takes a URL and optional parameters (quite similar to `requests`) and returns a generator that yields pages of data. -Let's rewrite the previous example using the `paginate` helper: +Here's how the updated script looks: ```py import dlt @@ -191,6 +191,12 @@ print("------") print(load_info) ``` +Let's zoom in on the changes: + +1. The `while` loop that handled pagination is replaced with reading pages from the `paginate()` generator. +2. `paginate()` takes the URL of the API endpoint and optional parameters. In this case, we pass the `since` parameter to get only issues updated after the last pipeline run. +3. We're not explicitly setting up pagination; `paginate()` handles it for us. Magic! Under the hood, `paginate()` analyzes the response and detects the pagination method used by the API. Read more about pagination in the [REST client documentation](../general-usage/http/rest-client.md#paginating-api-responses). + ## Next steps Continue your journey with the [Resource Grouping and Secrets](grouping-resources) tutorial.