In [None]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator

# Module-level accumulator: the loop at the end of this cell appends every
# page yielded by the resource to it.
events_data: list = list()

@dlt.resource(name="events")
def paginated_getter(access_token=dlt.secrets.value):
    """Yield pages of events for the DataTalksClub/data-engineering-zoomcamp repo.

    Args:
        access_token: Token handed to ``BearerTokenAuth``. Defaults to
            ``dlt.secrets.value``.

    Yields:
        Every page produced by ``RESTClient.paginate("events")``, unchanged.
    """
    token_auth = BearerTokenAuth(token=access_token)
    link_paginator = HeaderLinkPaginator(links_next_key="next")
    github_client = RESTClient(
        base_url="https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp/",
        auth=token_auth,
        paginator=link_paginator,
    )

    # Forward each page as-is; no per-record unpacking happens here.
    yield from github_client.paginate("events")


# Drain the resource once: echo each page, then keep it in `events_data`
# so it can be inspected afterwards.
for events_page in paginated_getter():
    print(events_page)
    events_data.append(events_page)

In [None]:
import dlt

# Pipeline that loads into the "events" dataset of a DuckDB destination.
pipeline = dlt.pipeline(pipeline_name="github_data", destination="duckdb", dataset_name="events")

# Load the resource into the "events" table, replacing existing contents.
info = pipeline.run(
    paginated_getter,
    table_name="events",
    write_disposition="replace",
)

In [None]:
# Look the dataset schema up once, then show its data table names followed by
# the schema object itself.
events_schema = pipeline.dataset(dataset_type="default").schema
print(events_schema.data_table_names())
print(events_schema)

In [None]:
import numpy as np
import pandas as pd
# Display the loaded `events` table as a DataFrame (last expression of the cell).
default_dataset = pipeline.dataset(dataset_type="default")
default_dataset.events.df()

In [None]:
# Display the `events__payload__pull_request__base__repo__topics` table as a
# DataFrame (last expression of the cell).
topics_relation = pipeline.dataset(dataset_type="default").events__payload__pull_request__base__repo__topics
topics_relation.df()

In [None]:
# Join every event row to its rows in the topics table through the
# `_dlt_id` / `_dlt_parent_id` columns.
sql = """
select * 
from events e 
join events__payload__pull_request__base__repo__topics c
on e._dlt_id = c._dlt_parent_id
"""

# One `with` statement manages both the SQL client and the query cursor.
with pipeline.sql_client() as client, client.execute_query(sql) as cursor:
    data = cursor.df()

data

In [None]:
# Render the pipeline's default schema as YAML and print it.
schema_yaml = pipeline.default_schema.to_pretty_yaml()
print(schema_yaml)

In [None]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator

@dlt.resource(
    name="events",
    schema_contract={
        "tables": "evolve",
        "columns": "freeze",
        "data_type": "evolve"
    }
)
def paginated_getter(access_token=dlt.secrets.value):
    """Yield individual event records for the data-engineering-zoomcamp repo.

    The resource is declared with a schema contract of ``tables: evolve``,
    ``columns: freeze`` and ``data_type: evolve``.

    Args:
        access_token: Token handed to ``BearerTokenAuth``. Defaults to
            ``dlt.secrets.value``.

    Yields:
        One record at a time, taken from each page returned by
        ``RESTClient.paginate("/events")``.
    """
    github_client = RESTClient(
        base_url="https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp/",
        auth=BearerTokenAuth(token=access_token),
        paginator=HeaderLinkPaginator(links_next_key="next"),
    )

    # Flatten the pages so the resource emits single records, not page lists.
    for events_page in github_client.paginate("/events"):
        yield from events_page
            
# Separate pipeline and dataset for the schema-contract experiment.
contract_pipeline = dlt.pipeline(
    pipeline_name="github_data_contract", destination="duckdb", dataset_name="events_contract"
)

# Load the contract-guarded resource into "contract_events", replacing any
# existing contents, then print the resulting load info.
contract_info = contract_pipeline.run(
    paginated_getter(),
    table_name="contract_events",
    write_disposition="replace",
)
print(contract_info)

In [None]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator

@dlt.resource(name="events")
def evolved_paginated_getter(access_token=dlt.secrets.value):
    """Yield pages of events for the data-engineering-zoomcamp repo.

    Args:
        access_token: Token handed to ``BearerTokenAuth``. Defaults to
            ``dlt.secrets.value``.

    Yields:
        Every page produced by ``RESTClient.paginate("/events")``, unchanged.
    """
    repo_url = "https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp/"
    github_client = RESTClient(
        base_url=repo_url,
        auth=BearerTokenAuth(token=access_token),
        paginator=HeaderLinkPaginator(links_next_key="next"),
    )

    # Pages are forwarded whole, one per iteration of the paginator.
    yield from github_client.paginate("/events")

# Dedicated pipeline and dataset whose load info is inspected for schema changes.
evolved_pipeline = dlt.pipeline(
    pipeline_name="github_data_evolved", destination="duckdb", dataset_name="evolved_events"
)

# Load into the "events" table, replacing any existing contents.
load_info = evolved_pipeline.run(
    evolved_paginated_getter(),
    table_name="events",
    write_disposition="replace",
)

# Report the schema updates recorded in each load package.
# The table-level entries are printed once per table, followed by one line per
# column. Printing the whole `table` dict on every column line repeats all
# columns each time and makes the output grow quadratically.
for package in load_info.load_packages:
    for table_name, table in package.schema_update.items():
        # Everything in the table entry except the column map, shown once.
        table_hints = {key: value for key, value in table.items() if key != "columns"}
        print(f"Table {table_name}: {table_hints}")
        for column_name, column in table["columns"].items():
            print(f"    column {column_name}: {column}")

In [None]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator

@dlt.resource(name="events", write_disposition="replace")
def paginated_getter(access_token=dlt.secrets.value):
    """Yield pages of events for the data-engineering-zoomcamp repo.

    The resource is declared with ``write_disposition="replace"``.

    Args:
        access_token: Token handed to ``BearerTokenAuth``. Defaults to
            ``dlt.secrets.value``.

    Yields:
        Every page produced by ``RESTClient.paginate("/events")``, unchanged.
    """
    token_auth = BearerTokenAuth(token=access_token)
    github_client = RESTClient(
        base_url="https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp/",
        auth=token_auth,
        paginator=HeaderLinkPaginator(links_next_key="next"),
    )

    for events_page in github_client.paginate("/events"):
        yield events_page

# Pipeline for the write-disposition example; the disposition itself is set
# on the resource decorator, so `run` receives no extra arguments.
pipeline = dlt.pipeline(
    pipeline_name="github_pipeline", destination="duckdb", dataset_name="events_dataset"
)

load_info = pipeline.run(paginated_getter())

# Display the loaded `events` table as a DataFrame (last expression of the cell).
loaded_dataset = pipeline.dataset(dataset_type="default")
loaded_dataset.events.df()

In [1]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator

@dlt.resource(name="events", write_disposition="append")
def paginated_getter(
    access_token=dlt.secrets.value,
    cursor_date=dlt.sources.incremental("created_at", initial_value="2025-06-28"),
):
    """Yield pages of events for the data-engineering-zoomcamp repo.

    The resource is declared with ``write_disposition="append"``.

    Args:
        access_token: Token handed to ``BearerTokenAuth``. Defaults to
            ``dlt.secrets.value``.
        cursor_date: ``dlt.sources.incremental`` cursor on ``"created_at"``
            with initial value ``"2025-06-28"``. It is never referenced in
            the function body.
            NOTE(review): presumably dlt applies it to the yielded items on
            its own; confirm against the dlt incremental-loading docs.

    Yields:
        Every page produced by ``RESTClient.paginate("/events")``, unchanged.
    """
    github_client = RESTClient(
        base_url="https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp/",
        auth=BearerTokenAuth(token=access_token),
        paginator=HeaderLinkPaginator(links_next_key="next"),
    )

    yield from github_client.paginate("/events")

# Pipeline for the incremental example; it loads into "events_dataset2".
pipeline = dlt.pipeline(
    pipeline_name="github_pipeline", destination="duckdb", dataset_name="events_dataset2"
)

# The resource is passed uncalled, so its declared defaults are used.
load_info = pipeline.run(paginated_getter)
print(load_info)

# Display the loaded `events` table as a DataFrame (last expression of the cell).
incremental_dataset = pipeline.dataset(dataset_type="default")
incremental_dataset.events.df()

  import pkg_resources


Pipeline github_pipeline load step completed in 0.34 seconds
1 load package(s) were loaded to destination duckdb and into dataset events_dataset2
The duckdb destination used duckdb:////Users/su4ilische/src/datatalks.club/datatalks-de-python-ai/github_pipeline.duckdb location to store data
Load package 1752146025.0642612 is LOADED and contains no failed jobs


Unnamed: 0,id,type,actor__id,actor__login,actor__display_login,actor__gravatar_id,actor__url,actor__avatar_url,repo__id,repo__name,...,payload__pull_request__author_association,payload__pull_request__merged,payload__pull_request__mergeable_state,payload__pull_request__comments,payload__pull_request__review_comments,payload__pull_request__maintainer_can_modify,payload__pull_request__commits,payload__pull_request__additions,payload__pull_request__deletions,payload__pull_request__changed_files
0,51891923624,WatchEvent,182100393,httpmei,httpmei,,https://api.github.com/users/httpmei,https://avatars.githubusercontent.com/u/182100...,419661684,DataTalksClub/data-engineering-zoomcamp,...,,,,,,,,,,
1,51889549884,WatchEvent,58702367,Rajesh-B-Git,Rajesh-B-Git,,https://api.github.com/users/Rajesh-B-Git,https://avatars.githubusercontent.com/u/58702367?,419661684,DataTalksClub/data-engineering-zoomcamp,...,,,,,,,,,,
2,51888295485,ForkEvent,26826719,arturcast,arturcast,,https://api.github.com/users/arturcast,https://avatars.githubusercontent.com/u/26826719?,419661684,DataTalksClub/data-engineering-zoomcamp,...,,,,,,,,,,
3,51886447209,WatchEvent,152605325,Ace1169,Ace1169,,https://api.github.com/users/Ace1169,https://avatars.githubusercontent.com/u/152605...,419661684,DataTalksClub/data-engineering-zoomcamp,...,,,,,,,,,,
4,51883889505,WatchEvent,77841911,maureen-githaiga,maureen-githaiga,,https://api.github.com/users/maureen-githaiga,https://avatars.githubusercontent.com/u/77841911?,419661684,DataTalksClub/data-engineering-zoomcamp,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,51448186304,ForkEvent,75666702,justin2268,justin2268,,https://api.github.com/users/justin2268,https://avatars.githubusercontent.com/u/75666702?,419661684,DataTalksClub/data-engineering-zoomcamp,...,,,,,,,,,,
279,51447915485,WatchEvent,93468890,KungChinYeung,KungChinYeung,,https://api.github.com/users/KungChinYeung,https://avatars.githubusercontent.com/u/93468890?,419661684,DataTalksClub/data-engineering-zoomcamp,...,,,,,,,,,,
280,51447906875,ForkEvent,101667042,Sdossy,Sdossy,,https://api.github.com/users/Sdossy,https://avatars.githubusercontent.com/u/101667...,419661684,DataTalksClub/data-engineering-zoomcamp,...,,,,,,,,,,
281,51447712126,WatchEvent,82459963,smilewendy7,smilewendy7,,https://api.github.com/users/smilewendy7,https://avatars.githubusercontent.com/u/82459963?,419661684,DataTalksClub/data-engineering-zoomcamp,...,,,,,,,,,,


In [4]:
import dlt
from dlt.sources.sql_database import sql_database

# SQL source restricted to the single "family" table of the Rfam MySQL database.
source = sql_database(
    "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
    table_names=["family"],
)

# DuckDB pipeline created with dev_mode enabled.
pipeline = dlt.pipeline(
    pipeline_name="sql_database_example", destination="duckdb", dataset_name="sql_data", dev_mode=True
)

# Run the load and print the resulting load info.
load_info = pipeline.run(source)
print(load_info)

Pipeline sql_database_example load step completed in 0.71 seconds
1 load package(s) were loaded to destination duckdb and into dataset sql_data_20250710020433
The duckdb destination used duckdb:////Users/su4ilische/src/datatalks.club/datatalks-de-python-ai/sql_database_example.duckdb location to store data
Load package 1752156273.089791 is LOADED and contains no failed jobs
