In [38]:
import os

from omnipy import (HttpUrlDataset,
                    HttpUrlModel,
                    JsonDataset,
                    LinearFlowTemplate,
                    load_urls_into_new_dataset,
                    async_load_urls_into_new_dataset,
                    PandasDataset,
                    runtime,
                    StrDataset,
                    TaskTemplate)

# Set omnipy's `data.terminal_size_*` runtime config to 500 lines by 100 columns.
# NOTE(review): presumably this bounds how much of a dataset is rendered when it
# is displayed in the notebook — confirm against the omnipy documentation.
runtime.config.data.terminal_size_lines = 500
runtime.config.data.terminal_size_columns = 100


In [13]:
# Google Sheets API key, sent as the `key` query parameter by the URL-building
# tasks in this notebook. Prefer setting the GOOGLE_API_KEY environment variable
# over pasting the key here, so the secret is never saved with the notebook; the
# placeholder below is used only when the variable is unset.
# To get a Google Sheets API key, follow the instructions e.g. here:
# https://handsondataviz.org/google-sheets-api-key.html
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY', 'PUT YOUR API KEY HERE')

In [14]:

@TaskTemplate
def generate_google_spreadsheet_url(title: str, spreadsheet_id: str) -> HttpUrlDataset:
    """Build the Google Sheets API v4 URL for a whole spreadsheet.

    Args:
        title: Name under which the URL is stored in the returned dataset.
        spreadsheet_id: Id of the Google spreadsheet, used as the last path segment.

    Returns:
        An HttpUrlDataset with a single entry mapping `title` to the URL. The
        URL carries the module-level GOOGLE_API_KEY as its `key` query parameter.
    """
    spreadsheet_url = HttpUrlModel('https://sheets.googleapis.com/')
    # The result of `//` is not assigned; presumably the operator extends the
    # path in place — TODO confirm against the omnipy HttpUrlModel docs.
    spreadsheet_url.path // 'v4' // 'spreadsheets' // spreadsheet_id
    spreadsheet_url.query['key'] = GOOGLE_API_KEY
    return HttpUrlDataset({title: spreadsheet_url})


@TaskTemplate
def extract_titles_from_google_spreadsheet(data: JsonDataset) -> StrDataset:
    """Collect the sheet titles listed in a spreadsheet's JSON description.

    Only the first data file of `data` is read; it is expected to hold a
    'sheets' list whose items each have a ['properties']['title'] entry.

    Returns:
        A StrDataset in which every sheet title is used both as the data file
        name and as the value.
    """
    sheet_entries = data[0]['sheets']
    sheet_titles = [entry['properties']['title'] for entry in sheet_entries]
    return StrDataset(zip(sheet_titles, sheet_titles))


@TaskTemplate(iterate_over_data_files=True, output_dataset_cls=HttpUrlDataset)
def generate_google_single_sheet_urls(sheet_title: str) -> HttpUrlModel:
    """Build the Google Sheets API v4 `values` URL for one sheet.

    Declared with `iterate_over_data_files=True` and
    `output_dataset_cls=HttpUrlDataset`, so the task is applied per data file
    and the results are gathered into an HttpUrlDataset.

    NOTE(review): `spreadsheet_id` is not a parameter; it is looked up in the
    notebook's global scope when the task runs, so a global of that name must
    exist and must refer to the spreadsheet the titles were read from.

    Args:
        sheet_title: Title of the sheet, used as the last path segment.

    Returns:
        The URL, carrying the module-level GOOGLE_API_KEY as its `key` query
        parameter.
    """
    sheet_url = HttpUrlModel('https://sheets.googleapis.com/')
    # The result of `//` is not assigned; presumably the operator extends the
    # path in place — TODO confirm against the omnipy HttpUrlModel docs.
    sheet_url.path // 'v4' // 'spreadsheets' // spreadsheet_id // 'values' // sheet_title
    sheet_url.query['key'] = GOOGLE_API_KEY
    return sheet_url


@TaskTemplate
def convert_spreadsheets_to_pandas(data: JsonDataset) -> PandasDataset:
    """Turn downloaded sheet JSON into a PandasDataset of the sheets' 'values'.

    The first data file is skipped (`data[1:]`), and so is any entry that has
    no 'values' key.
    NOTE(review): the reason for skipping the first sheet is not visible here —
    confirm it is intentional.
    """
    tables = {}
    for sheet_name, sheet_json in data[1:].items():
        if 'values' in sheet_json:
            tables[sheet_name] = sheet_json['values']
    return PandasDataset(tables)


In [39]:
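# Declarative LinearFlowTemplate version of the same pipeline (using the
# non-async loader), kept commented out for reference:
# @LinearFlowTemplate(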
#     generate_google_spreadsheet_url,
#     load_urls_into_new_dataset,
#     extract_titles_from_google_spreadsheet,
#     generate_google_single_sheet_urls,
#     load_urls_into_new_dataset,
#     convert_spreadsheets_to_pandas,
# )
# def download_all_sheets_from_google_spreadsheet(title: str, spreadsheet_id: str) -> JsonDataset:
#     ...

async def download_all_sheets_from_google_spreadsheet(title: str, spreadsheet_id: str) -> PandasDataset:
    """Download every sheet of a Google spreadsheet as pandas tables.

    Runs the tasks in sequence: build the spreadsheet URL, fetch it, extract
    the sheet titles, build one URL per sheet, fetch those, and convert the
    result with convert_spreadsheets_to_pandas.

    NOTE(review): generate_google_single_sheet_urls reads `spreadsheet_id`
    from the notebook's global scope, not from this function's argument, so
    the global must hold the same id that is passed in here.

    Args:
        title: Name under which the spreadsheet URL is stored in the first dataset.
        spreadsheet_id: Id of the Google spreadsheet to download.

    Returns:
        The PandasDataset produced by convert_spreadsheets_to_pandas.
    """
    # Honour the caller's arguments instead of shadowing them with fixed values.
    main_url = generate_google_spreadsheet_url.run(title, spreadsheet_id)
    spreadsheet_json = await async_load_urls_into_new_dataset.run(main_url)
    titles = extract_titles_from_google_spreadsheet.run(spreadsheet_json)
    sheet_urls = generate_google_single_sheet_urls.run(titles)
    sheet_contents = await async_load_urls_into_new_dataset.run(sheet_urls)
    return convert_spreadsheets_to_pandas.run(sheet_contents)

In [40]:
dataset = await download_all_sheets_from_google_spreadsheet(spreadsheet_title, spreadsheet_id)

[OMNIPY ] Thu Nov  7 13:32:31 2024 - INFO: Initialized "task-generate-google-spreadsheet-url-scrupulous-piculet" (omnipy.hub.registry.RunStateRegistry)
[OMNIPY ] Thu Nov  7 13:32:31 2024 - INFO: Started running "task-generate-google-spreadsheet-url-scrupulous-piculet"... (omnipy.hub.registry.RunStateRegistry)
[OMNIPY ] Thu Nov  7 13:32:31 2024 - INFO: Finished running "task-generate-google-spreadsheet-url-scrupulous-piculet"! (omnipy.hub.registry.RunStateRegistry)
[OMNIPY ] Thu Nov  7 13:32:31 2024 - INFO: Writing dataset as a gzipped tarpack to "/Users/sveinugu/PycharmProjects/omnipy/docs/notebooks/outputs/2024_11_07-13_32_31/00_task_generate_google_spreadsheet_url.tar.gz" (omnipy.compute.task.TaskWithMixins)
[OMNIPY ] Thu Nov  7 13:32:31 2024 - INFO: Initialized "task-async-load-urls-into-new-dataset-benevolent-husky" (omnipy.hub.registry.RunStateRegistry)
[OMNIPY ] Thu Nov  7 13:32:31 2024 - INFO: Started running "task-async-load-urls-into-new-dataset-benevolent-husky"... (omnipy.hu

  dataset = await download_all_sheets_from_google_spreadsheet(spreadsheet_title, spreadsheet_id)


In [41]:
# Display the downloaded dataset; the notebook renders it as a summary table
# with one row per data file.
dataset

[ASYNCIO] Thu Nov  7 13:32:35 2024 - ERROR: Unclosed client session
client_session: <omnipy.modules.remote.helpers.RateLimitingClientSession object at 0x104072ad0> (asyncio)
[ASYNCIO] Thu Nov  7 13:32:35 2024 - ERROR: Unclosed client session
client_session: <omnipy.modules.remote.helpers.RateLimitingClientSession object at 0x32c5564d0> (asyncio)


╭─────┬──────────────────┬─────────────┬──────────┬────────────────────╮
│   # │ Data file name   │ Type        │   Length │ Size (in memory)   │
├─────┼──────────────────┼─────────────┼──────────┼────────────────────┤
│   0 │ Schema_New       │ PandasModel │       64 │ 102.2 kB           │
│   1 │ UPLOAD           │ PandasModel │       55 │ 98.4 kB            │
│   2 │ DToL Fields      │ PandasModel │       78 │ 69.6 kB            │
╰─────┴──────────────────┴─────────────┴──────────┴────────────────────╯