In [1]:
import flyte
import flyte.storage

In [2]:
# Initialise flyte from its config, with S3 (region us-east-2) as the storage backend.
flyte.init_from_config(
    storage=flyte.storage.S3.auto(region="us-east-2"),
)

In [4]:
# Task environment "pd_data": a Debian base image with pandas and pyarrow pip-installed
# (pyarrow is presumably there for the parquet serialization of DataFrames - confirm).
env = flyte.TaskEnvironment(
    name="pd_data",
    image=flyte.Image.from_debian_base().with_pip_packages("pandas", "pyarrow"),
)

In [5]:
import pandas as pd

# Column-oriented sample data for eight employees, in the shape pd.DataFrame accepts:
# each key is a column name and each value holds that column's eight entries.
BASIC_EMPLOYEE_DATA = {
    # Consecutive ids 1001..1008 (the upper bound of range() is exclusive).
    "employee_id": range(1001, 1009),
    "name": [
        "Alice", "Bob", "Charlie", "Diana",
        "Ethan", "Fiona", "George", "Hannah",
    ],
    "department": [
        "HR", "Engineering", "Engineering", "Marketing",
        "Finance", "Finance", "HR", "Engineering",
    ],
    # ISO date strings parsed up front so the column holds datetimes rather than strings.
    "hire_date": pd.to_datetime(
        [
            "2018-01-15", "2019-03-22", "2020-07-10", "2017-11-01",
            "2021-06-05", "2018-09-13", "2022-01-07", "2020-12-30",
        ]
    ),
}

In [6]:
@env.task
async def create_raw_dataframe() -> pd.DataFrame:
    """
    Build a raw pandas DataFrame of basic employee information and return it.

    This is the most basic way to pass a dataframe (of any kind, not just pandas) out of a task: create the
    dataframe as normal and return it. The return annotation is the dataframe library's own type.

    No upload is done explicitly in the task body. The actual bits of the dataframe (serialized to parquet for
    pandas) are uploaded at the end of the task, when the TypeEngine writes them from memory to the blob store.

    Returns:
        pd.DataFrame: a frame constructed from ``BASIC_EMPLOYEE_DATA``.
    """
    employees = pd.DataFrame(BASIC_EMPLOYEE_DATA)
    return employees

In [7]:
# Launch the task; `r` is the handle to the resulting run and is used by the cells below.
r = flyte.run(create_raw_dataframe)

In [8]:
# Web URL of this run (it embeds the domain, project and run name).
r.url

'https://demo.hosted.unionai.cloud/v2/domain/development/project/flytesnacks/runs/rc74pd2b9pxcpjhvxdrf'

In [9]:
# Wait on the run before reading its inputs/outputs
# (presumably blocks until the run reaches a terminal state - confirm against the flyte docs).
r.wait()

Output()

In [10]:
# create_raw_dataframe takes no parameters, so the run's inputs are empty.
r.inputs()

{}

In [11]:
# Fetch the run's outputs; kept in a variable so they can be inspected and indexed below.
outputs = r.outputs()

In [12]:
# Inspect the underlying protobuf: the DataFrame comes back as a structured-dataset literal
# (an S3 URI plus format "parquet"), i.e. a reference to the uploaded data rather than inline rows.
outputs.pb2

literals {
  name: "o0"
  value {
    scalar {
      structured_dataset {
        uri: "s3://union-oc-production-demo/r7/demo/flytesnacks/development/rc74pd2b9pxcpjhvxdrf/a0/1/32/rc74pd2b9pxcpjhvxdrf-a0-0/0ef5920580528992568fc728a4ee6c2b"
        metadata {
          structured_dataset_type {
            format: "parquet"
          }
        }
      }
    }
  }
}

In [13]:
# Positional access to the task's single return value (literal "o0" in the protobuf above).
# NOTE(review): presumably this materializes a pandas DataFrame from the parquet data - confirm.
df = outputs[0]

In [None]:
# Display the retrieved value; it is expected to be the small employee table, so showing it whole is fine.
df