# Azure ML Dataset
The following code illustrates how to create datasets for consumption by AML experiments.

There are 3 types of datasets in AML:

1. Table
2. File
3. Folder


More details here: https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-data-assets?view=azureml-api-2&tabs=cli

## MLTable
This is the most advanced option: besides the data itself, it also provides advanced capabilities such as versioning and schema definition.

There are many ways to create an MLTable; several of them are illustrated below.


#### MLTable - CSV
The following sample code illustrates how to create an MLTable from a CSV file, whether it is a local file or one stored in a remote location.

In [None]:
!pip install -r requirements.txt

In [None]:
# Identify the target Azure ML workspace; replace each placeholder with your
# own value before running the cells below, which pass them to MLClient.
subscription_id, resource_group, workspace = (
    "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",  # subscription_id
    "RESOURCE_GROUP_NAME",  # resource_group
    "WORKSPACE_NAME",  # workspace
)

In [None]:
# Build an MLTable from the Titanic CSV sample hosted on Azure and preview it

import mltable
from mltable import MLTableHeaders, MLTableFileEncoding, DataType

# location of the source data file
paths = [{"file": "wasbs://data@azuremlexampledata.blob.core.windows.net/titanic.csv"}]

# read the CSV: comma delimited, same header row in every file, column types
# inferred, no extra source-path column, UTF-8 encoded
tbl = mltable.from_delimited_files(
    paths=paths,
    encoding=MLTableFileEncoding.utf8,
    delimiter=",",
    header=MLTableHeaders.all_files_same_headers,
    include_path_column=False,
    infer_column_types=True,
)

# Optional alternative (disabled): declare every column type explicitly with
# the DataType.to_*() helpers instead of relying on inference.
# The sample is wrapped in a string literal so that it does not execute.
"""
column_types = {
    "PassengerId": DataType.to_int(),
    "Survived": DataType.to_int(),
    "Pclass": DataType.to_int(),
    "Name": DataType.to_string(),
    "Sex": DataType.to_string(),
    "Age": DataType.to_int(),
    "SibSp": DataType.to_int(),
    "Parch": DataType.to_int(),
    "Ticket": DataType.to_string(),
    "Fare": DataType.to_float(),
    "Cabin": DataType.to_string(),
    "Embarked": DataType.to_string(),
}
tbl = tbl.convert_column_types(column_types)
"""

# the Survived column should be treated as a boolean; list the raw values
# that count as true and as false
data_types = {
    "Survived": DataType.to_bool(
        false_values=["False", "false", "0"],
        true_values=["True", "true", "1"],
    )
}

# apply the transformations in order:
#   1. keep rows with Age > 0 (intended to drop rows whose age is undefined)
#   2. drop the PassengerId column
#   3. convert Survived to boolean
tbl = (
    tbl.filter("col('Age') > 0")
    .drop_columns(["PassengerId"])
    .convert_column_types(data_types)
)

# preview the first 5 records
tbl.show(5)

In [None]:
# Optional - materialize the table as a pandas DataFrame and preview it

df = tbl.to_pandas_dataframe()

# first 5 rows; as the last expression of the cell it is rendered as output
df.head(n=5)

In [None]:
# Save the data loading steps built above (the row filter, the dropped column
# and the type conversion) under the ./titanic folder.
# The next cell prints the resulting ./titanic/MLTable file to verify it.

tbl.save("./titanic")

In [None]:
# Verify the save step: print the contents of the generated MLTable file

with open("./titanic/MLTable", mode="r") as mltable_file:
    print(mltable_file.read())

In [None]:
# Reproduce the data loading steps: rebuild `tbl` from the MLTable definition
# saved under ./titanic instead of repeating the transformation code above.

import mltable

# load the previously saved MLTable file from the ./titanic folder
tbl = mltable.load("./titanic/")

In [None]:
# Create the MLTable data asset: register the ./titanic folder in the workspace

import time

from azure.ai.ml import MLClient
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Data
from azure.identity import DefaultAzureCredential

# version string derived from the current UTC time; the next cell reuses
# VERSION to fetch this exact asset version
VERSION = time.strftime("%Y.%m.%d.%H%M%S", time.gmtime())

# connect to the AzureML workspace configured at the top of the notebook
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

# describe the asset: an MLTABLE-typed asset backed by the local folder
my_data = Data(
    name="titanic-cloud-example",
    version=VERSION,
    description="The titanic dataset.",
    type=AssetTypes.MLTABLE,
    path="./titanic",
)

# create (or update) the asset; the returned object is shown as cell output
ml_client.data.create_or_update(my_data)

In [None]:
# Read the registered table back in the development environment

import mltable
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# connect to the AzureML workspace configured at the top of the notebook
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

# fetch the data asset at the version registered by the previous code cell
# Note: VERSION must still be defined in this session.
data_asset = ml_client.data.get(name="titanic-cloud-example", version=VERSION)

# build a table from the asset id, addressed through the azureml:/ prefix
tbl = mltable.load(f"azureml:/{data_asset.id}")

# materialize as pandas and preview the first 5 rows
df = tbl.to_pandas_dataframe()
df.head(n=5)

#### MLTable - Parquet
The following sample code shows how to create an MLTable from Parquet files.

In [None]:
# Download sample data

import mltable

# glob the parquet file paths for years 2015-19, all months.
# One {"pattern": ...} entry is generated per year from a single URL template,
# so the year range can be changed in one place (range end is exclusive).
paths = [
    {
        "pattern": (
            "wasbs://nyctlc@azureopendatastorage.blob.core.windows.net"
            f"/green/puYear={year}/puMonth=*/*.parquet"
        )
    }
    for year in range(2015, 2020)
]

# create a table from the parquet paths, then apply the transformations in
# order:
#   1. take a random sample (probability 0.001, seed 735)
#   2. keep only trips with a distance > 0
#   3. drop the columns that are not needed
#   4. create two new columns - year and month - whose values are taken from
#      the puYear=/puMonth= segments of the file path
tbl = (
    mltable.from_parquet_files(paths)
    .take_random_sample(probability=0.001, seed=735)
    .filter("col('tripDistance') > 0")
    .drop_columns(["puLocationId", "doLocationId", "storeAndFwdFlag"])
    .extract_columns_from_partition_format("/puYear={year}/puMonth={month}")
)

# print the first 5 records of the table as a check
tbl.show(5)

In [None]:
# serialize the above data loading steps (sampling, filter, dropped columns,
# partition-derived columns) into an MLTable file under ./nyc_taxi;
# the next cell prints ./nyc_taxi/MLTable to inspect the result
tbl.save("./nyc_taxi")

In [None]:
# View the MLTable: print the contents of the file written by the save step
with open("./nyc_taxi/MLTable", mode="r") as mltable_file:
    print(mltable_file.read())

In [None]:
# Reproduce the data loading steps: rebuild `tbl` from the MLTable definition
# saved under ./nyc_taxi instead of repeating the transformation code above.

import mltable

# load the previously saved MLTable file from the ./nyc_taxi folder
tbl = mltable.load("./nyc_taxi/")

In [None]:
# Create the MLTable data asset: register the ./nyc_taxi folder in the workspace

import time

from azure.ai.ml import MLClient
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Data
from azure.identity import DefaultAzureCredential

# version string derived from the current UTC time; the next cell reuses
# VERSION to fetch this exact asset version
VERSION = time.strftime("%Y.%m.%d.%H%M%S", time.gmtime())

# connect to the AzureML workspace configured at the top of the notebook
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

# describe the asset: an MLTABLE-typed asset backed by the local folder
my_data = Data(
    name="green-quickstart",
    version=VERSION,
    description="A random sample of NYC Green Taxi Data between 2015-19.",
    type=AssetTypes.MLTABLE,
    path="./nyc_taxi",
)

# create (or update) the asset; the returned object is shown as cell output
ml_client.data.create_or_update(my_data)

In [None]:
# Read the registered table back in the development environment

import mltable
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# connect to the AzureML workspace configured at the top of the notebook
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

# fetch the data asset at the version registered by the previous code cell
# Note: VERSION must still be defined in this session.
data_asset = ml_client.data.get(name="green-quickstart", version=VERSION)

# build a table from the asset id, addressed through the azureml:/ prefix
tbl = mltable.load(f"azureml:/{data_asset.id}")

# print the first 5 records as a check
tbl.show(5)