# Работа с Datastores и Datasets в Azure ML

## Соединение со своим Workspace

Импорт модулей:

In [1]:
import os
import azureml.core
from azureml.core import Workspace, Experiment, Dataset
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference
from msrest.exceptions import HttpOperationError

# Report which Azure ML SDK release this notebook is running against
sdk_version = azureml.core.VERSION
print(f'SDK version: {sdk_version}')

SDK version: 1.12.0


Создаем соединение со своим Workspace:

In [2]:
# Connect to the workspace using the saved workspace configuration
ws = Workspace.from_config()

# Show the workspace name, resource group and location, one per line
for workspace_property in (ws.name, ws.resource_group, ws.location):
    print(workspace_property)

ai-in-cloud-workspace
ai-in-cloud-workshop-rg
westeurope


## Просмотр существующих Datastore

In [3]:
# Look up the datastore the workspace currently uses by default
default_ds = ws.get_default_datastore()
print(f'Default Datastore name: {default_ds.name}\n')

# Walk over every datastore name in the workspace and flag the default one
default_ds_name = default_ds.name
for ds_name in ws.datastores:
    is_default = ds_name == default_ds_name
    print(ds_name, "- Default =", is_default)

Default Datastore name: demo_datastore

demo_datastore - Default = True
creditcardfraudstore - Default = False
dogsimagesblob - Default = False
azureml_globaldatasets - Default = False
workspaceblobstore - Default = False
workspacefilestore - Default = False


## Создаем новый Datastore

Установка имени нового Datastore и указание информации об Azure Storage Account, где будет располагаться новый Datastore:

In [4]:
# Name under which the datastore is registered in the workspace
datastore_name = 'demo_datastore'

# Azure Storage Account Info
storage_account_name = 'aiclouddata'
storage_container_name = 'demo-container'

# Prefer the AZURE_STORAGE_KEY environment variable so the secret does not
# have to be saved inside the notebook. When the variable is unset or empty,
# fall back to the placeholder, which must then be replaced by hand.
storage_account_key = os.environ.get('AZURE_STORAGE_KEY') or '<azure_account_key>'  # WARN: insert your storage account key here

Создаем Datastore, если он ещё не существует:

In [5]:
# Reuse the datastore when it is already registered in the workspace;
# otherwise register the blob container under that name.
try:
    new_datastore = Datastore.get(ws, datastore_name)
except HttpOperationError:
    # The lookup failed, so attach the blob container as a new datastore
    new_datastore = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name=datastore_name,
        account_name=storage_account_name,
        container_name=storage_container_name,
        account_key=storage_account_key,
    )
    print(f'Registered blob datastore with name {new_datastore.name}')
else:
    print(f'Blob Datastore with name {new_datastore.name} was found!')

Blob Datastore with name demo_datastore was found!


Получаем информацию о созданном Datastore:

In [6]:
# Summarize the datastore: its name, datastore type and storage account name
datastore_summary = (
    f'Datastore {new_datastore.name} '
    f'based on {new_datastore.datastore_type} '
    f'in storage account named {new_datastore.account_name}'
)
print(datastore_summary)

Datastore demo_datastore based on AzureBlob in storage account named aiclouddata


## Загрузка данных в Datastore

Делаем созданный Datastore хранилищем по умолчанию (для удобства дальнейшей работы):

In [9]:
# Make the new datastore the workspace default, then read the default back
new_default_name = new_datastore.name
ws.set_default_datastore(new_default_name)

ds = ws.get_default_datastore()
print(ds.name)

demo_datastore


Загрузка данных:

In [8]:
# Local diabetes CSV files (under ../data) to copy into the datastore
local_csv_files = [
    '../data/diabetes_train.csv',
    '../data/diabetes_test.csv',
]

ds.upload_files(
    files=local_csv_files,
    target_path='diabetes-data/',  # folder path inside the datastore
    overwrite=True,  # replace existing files of the same name
    show_progress=True,
)

Uploading an estimated of 2 files
Uploading ../data/diabetes_test.csv
Uploading ../data/diabetes_train.csv
Uploaded ../data/diabetes_test.csv, 1 files out of an estimated total of 2
Uploaded ../data/diabetes_train.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_411e9cdddeda492c9936c4880d13c33d

Создадим из загруженных в Datastore данных табличный Dataset:

In [10]:
# Point at every CSV file under diabetes-data/ in the datastore
diabetes_csv_path = (ds, 'diabetes-data/*.csv')

# Build a tabular dataset over those delimited files and display it
diabetes_ds = Dataset.Tabular.from_delimited_files(path=diabetes_csv_path)
diabetes_ds

{
  "source": [
    "('demo_datastore', 'diabetes-data/*.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ]
}

## Зарегистрируем Dataset

In [12]:
# Register the tabular dataset in the workspace under the name 'diabetes_db',
# requesting a new version on registration.
diabetes_db = diabetes_ds.register(
    workspace=ws,
    name='diabetes_db',
    description='Diabetes Disease Database',
    create_new_version=True,
)

Просмотрим список зарегистрированных наборов данных:

In [16]:
print('Available datasets:')

# Use a dedicated loop variable: `ds` already holds the default datastore
# object, and reusing that name here would replace it with a dataset name,
# breaking any later re-run of the cells that call methods on `ds`.
for dataset_name in ws.datasets:
    print(f'\t{dataset_name}')

Available datasets:
	diabetes_db
	credit-card-fraud
	covid19-spread-russia
	covid19-spread
	mnist-dataset
	Pima Indians Diabetes Database


## Просмотр Набора данных

Скачаем зарегистрированный набор данных и выведем 10 случайных строк:

In [17]:
# Fetch the registered dataset by name. Dataset.get_by_name fails with an
# explicit error when the dataset is missing, whereas ws.datasets.get(...)
# would return None and only fail later with an unrelated AttributeError.
diabetes_db_from_azure = Dataset.get_by_name(ws, name='diabetes_db')

# Materialize the dataset as a pandas DataFrame and show 10 random rows
diabetes_df = diabetes_db_from_azure.to_pandas_dataframe()
diabetes_df.sample(10)

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
12933,1530753,9,115,96,34,445,18.380284,0.476763,23,0
7904,1115104,6,109,50,44,237,41.179536,0.60531,61,1
9306,1539294,6,98,75,48,289,29.545913,0.189791,60,1
14530,1449450,4,81,52,10,217,20.955122,0.269875,34,0
5078,1302648,8,105,61,26,60,28.353221,0.175321,43,1
2393,1935287,2,97,63,47,87,42.528976,0.701525,58,1
13056,1306293,0,104,94,35,140,20.679651,0.278658,33,0
11721,1425744,1,99,70,53,85,22.550511,0.443597,21,1
5476,1623791,5,102,55,56,32,20.895717,0.114489,26,0
7166,1970602,1,172,65,11,69,21.186491,0.260442,23,0
