# Registering a dataset

This notebook demonstrates how to register a dataset in an Azure Machine Learning workspace, so that Azure ML can track its versions for us.


## Imports

In [9]:
import os
from azureml.core import Workspace, Dataset
import scripts.exp_resources as exp

## The data

Here we run an ETL step on the UCI Poker Hand training data to get it into the shape we need for the upcoming experiment.

In [8]:
# Training split of the UCI Poker Hand dataset.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-training-true.data'
# Project helper from scripts/exp_resources; presumably it is also what prints
# the "Shape" / "Missing data" report shown in this cell's output.
# NOTE(review): that report lists columns labelled '1', '10', '1.1', ... and
# 25009 rows -- looks like the first data row is being consumed as the header
# inside `dataset_etl`. Confirm (e.g. the file is read with header=None).
df = exp.dataset_etl(url)
# Display the column names of the prepared frame.
df.columns

Shape:  (25009, 11)
Missing data:
 1      0
10     0
1.1    0
11     0
1.2    0
13     0
1.3    0
12     0
1.4    0
1.5    0
9      0
dtype: int64


Index(['Rank_1', 'Rank_2', 'Rank_3', 'Rank_4', 'Rank_5', 'class', 'Suit_1_2',
       'Suit_1_3', 'Suit_1_4', 'Suit_2_2', 'Suit_2_3', 'Suit_2_4', 'Suit_3_2',
       'Suit_3_3', 'Suit_3_4', 'Suit_4_2', 'Suit_4_3', 'Suit_4_4', 'Suit_5_2',
       'Suit_5_3', 'Suit_5_4'],
      dtype='object')

In [10]:
# Quick visual check of the first rows produced by the ETL step.
df.head()

Unnamed: 0,Rank_1,Rank_2,Rank_3,Rank_4,Rank_5,class,Suit_1_2,Suit_1_3,Suit_1_4,Suit_2_2,...,Suit_2_4,Suit_3_2,Suit_3_3,Suit_3_4,Suit_4_2,Suit_4_3,Suit_4_4,Suit_5_2,Suit_5_3,Suit_5_4
0,11,13,10,12,1,9,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
1,12,11,13,10,1,9,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
2,10,11,1,13,12,9,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1
3,1,13,12,11,10,9,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1
4,2,4,5,3,6,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Local staging location for the file that will be uploaded to Azure.
folder = 'local_data'
path = os.path.join(folder, 'poker_dataset.csv')

# Create the staging folder if needed (no error when it already exists),
# then write the frame without its index column.
os.makedirs(folder, exist_ok=True)
df.to_csv(path, index=False)

## Azure ML - the Datastore

In [15]:
# Connect to the workspace via Workspace.from_config().
ws = Workspace.from_config()

# The workspace's default datastore is the upload destination.
default_store = ws.get_default_datastore()

# Send the CSV written earlier to the `poker_data/` folder of the datastore.
# NOTE(review): `upload_files` is reported as deprecated in recent azureml-core
# releases in favour of `Dataset.File.upload_directory` -- confirm against the
# installed SDK version before migrating.
upload_options = {
    'target_path': 'poker_data/',
    'overwrite': True,
    'show_progress': True,
}
default_store.upload_files(files=[path], **upload_options)

Uploading an estimated of 1 files
Uploading local_data\poker_dataset.csv
Uploaded local_data\poker_dataset.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_f368ed6e0eab4a8884e1da4e9d392fb2

## Azure ML - the Dataset

In [17]:
# Point at every CSV under `poker_data/` in the default datastore and
# wrap the match in a tabular dataset.
csv_location = (default_store, 'poker_data/*.csv')
tab_data = Dataset.Tabular.from_delimited_files(path=csv_location)

# Preview: keep only the first 10 records and render them as a pandas DataFrame.
preview = tab_data.take(10)
preview.to_pandas_dataframe()



Unnamed: 0,Rank_1,Rank_2,Rank_3,Rank_4,Rank_5,class,Suit_1_2,Suit_1_3,Suit_1_4,Suit_2_2,...,Suit_2_4,Suit_3_2,Suit_3_3,Suit_3_4,Suit_4_2,Suit_4_3,Suit_4_4,Suit_5_2,Suit_5_3,Suit_5_4
0,11,13,10,12,1,9,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
1,12,11,13,10,1,9,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
2,10,11,1,13,12,9,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1
3,1,13,12,11,10,9,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1
4,2,4,5,3,6,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,9,12,10,11,13,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,2,3,4,5,8,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
7,5,6,9,7,8,8,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
8,1,4,2,3,5,8,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1
9,1,1,9,5,3,1,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0


## Register the Dataset

Registering the dataset is what lets Azure ML keep track of its versions.

In [18]:
# Registration metadata for the dataset in the workspace.
registration = {
    'workspace': ws,
    'name': 'poker_ds',
    'description': 'poker data from  UCI',
    'tags': {'format': 'csv'},
    'create_new_version': True,
}

# Rebind `tab_data` to the object returned by the registration call.
tab_data = tab_data.register(**registration)


In [19]:
# Print each dataset listed in the workspace together with its version.
print("Datasets:")
# Snapshot the names first so the loop does not iterate a live mapping.
for dataset_name in list(ws.datasets.keys()):
    # Look the dataset up by name to read its name and version.
    dataset = Dataset.get_by_name(ws, dataset_name)
    print(f"\t {dataset.name} version {dataset.version}")


Datasets:
	 poker_ds version 1
