# Overview

The following example shows how to create a DataCard, register it, and then load it (or other DataCards) for collaboration and reproducibility.

In [1]:
import os

# Credentials the install step and the registry client read from the environment.
# The values below are placeholders: replace them locally and never commit real secrets.
os.environ.update(
    {
        "GOOGLE_ACCOUNT_JSON_BASE64": "service account credentials (see pinned chat in slack or use gcloud sdk)",
        "POETRY_HTTP_BASIC_SHIPT_RESOLVE_USERNAME": "secret username",
        "POETRY_HTTP_BASIC_SHIPT_RESOLVE_PASSWORD": "secret pass",
    }
)

In [2]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://steven.forrester:****@artifactory.shipt.com/artifactory/api/pypi/pypi-virtual/simple
Collecting opsml-data==0.1.1rc1673236397
  Downloading https://artifactory.shipt.com/artifactory/api/pypi/pypi-virtual/opsml-data/0.1.1rc1673236397/opsml_data-0.1.1rc1673236397-py3-none-any.whl (45 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m






Installing collected packages: opsml-data
  Attempting uninstall: opsml-data
    Found existing installation: opsml-data 0.1.1rc1673234251
    Uninstalling opsml-data-0.1.1rc1673234251:
      Successfully uninstalled opsml-data-0.1.1rc1673234251
Successfully installed opsml-data-0.1.1rc1673236397


In [3]:
from opsml_data import DataCard, DataRegistry
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### Instantiate data registry

In [4]:
# Create the registry client with its default arguments. The later cells use this
# object to register, list, and load DataCards.
registry = DataRegistry()

#### Example 1: Creating data (with index splits) and registering it to the data registry
Create fake data

In [5]:
# Fixed seed so that "Restart Kernel -> Run All" regenerates the same synthetic data.
SEED = 42
rng = np.random.default_rng(SEED)

# mu_1 is the mean of the Gaussian features; mu_2 is defined but not used in this cell.
mu_1, mu_2 = -4, 4

# 1000 rows x 10 Gaussian features (mean mu_1, std 2.0) and one integer target in [2, 100).
X_data = rng.normal(mu_1, 2.0, size=(1000, 10))
y_data = rng.integers(2, 100, size=(1000, 1))

# One name per feature column: col_0 ... col_9
col_names = [f"col_{i}" for i in range(X_data.shape[1])]

# Create dataframe
data = pd.DataFrame(X_data, columns=col_names)
# Flatten the (1000, 1) target so the column assignment receives a plain 1-D array.
data["target"] = y_data.ravel()
data.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,target
0,-5.160202,-0.025896,-3.485771,-2.555679,-3.203268,-4.299549,-8.064235,-5.154186,-2.356566,-3.020622,35
1,-6.664674,-1.111773,-5.620179,-3.227847,-6.907431,-6.443197,-5.24484,-3.423994,-4.568824,-1.61085,84
2,-6.371372,-1.900902,-3.54821,-4.417654,-7.896266,-7.032659,-4.702275,-5.272811,-4.481945,-3.053206,13
3,-6.679026,-5.66898,-3.287953,-6.197477,-2.40001,-5.50435,-4.598107,-5.332875,-2.99745,-4.690829,21
4,-2.817011,-2.319162,-2.761658,-2.004763,-3.584133,-2.995775,-3.674715,-3.954267,-4.323538,-5.30645,81


##### Create train test splits

In [6]:
# Split the row positions (not the rows themselves) into train and test index arrays,
# holding out 30% for test. random_state pins the shuffle so every run yields the same split.
train_idx, test_idx = train_test_split(
    np.arange(data.shape[0]),
    test_size=0.3,
    random_state=42,
)

##### Create DataCard
- Check out the `DataCard` docstring for input specifications
- Required arguments:
    - data: pandas dataframe, numpy array, or pyarrow table
    - data_name: Name for the data
    - team: team name
    - user_email: User email

In [7]:
# Card metadata: dataset name, owning team, and contact email.
DATA_NAME = "synthetic_data"
TEAM = "SPMS"
USER_EMAIL = "steven.forrester@shipt.com"

# One {"label", "indices"} entry per split, in train-then-test order.
DATA_SPLITS = [
    {"label": split_label, "indices": split_indices}
    for split_label, split_indices in (("train", train_idx), ("test", test_idx))
]

# Bundle the dataframe with its metadata and split definitions.
data_card = DataCard(
    data=data,
    data_name=DATA_NAME,
    team=TEAM,
    user_email=USER_EMAIL,
    data_splits=DATA_SPLITS,
)

# confirm data
data_card.data

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,target
0,-5.160202,-0.025896,-3.485771,-2.555679,-3.203268,-4.299549,-8.064235,-5.154186,-2.356566,-3.020622,35
1,-6.664674,-1.111773,-5.620179,-3.227847,-6.907431,-6.443197,-5.244840,-3.423994,-4.568824,-1.610850,84
2,-6.371372,-1.900902,-3.548210,-4.417654,-7.896266,-7.032659,-4.702275,-5.272811,-4.481945,-3.053206,13
3,-6.679026,-5.668980,-3.287953,-6.197477,-2.400010,-5.504350,-4.598107,-5.332875,-2.997450,-4.690829,21
4,-2.817011,-2.319162,-2.761658,-2.004763,-3.584133,-2.995775,-3.674715,-3.954267,-4.323538,-5.306450,81
...,...,...,...,...,...,...,...,...,...,...,...
995,-4.380297,-3.171645,-0.627990,-1.688472,0.729206,-2.883401,-5.145330,-2.494345,-2.115308,-1.231598,68
996,-6.938969,-1.486126,-2.949069,0.283093,-1.474004,-6.649144,-7.759252,-3.756787,-5.144261,-3.701229,4
997,-5.163608,-6.041067,-3.581658,-6.051015,-5.134530,-1.833917,-2.401548,-6.646110,-3.172974,-4.748680,99
998,-2.501005,-5.510995,-4.818315,-2.451338,-2.146552,-4.271094,0.001651,-4.784271,-5.047539,-5.457438,30


##### Confirm data splits
- `data_card.split_data()` returns the splits in the form of a pydantic model

In [8]:
# Ask the card for its splits; the returned object is read via `.train` and `.test` below.
splits = data_card.split_data()
# Display both shapes as a quick check on the split sizes.
splits.train.shape, splits.test.shape

((700, 11), (300, 11))

#### Save data to registry

In [12]:
# Hand the card to the registry; the recorded log line reports the version it was registered as.
# NOTE(review): later cells read data_card.uid and data_card.version, so this call
# presumably populates them on the card — confirm against the DataRegistry docs.
registry.register_data(data_card=data_card)

{"level": "INFO", "message": "Table: synthetic_data registered as version 3", "timestamp": "2023-01-09T02:57:29.874955Z", "app_env": "staging", "host": null, "version": null}


### Listing DataCards in the registry

In [14]:
# Query the registry for the name/team/version of the card created in this notebook.
registry_data = registry.list_data(
    version=data_card.version,
    team=TEAM,
    data_name=DATA_NAME,
)

# The first listed row must carry the uid of our card.
assert registry_data["uid"].iloc[0] == data_card.uid

registry_data.head()

Unnamed: 0,uid,date,timestamp,app_env,data_name,team,data_uri,drift_uri,feature_map,data_splits,data_type,version,user_email,dependent_vars
0,d641649db0e64b0a9fcfa9c8178b90ef,2023-01-08,1673232819415,staging,synthetic_data,SPMS,gs://ds-opsml-stg/data_registry/SPMS/synthetic...,,"{'col_0': 'double', 'col_1': 'double', 'col_2'...","{'splits': [{'label': 'train', 'indices': [61,...",DataFrame,3,steven.forrester@shipt.com,


### Loading DataCards from the registry

In [15]:
# load data_card
# NOTE(review): no version is passed here, unlike the list_data call; presumably the
# latest registered version is loaded — confirm, since the uid check below depends on it.
new_data_card = registry.load_data(data_name=DATA_NAME, team=TEAM)
# The loaded card should be the same record as the card created in this notebook.
assert new_data_card.uid == data_card.uid