In [None]:
import pandas as pd
import numpy as np
import hashlib

In [None]:
import os

from traitlets.config.manager import BaseJSONConfigManager

# Locate the per-user notebook config directory from the current user's home
# instead of hard-coding one machine's absolute path, so the cell also runs
# for anyone else who opens the notebook.
path = os.path.join(os.path.expanduser("~"), ".jupyter", "nbconfig")
cm = BaseJSONConfigManager(config_dir=path)
# Update the 'livereveal' config section with the slideshow settings
# (presumably read by the RISE slideshow extension -- confirm).
cm.update('livereveal', {
              'theme': 'simple',
              'transition': 'zoom',
              'start_slideshow_at': 'selected',
})

# Reproducible DS work

Dr. Dror Atariah, drorata@gmail.com

# Outline

* Challenges in reproducible DS work
* Hashing code *and* data
* Docker to the rescue?
* DS project as a Python package

# Challenges

## DS project ingredients (Naive)

* Code
* Data

## DS project ingredients (Reality)

* Code (`git`)
* Notebooks (?)
* Data (?)
* Environment (`virtualenv`, `conda-env`, `docker`?)

# Hashing `pandas.DataFrame` (Naive)

**_Idea_:** Convert to bytes stream and hash

## $\mathbb{R}, \mathbb{N}$ and so on...

In [None]:
# Fix the seed of NumPy's global RNG so the sampled matrix is repeatable.
np.random.seed(42)
# Draw a 3x3 matrix from three integer candidates and wrap it in a DataFrame.
candidates = [41, 43, 42]
arr = np.random.choice(candidates, size=(3, 3))
df = pd.DataFrame(data=arr)

In [None]:
# Sanity check: the DataFrame's underlying values should equal the source array.
values_match = np.array_equal(df.values, arr)
values_match

In [None]:
# SHA-256 digest of the raw bytes of the NumPy array.
arr_digest = hashlib.sha256(arr.tobytes()).hexdigest()
print(arr_digest)

In [None]:
# SHA-256 digest of the raw bytes of the DataFrame's underlying values.
df_digest = hashlib.sha256(df.values.tobytes()).hexdigest()
print(df_digest)

## And strings?

In [None]:
# Repeat the experiment, this time with strings among the candidate values.
np.random.seed(42)
arr = np.random.choice(['foo', 'bar', 42], size=(3, 3))
df = pd.DataFrame(arr)
# Show both containers ...
for shown in (arr, df):
    print(shown)
# ... and the SHA-256 digest of each one's raw bytes.
for raw in (arr.tobytes(), df.values.tobytes()):
    print(hashlib.sha256(raw).hexdigest())
# NOTE: compare the digests printed above with this element-wise equality
# check of the array against the DataFrame's values.
print(np.array_equal(arr, df.values))

## Workaround

In [None]:
# Same setup as before: a seeded 3x3 sample that includes strings.
np.random.seed(42)
arr = np.random.choice(['foo', 'bar', 42], size=(3, 3))
df = pd.DataFrame(arr)
for shown in (arr, df):
    print(shown)
# Digest the two raw byte views first, then the workaround: serialize the
# DataFrame to text (JSON, CSV) and hash the encoded text instead.
byte_views = (
    arr.tobytes(),
    df.values.tobytes(),
    df.to_json().encode(),
    df.to_csv().encode(),
)
for payload in byte_views:
    print(hashlib.sha256(payload).hexdigest())

## Pythonic solution

pandas ships a built-in hashing helper, `pandas.util.hash_pandas_object`, as of version 0.20 (and maybe earlier).

In [None]:
from pandas.util import hash_pandas_object

In [None]:
# Rebuild the seeded 3x3 sample that mixes strings and an integer candidate.
np.random.seed(42)
choices = ['foo', 'bar', 42]
arr = np.random.choice(choices, size=(3, 3))
df = pd.DataFrame(data=arr)
# Hash the frame with pandas' own helper; the bare last expression is displayed.
hash_pandas_object(df)

## Usage example

When persisting a DataFrame, you can embed the hash of its content in the file name:

In [None]:
from hashlib import sha256

def persist_df(df, path=None, sql=None):
    """Pickle a DataFrame under a name that embeds a timestamp and a content hash.

    The base file name is ``raw_df_<timestamp>_<sha256>``, where the timestamp
    is the current local time in ISO format (with ``:`` and ``.`` replaced by
    ``-``) and the hash is the SHA-256 hex digest of ``df.to_json()``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to persist; written to ``<base>.pickle``.
    path : str, optional
        Directory to write into. When omitted, files are written relative to
        the current working directory.
    sql : str, optional
        The query that produced ``df``. When given, it is written next to the
        pickle as ``<base>.sql``; when omitted, no ``.sql`` file is created.

    Returns
    -------
    str
        The base file name (including ``path``), without extension.
    """
    # Local imports keep the function self-contained within its notebook cell.
    import os
    from datetime import datetime

    # Hash the frame passed in, not a global that happens to be in the kernel.
    df_hash = sha256(df.to_json().encode()).hexdigest()
    # `pd.datetime` was removed from pandas; the stdlib class is equivalent here.
    timestamp = datetime.now().isoformat().replace(":", "-").replace(".", "-")
    base_filename = 'raw_df_{}_{}'.format(timestamp, df_hash)
    if path is not None:
        # Join instead of concatenating so a missing trailing separator in
        # `path` does not glue the directory name onto the file name.
        base_filename = os.path.join(path, base_filename)
    df.to_pickle(base_filename + '.pickle')
    if sql is not None:
        with open(base_filename + ".sql", "w") as sql_file:
            print(sql, file=sql_file)
    return base_filename

# Can Docker help?

Jump to [GitHub](https://github.com/drorata/mwe-jupyter-docker)

# DS project as Python package

Minimal example in [GitHub](https://github.com/drorata/minimal-ds-project)

More resources:
    
* [cookiecutter-data-science](https://drivendata.github.io/cookiecutter-data-science/)
* [dataversioncontrol](https://github.com/dataversioncontrol/dvc)
* [MS - Team Data Science Processes](https://github.com/Azure/Microsoft-TDSP)
* [anaconda-enterprise-notebooks](https://www.continuum.io/anaconda-enterprise-notebooks)