# Tagging data

A simple interface to annotate data automatically using the watchdog package.

In [1]:
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from pathlib import Path
import pandas as pd
import yaml

Define the folder where the raw CSV data is stored and the path of the YAML template, as well as the suffix of the observed files.

In [2]:
# Settings for this notebook -- adjust them to your setup.
observed_dir = "data/raw/"  # folder that is watched for new raw files
yaml_template = "files/yaml_templates/demo.yml"  # YAML metadata template
suffix = ".csv"  # suffix of the observed files, leading dot included

Define which operations should be performed when a new file is created in the raw data folder. Here we simply add a YAML file, based on the YAML template, in which descriptors such as the sample name can be added or modified while tagging multiple files.

In [3]:
def tag_file(filename):
    """Write a YAML metadata file next to ``filename``.

    The metadata is read from the template at ``yaml_template``. A ``file``
    entry holding ``filename`` is added unless the template already defines
    one, and the ``sample name`` entry is set. The result is written to the
    path obtained by replacing the suffix of ``filename`` with
    ``suffix + '.meta.yml'``.

    Parameters
    ----------
    filename : str
        Path of the data file to tag; stored as-is under the ``file`` key.
    """
    # Load the metadata from a YAML template. An empty template is parsed as
    # None, so fall back to an empty dict to keep the updates below working.
    with open(yaml_template, 'rb') as f:
        metadata = yaml.load(f, Loader=yaml.SafeLoader) or {}

    # Enhance the metadata with additional descriptors
    metadata.setdefault('file', filename)
    # or replace descriptors.
    metadata['sample name'] = 'Joanne Doe'

    # Write an output YAML file.
    outyaml = Path(filename).with_suffix(suffix + '.meta.yml')
    with open(outyaml, 'w') as f:
        yaml.dump(metadata, f)

Define the event handler, then create and start the observer

In [4]:
class NewFileHandler(FileSystemEventHandler):
    """Event handler that tags newly created files whose suffix is ``suffix``."""

    def on_created(self, event):
        """Print and tag a newly created file if its suffix equals ``suffix``.

        Parameters
        ----------
        event
            The creation event; ``event.src_path`` is the created path.
        """
        # A newly created directory whose name ends with the suffix is not a
        # data file, so it must not be tagged.
        if event.is_directory:
            return
        if Path(event.src_path).suffix == suffix:
            # Print the filename
            print(event.src_path)
            # or pass the filename to a method
            tag_file(event.src_path)

# Instantiate the observer that watches the file system.
observer = Observer()

# Register a handler for the observed folder only (sub-folders are not watched).
observer.schedule(
    event_handler=NewFileHandler(),
    path=observed_dir,
    recursive=False,
)

# Begin watching in the background.
observer.start()

Create some data. A YAML file should be saved along with the data.

In [5]:
data = {'x': [1,2,3], 'y':[1,2,3]}
df = pd.DataFrame(data)
# Write into the folder set in `observed_dir` instead of a hard-coded path, so
# the demo still triggers the observer after the folder has been adapted.
df.to_csv(Path(observed_dir) / 'demo_data.csv')

data/raw/demo_data.csv


Stop the observer

In [6]:
observer.stop()
# Wait for the observer thread to terminate before moving on.
observer.join()

Explore the saved metadata

In [7]:
# Locate the metadata file inside the folder set in `observed_dir` rather than
# repeating the folder name here.
with open(Path(observed_dir) / 'demo_data.csv.meta.yml', 'rb') as f:
    metadata = yaml.load(f, Loader=yaml.SafeLoader)
metadata

{'experiment': 'My first demo experiment.',
 'file': 'data/raw/demo_data.csv',
 'sample name': 'Joanne Doe',
 'user': 'Max Doe'}

The original metadata from the template for comparison.

In [8]:
# Read the template through `yaml_template` so this cell always shows the
# template that was actually used for tagging.
with open(yaml_template, 'rb') as f:
    metadata = yaml.load(f, Loader=yaml.SafeLoader)
metadata

{'user': 'Max Doe', 'experiment': 'My first demo experiment.'}