In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import logging
from src.log import logger
logger.setLevel(logging.INFO)

## Directories

- data
   - Data directory. often symlinked to a filesystem with lots of space
  
- data/raw
  - Raw (immutable) hash-verified downloads
  
- data/interim
  - Extracted and interim data representations, such as caches
  
- data/processed
  - The final, cleaned and processed data sets for modeling.


In [42]:
from src.paths import raw_data_path, interim_data_path, processed_data_path, external_data_path


## mbi-cle: 


The Learning Vector Quantization (lvq-pak) project includes a simple Finnish phonetic dataset consisting 20-dimensional Mel Frequency Cepstrum Coefficients (MFCCs) labelled with target phoneme information. Our goal is to explore this dataset, process it into a useful form, and make it a part of a reproducible data science workflow.

In [12]:
dataset_name = 'mbi-cle'     # Naming things: the hardest problem in computer science.

In [13]:
from src.utils import list_dir

In [14]:
list_dir(processed_data_path)


['2020',
 '2016',
 'events-2015.csv',
 'events-2016.csv',
 'events-2020.csv',
 'events-2018.csv',
 '2019',
 '2015',
 'events-2013.csv',
 '2013',
 '2014',
 'events-2014.csv',
 'events-2017.csv',
 '2017',
 '2018',
 'events-2019.csv']

In [18]:
from src.data.datasets import Dataset

In [62]:
mbi_cle = RawDataset(dataset_name)
# lvq_pak.add_url(url="http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar")

Can get help

In [63]:
help(RawDataset.add_url)

Help on function add_url in module src.data.datasets:

add_url(self, url=None, hash_type='sha1', hash_value=None, name=None, file_name=None)
    Add a URL to the file list
    
    hash_type: {'sha1', 'md5', 'sha256'}
    hash_value: string or None
        if None, hash will be computed from downloaded file
    file_name: string or None
        Name of downloaded file. If None, will be the last component of the URL
    url: string
        URL to fetch
    name: str
        text description of this file.



We will use this feature to add a README to our dataset:

In [64]:
# mbi_cle.add_url(url='http://www.smn.gov.ar/',
#                 file_name=f'{dataset_name}.readme',
#                 name='DESCR')

Datasets should always have an explicit license. Reading the project documentation, we see a license in one of the textfiles.

In [65]:
license_txt = '''
************************************************************************
*                                                                      *
*                              MBI_CLE                                 *
*                                                                      *
*                                                                      *
*                      Copyright (c) 2021                              *
*                                                                      *
************************************************************************
'''

We can extract and use the license via the add_metadata() method:

In [49]:
mbi_cle.add_metadata(contents=license_txt, kind='LICENSE')

In [50]:
mbi_cle.fetch()

True

In [68]:
for file in list_dir(external_data_path):
    mbi_cle.add_file(hash_type='sha256', name=file, file_name=external_data_path / file)

In [73]:
mbi_cle.file_list[0]

{'hash_type': 'sha256',
 'hash_value': None,
 'name': 'DMPSmbiocle2019.dat',
 'file_name': PosixPath('/home/gfogwil/Documentos/Trabajo/instrumentos/dmps/data/external/DMPSmbiocle2019.dat')}

In [69]:
for key, value in mbi_cle.to_dict().items() :
    print(key, value, '\n')

url_list [{'hash_type': 'sha256', 'hash_value': None, 'name': 'DMPSmbiocle2019.dat', 'file_name': PosixPath('/home/gfogwil/Documentos/Trabajo/instrumentos/dmps/data/external/DMPSmbiocle2019.dat')}, {'hash_type': 'sha256', 'hash_value': None, 'name': 'DMPSmbiocle2015.dat', 'file_name': PosixPath('/home/gfogwil/Documentos/Trabajo/instrumentos/dmps/data/external/DMPSmbiocle2015.dat')}, {'hash_type': 'sha256', 'hash_value': None, 'name': 'DMPSmbiocle2018.dat', 'file_name': PosixPath('/home/gfogwil/Documentos/Trabajo/instrumentos/dmps/data/external/DMPSmbiocle2018.dat')}, {'hash_type': 'sha256', 'hash_value': None, 'name': 'DMPSmbiocle2014.dat', 'file_name': PosixPath('/home/gfogwil/Documentos/Trabajo/instrumentos/dmps/data/external/DMPSmbiocle2014.dat')}, {'hash_type': 'sha256', 'hash_value': None, 'name': 'DMPSmbiocle2020.dat', 'file_name': PosixPath('/home/gfogwil/Documentos/Trabajo/instrumentos/dmps/data/external/DMPSmbiocle2020.dat')}, {'hash_type': 'sha256', 'hash_value': None, 'name'