In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import logging
from src.log import logger
logger.setLevel(logging.INFO)

## Directories

- data
   - Data directory. often symlinked to a filesystem with lots of space
  
- data/raw
  - Raw (immutable) hash-verified downloads
  
- data/interim
  - Extracted and interim data representations, such as caches
  
- data/processed
  - The final, cleaned and processed data sets for modeling.


In [3]:
from src.paths import raw_data_path, interim_data_path, processed_data_path


## mbi-cle: 


The Learning Vector Quantization (lvq-pak) project includes a simple Finnish phonetic dataset consisting 20-dimensional Mel Frequency Cepstrum Coefficients (MFCCs) labelled with target phoneme information. Our goal is to explore this dataset, process it into a useful form, and make it a part of a reproducible data science workflow.

In [12]:
dataset_name = 'mbi-cle'     # Naming things: the hardest problem in computer science.

In [13]:
from src.utils import list_dir

In [14]:
list_dir(processed_data_path)


['2020',
 '2016',
 'events-2015.csv',
 'events-2016.csv',
 'events-2020.csv',
 'events-2018.csv',
 '2019',
 '2015',
 'events-2013.csv',
 '2013',
 '2014',
 'events-2014.csv',
 'events-2017.csv',
 '2017',
 '2018',
 'events-2019.csv']

In [18]:
from src.data.datasets import Dataset

In [17]:
mbi_cle = RawDataset(dataset_name)
# lvq_pak.add_url(url="http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar")

Can get help

In [23]:
help(RawDataset.add_url)

Help on function add_url in module src.data.datasets:

add_url(self, url=None, hash_type='sha1', hash_value=None, name=None, file_name=None)
    Add a URL to the file list
    
    hash_type: {'sha1', 'md5', 'sha256'}
    hash_value: string or None
        if None, hash will be computed from downloaded file
    file_name: string or None
        Name of downloaded file. If None, will be the last component of the URL
    url: string
        URL to fetch
    name: str
        text description of this file.



We will use this feature to add a README to our dataset:

In [None]:
mbi_cle.add_url(url='http://www.smn.gov.ar/',
                file_name=f'{dataset_name}.readme',
                name='DESCR')

Datasets should always have an explicit license. Reading the project documentation, we see a license in one of the textfiles.

In [25]:
license_txt = '''
************************************************************************
*                                                                      *
*                              MBI_CLE                                 *
*                                                                      *
*                                                                      *
*                      Copyright (c) 2021                              *
*                                                                      *
************************************************************************
'''

We can extract and use the license via the add_metadata() method:

In [27]:
mbi_cle.add_metadata(contents=license_txt, kind='LICENSE')

In [28]:
mbi_cle.fetch()

Exception: Not implemented yet!

In [30]:
help(mbi_cle.add_file)

Help on method add_file in module src.data.datasets:

add_file(hash_type='sha1', hash_value=None, name=None, *, file_name) method of src.data.datasets.RawDataset instance
    Add a file to the file list.
    
    This file must exist on disk, as there is no method specified for fetching it.
    This is useful when the raw dataset requires an offline procedure for downloading.
    
    hash_type: {'sha1', 'md5', 'sha256'}
    hash_value: string or None
        if None, hash will be computed from specified file
    file_name: string
        Name of downloaded file.
    name: str
        text description of this file.



In [35]:
for file in list_dir(processed_data_path):
    mbi_cle.add_file(hash_type='sha256', name=file, file_name=processed_data_path / file)

In [36]:
mbi_cle.to_dict()

{'url_list': [{'contents': '\n************************************************************************\n*                                                                      *\n*                              MBI_CLE                                 *\n*                                                                      *\n*                                                                      *\n*                      Copyright (c) 2021                              *\n*                                                                      *\n************************************************************************\n',
   'file_name': 'mbi-cle.license',
   'name': 'LICENSE'},
  {'contents': '\n************************************************************************\n*                                                                      *\n*                              MBI_CLE                                 *\n*                                                                      *\n*  