# dataPOP

The first step of the script is to create a JSON file that keeps the taxonomy centralized and up to date.

In [1]:
import glob
import json 
import os
import shutil

import pandas as pd

from collections import OrderedDict

In [2]:
def get_column(src, column=1):
    """
    Read one column of a CSV file and return its values as a list

    The file is decoded as cp1252 and the first row is used as the header.
    Values of a column named 'intID' are kept as strings.

    Args:
        src (str): path to the CSV file
        column (int): zero-based position of the column to return
    """
    frame = pd.read_csv(
        src,
        sep=',',
        header='infer',
        converters={'intID': str},
        encoding='cp1252',
    )
    selected = frame.iloc[:, column]
    return list(selected)

def get_header(src):
    """
    Return the column names of a CSV file, in file order

    The file is decoded as cp1252 and its first row is taken as the header.

    Args:
        src (str): path to the CSV file
    """
    frame = pd.read_csv(src, sep=',', header='infer', encoding='cp1252')
    column_names = frame.columns.values
    return column_names.tolist()

In [6]:
# CSV whose second column (index 1) is read as the ordered list of tag names.
SRC = '/Users/g4brielvs/Dropbox/Workspace/Names/n_tertiaryData_structure_20180220.csv'

NAMES = get_column(SRC, 1)
# Bare expression: displays the list as the cell output.
NAMES

['dataSource',
 'graphType',
 'graphDetail',
 'spacialAggregation',
 'temporalAggregation',
 'userTclass',
 'trajectoryTimeDefinitionPoint',
 'fixedRoute',
 'calculationMethod',
 'statistics',
 'statisticsDetail',
 'transportMode',
 'demographyGender',
 'demographyAge',
 'dataDetail',
 'controlSum']

In [None]:
def filter_taxonomy(data=None, names=None):
    """
    Filter taxonomy using tag names

    Keeps the entries whose 'nameSeq' descriptor is contained in ``names``.
    Entries without a 'nameSeq' descriptor are compared as ``None``.

    Args:
        data (dict): taxonomy, mapping each tag to a dict of descriptors.
            ``None`` is treated as an empty taxonomy.
        names (container): accepted 'nameSeq' values. ``None`` matches
            nothing.

    Returns:
        dict: the matching entries, in the order they appear in ``data``
    """
    # None defaults replace the former mutable defaults (dict() / {}).
    if data is None:
        data = {}
    if names is None:
        names = ()

    return {
        tag: entry
        for tag, entry in data.items()
        if entry.get('nameSeq') in names
    }

def get_tags_from_taxonomy(data=None):
    """
    Get list of tags from taxonomy

    Args:
        data (dict): taxonomy. ``None`` is treated as an empty taxonomy.

    Returns:
        list: the keys of ``data``, in iteration order (a new list on
            every call)
    """
    # A None default replaces the former mutable default (dict()).
    if data is None:
        return []
    return list(data)

def get_text_from_taxonomy(data=None, tag=None, descriptor='id'):
    """
    Get one descriptor of a tag from taxonomy

    Args:
        data (dict): taxonomy, mapping each tag to a dict of descriptors.
            ``None`` is treated as an empty taxonomy.
        tag: key of the entry to look up
        descriptor (str): name of the descriptor to return

    Returns:
        The descriptor value, or ``None`` when either the tag or the
        descriptor is not present.
    """
    if data is None:
        return None

    entry = data.get(tag)
    # An unknown tag used to fail with AttributeError on None; report it the
    # same way as a missing descriptor instead.
    if entry is None:
        return None
    return entry.get(descriptor)

## Taxonomy

In [None]:
def get_taxonomy_from_file(src):
    """
    Build the taxonomy from a CSV file

    Each row becomes one entry keyed by its 'intID' value (kept as a
    string). The entry maps every column name of the file to the row's value
    for that column, in file order. Rows sharing an 'intID' overwrite each
    other; the last one wins.

    Args:
        src (str): path to the CSV file, decoded as cp1252

    Returns:
        OrderedDict: ``{intID: OrderedDict({column: value, ...}), ...}``

    Raises:
        KeyError: if the file has no 'intID' column
    """
    df = pd.read_csv(src, sep=',', converters={'intID': str}, encoding='cp1252')

    # Take the column names from the file being read rather than from a
    # notebook-level HEADER global, which may be undefined or stale.
    columns = list(df.columns)

    data = OrderedDict()
    for _, row in df.iterrows():
        data[row['intID']] = OrderedDict((name, row[name]) for name in columns)

    return data

In [1]:
# CSV from which the column names are read.
SRC = '/Users/g4brielvs/Dropbox/Workspace/Names/04-nam_tertiaryData_20180307.csv'

# Column names of that CSV, in file order.
# NOTE(review): the recorded output of this cell is a NameError, i.e. it was
# last run before the cell defining get_header; re-run the cells in order.
HEADER = get_header(SRC)
# Bare expression: displays the value as the cell output.
HEADER

NameError: name 'get_header' is not defined

In [None]:
# Same CSV path as the one HEADER is read from.
SRC = '/Users/g4brielvs/Dropbox/Workspace/Names/04-nam_tertiaryData_20180307.csv'

# Ordered mapping of taxonomy entries, keyed by each row's 'intID'.
tax = get_taxonomy_from_file(SRC)

In [None]:
# Write the taxonomy to taxonomy.json in the working directory. Errors from
# open() / json.dump() propagate as-is; the former try/except only re-raised.
with open('taxonomy.json', mode='w', encoding='utf-8') as f:
    json.dump(tax, f)

# Only reached when the file was written and closed without error.
print('Taxonomy successfully exported! \u2714')

## Tags

In [None]:
# Reload the exported taxonomy, keeping key order. The encoding is stated
# explicitly so it matches the one used when the file is written.
with open('taxonomy.json', encoding='utf-8') as f:
    data = json.load(f, object_pairs_hook=OrderedDict)

In [None]:
# Group the taxonomy keys by tag name: the k-th name in NAMES (counting from 1)
# is mapped to the keys of the entries in `data` whose 'nameSeq' equals k.
# NOTE(review): `tags` is rebound by several later cells (a flat sorted list in
# In [66], per-name lists inside the corrections loops), so what `tags` holds
# depends on the order in which the cells were run.
tags = OrderedDict()

for i, v in enumerate(NAMES, 1):
        
    l = get_tags_from_taxonomy(filter_taxonomy(data, {i}))
    tags[v] = l

In [66]:
# Every key of `tax` as one flat list, ordered by integer value.
tags = sorted(get_tags_from_taxonomy(tax), key=int)

In [None]:
# Serialise the current value of `tags` and store it in tags.json.
serialised_tags = json.dumps(tags)
with open('tags.json', mode='w+', encoding='utf-8') as out:
    out.write(serialised_tags)

## Corrections 

In [62]:
# Build one OrderedDict per name in NAMES, mapping the last three characters of
# each matching taxonomy key to the full key.
# NOTE(review): the loop rebinds the notebook-level `tags` on every iteration,
# so after this cell `tags` holds only the keys for the last name in NAMES.
# Confirm whether cells that read `tags` afterwards expect that.
corrections = list()

for i, v in enumerate(NAMES, 1):
    
    # Keys of the entries in `data` whose 'nameSeq' equals i.
    tags = get_tags_from_taxonomy(filter_taxonomy(data, {i}))
    
    corrections.append(OrderedDict(zip(map(lambda x: x[-3:], tags), tags)))

In [105]:
# NOTE(review): this cell resets `corrections` to an empty list and only
# recomputes `tags` for each name; nothing is appended, so `corrections` stays
# empty and `tags` ends up holding the keys for the last name in NAMES. It
# looks like an unfinished variant of the previous cell; confirm before
# relying on it.
corrections = list()

for i, v in enumerate(NAMES, 1):
    
    tags = get_tags_from_taxonomy(filter_taxonomy(data, {i}))

In [None]:
# Map each entry's 'txID' to its own last four characters.
# NOTE(review): expects `tags` to be an iterable of keys of `tax`; several
# earlier cells rebind `tags`, so the result depends on which one ran last.
corrections = OrderedDict()

for tag in tags:
    txID = tax.get(tag).get('txID')
    corrections[txID] = txID[-4:]

In [None]:
# Write the corrections to corrections.json in the working directory. Errors
# from open() / json.dump() propagate as-is; the former try/except only
# re-raised them.
with open('corrections.json', mode='w', encoding='utf-8') as f:
    json.dump(corrections, f)

# Only reached when the file was written and closed without error.
print('Corrections successfully exported! \u2714')

## New renaming

In [120]:
def get_valid_filename(filename, names=None):
    """
    Checks and apply corrections in order on tags in filename

    The name (without extension) is split on '_' into positional tags and
    rebuilt with exactly one tag per entry of ``names``:

    * a tag starting with '00-n' is replaced by the placeholder for its
      position, '<position>99-na' (position is 1-based, zero-padded to two
      digits);
    * a position with no tag in the filename gets that same placeholder;
    * tags beyond ``len(names)`` are dropped.

    Args:
        filename (str): file name, with extension
        names (list): expected tag names; only the number of entries is
            used. Defaults to the module-level ``NAMES``.

    Returns:
        str: the rebuilt file name, with the original extension
    """
    if names is None:
        names = NAMES

    (name, extension) = os.path.splitext(filename)
    tags_in_filename = name.strip().split('_')

    tags = []
    for position, _ in enumerate(names, 1):
        placeholder = '{:02}99-na'.format(position)

        if position > len(tags_in_filename):
            # Missing tag: use the placeholder of *this* position. The former
            # hard-coded '1699-na' was only right for the 16th tag.
            tag = placeholder
        else:
            tag = tags_in_filename[position - 1]
            if tag[:4] == '00-n':
                tag = placeholder

        tags.append(tag)

    return '_'.join(tags) + extension

In [121]:
def check(src, dst):
    """
    Checks source for filenames and makes a valid copy into destination

    Every '*.csv' file found under ``src`` (searched recursively) is copied
    into ``dst`` under the name returned by ``get_valid_filename``. The
    directory layout of ``src`` is not reproduced: all copies are written
    directly into ``dst``, so two source files that end up with the same
    valid name overwrite each other.

    Args:
        src (str): path to source
        dst (str): path to destination

    Errors raised while creating ``dst`` or copying a file propagate to the
    caller and stop the run.
    """
    pattern = os.path.join(src, '**/*.csv')

    for pathname in glob.iglob(pattern, recursive=True):
        new_filename = get_valid_filename(os.path.basename(pathname))

        # Created inside the loop so dst only appears once there is a file
        # to copy into it.
        os.makedirs(dst, exist_ok=True)
        shutil.copy(pathname, os.path.join(dst, new_filename))

## Validation

In [123]:
# Directory searched recursively for CSV files, and the directory that
# receives the renamed copies.
SRC = '/Users/g4brielvs/Desktop/DATA/'
DST = '/Users/g4brielvs/Desktop/DEST/'

# Copy every CSV found under SRC into DST under its corrected file name.
check(SRC, DST)