# Taxonomy

The first step of the script is to create a JSON file that keeps the taxonomy centralized and up to date.

In [1]:
import json 

import pandas as pd

from collections import OrderedDict

In [16]:
def get_column(src, column=1):
    """
    Read a comma-separated file and return one of its columns as a list

    Args:
        src: path or buffer handed to ``pandas.read_csv``
        column (int): zero-based position of the column to extract
    """
    frame = pd.read_csv(src, sep=',', header='infer')
    selected = frame.iloc[:, column]
    return list(selected)

In [17]:
# Names of the CSV columns that make up one taxonomy record, in output order.
HEADER = [
    'nameSeq',
    'id',
    'fullEN',
    'shortEN',
    'inWritingCZ',
    'abrevCZ',
    'descriptiveCZ',
    'descriptiveEN',
]

In [28]:
def filter_taxonomy(data=None, names=None):
    """
    Filter taxonomy using tag names

    Keeps only the entries whose ``nameSeq`` value is contained in ``names``.

    Args:
        data (dict): taxonomy, mapping each tag to its record (a dict);
            ``None`` is treated as an empty taxonomy
        names (container): accepted ``nameSeq`` values; ``None`` accepts
            nothing

    Returns:
        dict: the ``tag -> record`` pairs of ``data`` that matched
    """
    # ``None`` replaces the former mutable defaults (``dict()`` / ``{}``);
    # both of those produced an empty result, and so does ``None``.
    if data is None or names is None:
        return {}
    return {
        tag: record
        for tag, record in data.items()
        if record.get('nameSeq') in names
    }

def get_tags_from_taxonomy(data=None):
    """
    Get list of tags from taxonomy

    Args:
        data (dict): taxonomy, mapping each tag to its record; ``None`` is
            treated as an empty taxonomy

    Returns:
        list: the keys of ``data``, in the mapping's iteration order
    """
    # ``None`` replaces the former mutable ``dict()`` default.
    if data is None:
        return []
    # Iterating a mapping yields its keys, so ``.keys()`` is not needed.
    return list(data)

def get_text_from_taxonomy(data=None, tag=None, descriptor='id'):
    """
    Get one descriptor of a tag from taxonomy

    Args:
        data (dict): taxonomy, mapping each tag to its record (a dict);
            ``None`` is treated as an empty taxonomy
        tag: key of the wanted record
        descriptor (str): field to read from that record

    Returns:
        The value stored under ``descriptor`` for ``tag``, or ``None`` when
        the tag or the descriptor is missing.
    """
    if data is None:
        return None
    record = data.get(tag)
    # Without this guard an unknown tag fails with AttributeError, because
    # ``.get`` would be called on ``None``.
    if record is None:
        return None
    return record.get(descriptor)

In [None]:
SRC = '/Users/g4brielvs/Desktop/drive-download-20180221T143422Z-001/n_tertiaryData_content_20180220.csv'

d = OrderedDict()

# Parse the CSV once up front; nothing in the loop below modifies it.
df = pd.read_csv(SRC, sep=',')

# NOTE(review): `tags` is not assigned in this cell and must already exist in
# the session. `enumerate` counts from 0 -- confirm that matches the values
# stored in the `nameSeq` column.
for i, tag in enumerate(tags):
    # Rows whose `nameSeq` equals the current position.
    c = df[df['nameSeq'] == i]
    for _, row in c.iterrows():
        print(row['id'])

In [7]:
SRC = '/Users/g4brielvs/Desktop/drive-download-20180221T143422Z-001/n_tertiaryData_content_20180220.csv'

data = OrderedDict()

df = pd.read_csv(SRC, sep=',')

# Index every CSV row by its `id`, keeping only the HEADER columns, in HEADER
# order. The records go into `data`, the mapping initialised above.
for _, row in df.iterrows():
    data[row['id']] = OrderedDict((field, row[field]) for field in HEADER)

In [None]:
# Any failure propagates unchanged: catching an exception only to re-raise it
# adds nothing, so no try/except wrapper is used here.
with open('taxonomy.json', mode='w+', encoding='utf-8') as f:
    json.dump(data, f)

# Report success only after the `with` block has closed the file.
print('Tags successfully exported! \u2714')

In [18]:
# Read with an explicit UTF-8 encoding so decoding does not depend on the
# platform default; `object_pairs_hook` keeps the key order of the JSON file.
with open('taxonomy.json', encoding='utf-8') as f:
    data = json.load(f, object_pairs_hook=OrderedDict)

In [19]:
from itertools import islice

# Show the first (tag, record) pair of the taxonomy.
print(next(islice(data.items(), 1)))

('0100-nr', OrderedDict([('nameSeq', 1), ('id', '0100-nr'), ('fullEN', 'NotRelevant'), ('shortEN', 'nr'), ('inWritingCZ', 'Nerelevantní'), ('abrevCZ', 'nr'), ('descriptiveCZ', 'Není relevantní pro výpočet'), ('descriptiveEN', ' či proměnnou')]))


In [24]:
# Display every tag (top-level key) of the taxonomy held in `data`.
get_tags_from_taxonomy(data)

['0100-nr',
 '0199-na',
 '0101-cso',
 '0102-o2',
 '0103-cd',
 '0200-nr',
 '0299-na',
 '0201-ver',
 '0202-edg',
 '0300-nr',
 '0399-na',
 '0301-pop',
 '0302-vis',
 '0303-tour',
 '0304-pot',
 '0305-get',
 '0306-occ',
 '0307-cap',
 '0400-nr',
 '0499-na',
 '0401-stop',
 '0402-lru',
 '0403-lau2',
 '0404-cells',
 '0405-adm',
 '0406-fun',
 '0500-nr',
 '0599-na',
 '0501-qrt',
 '0502-hr',
 '0503-ngt',
 '0504-mor',
 '0505-aft',
 '0506-evn',
 '0507-day',
 '0508-wk',
 '0509-mth',
 '0510-ses',
 '0511-yr',
 '0600-nr',
 '0699-na',
 '0601-all',
 '0602-res',
 '0603-noc',
 '0604-outc',
 '0605-inc',
 '0606-res03',
 '0607-noc03',
 '0608-outc03',
 '0609-inc11',
 '0610-res00',
 '0611-outc00',
 '0700-nr',
 '0799-na',
 '0700-mom',
 '0701-15lm',
 '0702-60lm',
 '0703-180lm',
 '0704-300lm',
 '0705-300pm',
 '0800-nr',
 '0899-na',
 '0801-1001',
 '0802-1002',
 '0803-1003',
 '0804-1004',
 '0805-1005',
 '0806-1006',
 '0807-1007',
 '0808-1008',
 '0900-nr',
 '0999-na',
 '0901-sng',
 '0902-uni',
 '0903-rep',
 '1000-nr',
