In [1]:
%%javascript
// Override OutputArea's _should_scroll hook so it always answers false,
// regardless of the `lines` argument (presumably this keeps long cell
// outputs from being collapsed into a scrolling box -- confirm against the
// installed notebook version).
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}



<IPython.core.display.Javascript object>

 # Project: Adopt a Drain Data Cleaning
 * Author: James Wilfong, wilfongjt@gmail.com
 ## Limits
 * This script doesn't interface with GitHub or Data.world. That happens in the maintainer/maintainer.Process.ipynb script.
 ## Input
 * The input data is contained in the raw-data/adopt-a-drain folder of this repo. 
 * The cleaning process is initiated by running this Jupyter Notebook.
 * The cleaned data is put in the clean-data folder of this repo.
 ## Process
 * load dataset(s) from raw-data/adopt-a-drain folder
 * create source id for temporary use, drop later
 * create an asset number from facility id
 * create a sync id,  
 * ensure input columns are the expected columns
 * remove outliers
 * concatenate multiple datasets: make one big dataset from one or many
 * remove duplicate asset ids (keep the first of duplicates)
 * save the combined dataset to the clean-data/adopt-a-drain folder (the file name is set by OUTPUT_FILE_NAME)
 ## Output
 * Clean data is output to the clean-data/adopt-a-drain folder of the repo.
 ## Next Steps
 * Clean data is pushed to repo by the developer.
 * The developer creates a pull request for the maintainer.
 * The Maintainer will complete the task of loading data into production

# IMPORTANT
## Configuring the Data Transfer
Configure the values in the next cell before choosing "Run All" from the Cell menu.

In [2]:
import helper
# Input:   CSV file(s) in the raw-data/ folder
# Process: clean (conform, condense)
# Output:  written to the clean-data/ folder
#
# OUTPUT_FILE_NAME names the output file; with '.csv' removed it is also
# handed to the maintainer config as the data.world table name.
# Table names should start with a letter and may contain letters, numbers, underscores.

# Disabled alternatives kept for reference:
#INPUT_FILE_NAME='KentCoCatchBasinWatershedJoin_Filtered.csv' # input file is found in raw-data/ folder
#OUTPUT_FILE_NAME='KentCoCatchBasinWatershedJoin_Filtered.csv' # output file is found in clean-data/ folder
#INPUT_FILE_NAME='grb_drains.csv'
#OUTPUT_FILE_NAME='grb_drains.csv'
#OUTPUT_FILE_NAME='clean.csv'

# Dated output name, e.g. grb_drains-<daystamp>.csv
OUTPUT_FILE_NAME = '{base}-{day}.csv'.format(base='grb_drains', day=helper.get_daystamp())
# Undated name; a later cell copies the dated file to this name.
COPY_FILE_NAME = 'grb_drains.csv'
print('OUTPUT_FILE_NAME: ', OUTPUT_FILE_NAME)

gh_file_type = 'csv'
# title of data.world dataset. title is also used to name d.w data service
title = 'GRB Storm Drains'
# describes contents of dataset, appears in d.w
desc = 'Storm Drains of the Grand River Basin, Michigan'
table_name = OUTPUT_FILE_NAME.replace('.csv', '')

# write the maintainer_config to a file for the maintainer
helper.exportMaintainerConfig(OUTPUT_FILE_NAME, gh_file_type, title, desc, table_name)


OUTPUT_FILE_NAME:  grb_drains-2019-04-05.csv
kill log:  ./maintainer/maintainer-config.json


In [3]:
# Names assembled for a run: application, raw data file, clean data file.
# The two file entries start empty; the wrangling cell fills them in for each
# raw file it processes.
local_config = dict(
    app_name=helper.get_app_name(),
    local_raw=None,
    local_clean=None,
)

# ------------- configure outliers
# One entry per checked column: the column name, a (low, high) range, a
# message template with a '{}' placeholder, and a counter starting at 0.
# NOTE(review): presumably clean.remove_obvious_outliers drops rows outside
# the range and fills in the count -- confirm in lib/p3_clean.
_outliers = {
    'outliers': [
        {
            'column': 'dr_lon',
            'range': (-90.0, -80.0),
            'reason': 'Remove {} observations too far west or east.',
            'count': 0,
        },
        {
            'column': 'dr_lat',
            'range': (40.0, 50.0),
            'reason': 'Remove {} observations too far north or south.',
            'count': 0,
        },
    ]
}

# When True, later cells only log a failure message instead of processing data.
ENV_ERROR = False

In [4]:
# %matplotlib notebook
from IPython.display import display, HTML
from IPython.display import Markdown

from lib.p3_ProcessLogger import ProcessLogger
cell_log = ProcessLogger() 

In [5]:
# Reset the cell log, then record each import stage so the rendered markdown
# at the bottom of the cell shows which stages were reached.
cell_log.clear()
#import interface
cell_log.collect('* Import third party packages')

import numpy as np 
import pandas as pd

# NOTE: the module import below is superseded a few lines later by
# `from pprint import pprint`, which rebinds the name `pprint` to the function.
import pprint

import csv # read and write csv files

from pprint import pprint

import os
# import subprocess

# convenience functions -- cleaning
cell_log.collect('* Import custom packages')
from lib.p3_CellCounts import CellCounts
import lib.p3_clean as clean
from lib.p3_configuration import get_configuration
import lib.p3_explore as explore
import lib.p3_gather as gather # gathering functions
# import lib.p3_helper_functions as helper
import lib.p3_map as maps

# ENV_ERROR is set in the configuration cell; when it is True only a failure
# notice is added to the log.
if ENV_ERROR:
    cell_log.collect("# Script Failure!!")
    cell_log.collect("# !!! Missing Environment Variables !!!")
    cell_log.collect("### see [Environment Variable Setup](#env-setup)")

# Last expression of the cell: the collected log rendered as markdown output.
Markdown('''{}'''.format(cell_log.getMarkdown()))

* Import third party packages
* Import custom packages

In [6]:

def inferName(col_name):
    '''
    Translate a cleaned raw-data column name into its clean-data column name.

    Known columns map to their dr_* (or del_source_id) name. Any column that
    is not listed is returned with a 'del_' prefix instead of raising, so the
    caller can see it and drop it.

    col_name: column name as it appears after the column names were cleaned
    returns:  the clean-data column name (str)
    '''
    names = {
        "subtype": "dr_subtype",
        "jurisdicti": "dr_jurisdiction",
        "drain__owner": "dr_owner",
        "owner": "dr_owner",
        "local__id": "dr_local_id",
        "facilityid": "dr_facility_id",
        "drain__jurisdiction": "dr_jurisdiction",
        "subwatershed": "dr_subwatershed",
        "subbasin": "dr_subwatershed",
        "point__x": "dr_lon",
        "long": "dr_lon",
        "point__y": "dr_lat",
        "lat": "dr_lat",
        # 'soure__id' is a misspelling in the original table; it is kept so
        # existing inputs behave the same, and the correctly spelled key is
        # added so a real source id column maps to del_source_id as well.
        "soure__id": "del_source_id",
        "source__id": "del_source_id",
    }

    # Unknown columns are flagged for deletion rather than treated as an error.
    return names.get(col_name, 'del_{}'.format(col_name))
    
# print('inferName: ', inferName('local__id') )
# print('inferName: ', inferName('xxxx') )



In [7]:
def getColumnDict(df_source):
    '''
    Build a {current column name: clean column name} mapping for df_source.

    Every column of df_source is passed through inferName; the result is in
    the shape DataFrame.rename(columns=...) accepts.
    '''
    return {raw_name: inferName(raw_name) for raw_name in df_source.columns.values}

def validateOutputColumns(df_source):
    '''
    Stop the run if the clean dataset contains a column that is not expected.

    df_source: DataFrame about to be written as clean data
    raises:    Exception naming the first unexpected column
    returns:   None when every column is allowed (columns may be missing)
    '''
    allowed = (
        'dr_subtype', 'dr_jurisdiction', 'dr_owner', 'dr_subwatershed',
        'dr_lon', 'dr_lat', 'dr_asset_id', 'dr_type', 'dr_sync_id',
    )

    for column in df_source.columns.values:
        if column in allowed:
            continue
        raise Exception('{} is unexpected output for clean data'.format(column))
            
def validateInputColumns(df_source, source_file_name):
    '''
    Stop the run if a required column is absent from a processed raw dataset.

    df_source:        DataFrame to check (extra columns are tolerated)
    source_file_name: name used in the error message
    raises:           Exception naming the first required column not found
    returns:          None when all required columns are present
    '''
    required = (
        'dr_subtype', 'dr_jurisdiction', 'dr_owner', 'dr_subwatershed',
        'dr_lon', 'dr_lat', 'dr_asset_id', 'dr_type',
    )
    present = df_source.columns.values

    for column in required:
        if column in present:
            continue
        raise Exception('{} is missing from {}'.format(column, source_file_name))

    
def regional_codes(df_source, _owner):
    '''
    Look up the short regional code for every value of the owner column.

    df_source: DataFrame holding the owner column
    _owner:    name of the owner column
    returns:   list with one code per row, in row order; an owner that is not
               in the table below yields 'XXX' (no exception is raised)

    Codes are added over time: when 'XXX' shows up in the output, add the new
    owner and its code to the table.
    '''
    # NOTE(review): "Caledonia Township" maps to CALTWP while
    # "CALEDONIA TOWNSHIP" maps to CALETWP -- confirm whether both spellings
    # are meant to produce different codes.
    codes = {
        "Kent County Road Commission": "KCRC",
        "KENT COUNTY ROAD COMMISSION": "KCRC",
        "City of East Grand Rapids": "EGR",
        "City of Grandville": "GRANDV",
        "City of Wyoming": "CWY",
        "City of Kentwood": "CK",
        "Grand Rapids Township": "GRTWP",
        "City of Walker": "CW",
        "CGR": "CGR",
        "City of Grand Rapids": "CGR",
        "Georgetown Township": "GTWP",
        "City of Hudsonville": "CHV",
        "Jamestown Township": "JTTWP",
        "Cascade Township": "CASTWP",
        "Algoma Township": "ALGTWP",
        "Grattan Township": "GRATWP",
        "Gaines Township": "GAITWP",
        "Vergennes Township": "VERTWP",
        "Lowell Township": "LOWTWP",
        "Oakfield Township": "OAKTWP",
        "Cannon Township": "CANTWP",
        "Sparta Township": "SPATWP",
        "Solon Township": "SOLTWP",
        "Ada Township": "ADATWP",
        "City of Lowell": "CLO",
        "Bowne Township": "BOWTWP",
        "Tyrone Township": "TYRTWP",
        "Caledonia Township": "CALTWP",
        "Courtland Township": "COUTWP",
        "Spencer Township": "SPETWP",
        "Village of Sparta": "VSP",
        "BYRON TOWNSHIP": "BYRTWP",
        "CALEDONIA TOWNSHIP": "CALETWP",
        "City of Rockford": "CRF",
        "Alpine Township": "ALPTWP",
        "Plainfield Township": "PLATWP",
        "Byron Township": "BYRTWP",
        "OCWRC": "OCWRC",
        "City of Grand Haven DPW": "CGH",
    }

    return [codes.get(owner, 'XXX') for owner in df_source[_owner]]
    
    
'''

def regional_codes( df_source , _owner, _facility_id , _local_id ):
    
    #code are added over time. this method checks and throws error not found.
    #fix by adding new jurisdiction and code to list below
    
    #print('regional_code 1')
    rc = []
    codes = {
        "Kent County Road Commission": "KCRC",
        "KENT COUNTY ROAD COMMISSION":"KCRC",
        "City of East Grand Rapids": "EGR",
        "City of Grandville": "GRANDV",
        "City of Wyoming": "CWY",
        "City of Kentwood": "CK",
        "Grand Rapids Township": "GRTWP",
        "City of Walker": "CW",
        "CGR": "CGR",
        "City of Grand Rapids": "CGR",
        "Georgetown Township": "GTWP",
        "City of Hudsonville": "CHV",
        "Jamestown Township": "JTTWP",
        "Cascade Township": "CASTWP",
        "Algoma Township": "ALGTWP",
        "Grattan Township": "GRATWP",
        "Gaines Township": "GAITWP",
        "Vergennes Township": "VERTWP",
        "Lowell Township": "LOWTWP",
        "Oakfield Township": "OAKTWP",
        "Cannon Township": "CANTWP",
        "Sparta Township": "SPATWP",
        "Solon Township": "SOLTWP",
        "Ada Township": "ADATWP",
        "City of Lowell": "CLO",
        "Bowne Township": "BOWTWP",
        "Tyrone Township": "TYRTWP",
        "Caledonia Township": "CALTWP",
        "Courtland Township": "COUTWP",
        "Spencer Township": "SPETWP",
        "Village of Sparta": "VSP",
        "BYRON TOWNSHIP": "BYRTWP",
        "CALEDONIA TOWNSHIP": "CALETWP",
        "City of Rockford": "CRF",
        "Alpine Township": "ALPTWP",
        "Plainfield Township": "PLATWP",
        "Byron Township": "BYRTWP",
        "OCWRC": "OCWRC",
        "City of Grand Haven DPW":"CGH"
    }

    #print('regional_code 2 {}'.format(jurisdiction))
    
    #for jur in df_source[fld]:
    #    #print('jur', jur)
    #    if jur in codes:
    #        rc.append(codes[jur])
    #    else:
    #        #raise Exception('Regional-Code for ({}) is not available... add new '.format(jur)) 
    #        rc.append('XXX_')
    
    for index, row in df_source.iterrows():
        if row[_owner] in codes:
            
            val = row[_local_id]
            key = '{}-{}'.format( codes[row[_owner]], val )
            #if pd.isna(row['FACILITYID']):
            #    print('na ', pd.isna(row['FACILITYID']),row['FACILITYID'])
                
            #if (not pd.isna(row['FACILITYID'])) or (not pd.isnull(row['FACILITYID'])) or (row['FACILITYID']==''):
            if not pd.isna(row[_facility_id]):
                val = row[_facility_id]
                key = '{}-{}'.format( codes[row[_owner]], val )
                    
            rc.append( key )
        else:
            #raise Exception('Regional-Code for ({}) is not available... add new '.format(jur)) 
            rc.append('XXX_')
    
    print('regional_code out')
    return rc
'''    

def fixFacilityId(df_source,_facility_id , _local_id):
    '''
    Return one facility id per row, falling back to the local id, then to 0.

    df_source:    DataFrame holding both id columns
    _facility_id: name of the facility id column
    _local_id:    name of the local id column
    returns:      list in row order. For each row: the facility id when it is
                  not missing; otherwise the local id when it is a non-missing
                  integer; otherwise 0.
    '''
    rc = []
    for _, row in df_source.iterrows():
        facility = row[_facility_id]
        if not pd.isna(facility):
            rc.append(facility)
            continue

        # The local id is only read when the facility id is missing.
        local = row[_local_id]
        # np.integer is accepted next to int: NumPy integer scalars are not
        # instances of the builtin int, so they used to fall through to 0.
        if not pd.isna(local) and isinstance(local, (int, np.integer)):
            rc.append(local)
        else:
            rc.append(0)

    return rc

def f_facility_id(x, y):
    '''
    Pick the id to use for one row.

    x: facility id (may be missing)
    y: local id (may be missing)
    returns: x when it is not missing; otherwise y when it is a non-missing
             integer; otherwise 0
    '''
    if not pd.isna(x):
        return x
    # np.integer is accepted next to int: NumPy integer scalars are not
    # instances of the builtin int, so they used to be replaced by 0.
    if not pd.isna(y) and isinstance(y, (int, np.integer)):
        return y
    return 0
        

## Wrangling Script

In [8]:
# Wrangling cell: clean every raw csv file, combine the results, drop
# duplicate asset ids and write the combined clean csv.
# The per-step notes are plain comments; the cell log collects the
# human-readable summary rendered at the end of the cell.
cell_log.clear()
if ENV_ERROR:
    cell_log.collect("# Script Failure!!")
    cell_log.collect("# !!! Missing Environment Variables !!!")
    cell_log.collect("### see [Environment Variable Setup](#env-setup)")
else:
    raw_folder = helper.get_raw_data_folder()
    clean_folder = helper.get_clean_data_folder()
    # List the raw files once so the printed list and the processed list are the same.
    raw_files = helper.get_raw_files('csv')

    print('raw folder ', raw_folder)
    print(raw_files)

    # Fail early with a readable message; pd.concat would otherwise raise a
    # less helpful ValueError further down.
    if len(raw_files) == 0:
        raise ValueError('No raw csv files found in {}'.format(raw_folder))

    concat_list = []
    # Per file:
    #   * load data
    #   * fix column names
    #   * map expected columns to raw-data columns
    #   * back-fill empty facility ids (rows are not dropped)
    #   * fix column types
    for in_f in raw_files:

        print('-----------------------------------')
        print('raw: ', in_f)
        print('-----------------------------------')

        cell_log.collect("")
        cell_log.collect("# CSV Process: What Happened?")

        # --------------------------------- input
        local_config['local_raw'] = in_f
        local_config['local_clean'] = helper.get_clean_file(in_f)
        cell_log.collect("* input:  {}".format(local_config["local_raw"]))

        # --------------------------------- load data
        df_source = helper.open_raw_data(local_config)  # open raw-data
        cell_log.collect("* input: {} observations".format(len(df_source)))
        cell_log.collect("* input: columns {}".format(df_source.columns.values))

        # --------------------------------- clean column names
        cell_log.collect('* format: Apply a style of lowercase and underscores to column names.')
        df_source = clean.clean_column_names(df_source)

        # --------------------------------- map expected columns to raw-data columns
        df_source = df_source.rename(columns=getColumnDict(df_source))

        # --------------------------------- empty facility ids
        # Mark every empty value (NaN, '', ' ', None) as NaN. `x != x` is the NaN test.
        df_source['dr_facility_id'] = df_source['dr_facility_id'].apply(
            lambda x: np.nan if x != x or x == '' or x == ' ' or x is None else x)

        # Rows with an empty facility id or empty coordinates are not dropped
        # here; the facility id is back-filled instead.
        # make sure facility-id has a numeric value.
        df_source['dr_facility_id'] = df_source[['dr_facility_id', 'dr_local_id']].apply(
            lambda x: f_facility_id(*x), axis=1)

        # Regional codes are looked up before the owner patch below, so the
        # raw 'CGR' owner value is what gets translated.
        df_source['source_code'] = regional_codes(df_source, 'dr_owner')

        # --------------------------------- change column types
        cell_log.collect('* format: convert dr_facility_id column to int64')
        df_source['dr_facility_id'] = df_source['dr_facility_id'].astype('int64')

        # --------------------------------- patch owner / jurisdiction
        # Spell out the 'CGR' abbreviation (applied once; the original repeated this statement).
        df_source['dr_owner'] = df_source['dr_owner'].apply(
            lambda x: x if x != 'CGR' else 'City of Grand Rapids')
        # The jurisdiction column is overwritten with the owner value.
        df_source['dr_jurisdiction'] = df_source['dr_owner']

        # --------------------------------- asset id and type
        df_source['dr_asset_id'] = df_source['source_code'] + '_' + df_source['dr_facility_id'].astype(str)
        df_source['dr_type'] = 'Storm Water Inlet Drain'

        # --------------------------------- condense
        # Drop helper columns and raw columns that are not part of the clean data.
        df_source = df_source.drop(
            ['del_source', 'del_fid', 'del_gid', 'source_code', 'dr_local_id', 'dr_facility_id'],
            axis=1)

        # --------------------------------- check input cols
        validateInputColumns(df_source, local_config["local_raw"])

        # --------------------------------- outliers
        df_source = clean.remove_obvious_outliers(_outliers, df_source)
        for r in _outliers['outliers']:
            cell_log.collect('* outlier: {}'.format(r['reason']))

        # --------------------------------- concat list
        concat_list.append(df_source)

    cell_log.collect("")
    cell_log.collect("# Combined")

    # --------------------------------- concat
    df_source = pd.concat(concat_list)

    # --------------------------------- drop duplicates
    scnt = len(df_source)
    df_source = df_source.drop_duplicates('dr_asset_id', keep='first')
    ecnt = len(df_source)
    cell_log.collect('* duplicates: dropped {} duplicate asset ids'.format(scnt - ecnt))

    # --------------------------------- save csv
    # assume new file and remove old one
    local_config["local_clean"] = '{}/{}'.format(clean_folder, OUTPUT_FILE_NAME)

    if os.path.isfile(local_config["local_clean"]):
        os.remove(local_config['local_clean'])
        cell_log.collect('* deleted {} '.format(local_config['local_clean']))

    cell_log.collect("* inter-output: columns {}".format(df_source.columns.values))
    cell_log.collect('* inter-output: {} obs to {}'.format(len(df_source), local_config["local_clean"]))

    # stop if columns are not expected
    validateOutputColumns(df_source)

    df_source.to_csv(local_config["local_clean"], index=False)


# Last expression of the cell: the collected log rendered as markdown output.
Markdown('''{}'''.format(cell_log.getMarkdown()))

raw folder  /Users/jameswilfong/Documents/Github/CitizenLabs/00-Data-World/04-add-small-version/data.world/raw-data/adopt-a-drain
['/Users/jameswilfong/Documents/Github/CitizenLabs/00-Data-World/04-add-small-version/data.world/raw-data/adopt-a-drain/grb_april_1_3_29_2019.csv']
-----------------------------------
raw:  /Users/jameswilfong/Documents/Github/CitizenLabs/00-Data-World/04-add-small-version/data.world/raw-data/adopt-a-drain/grb_april_1_3_29_2019.csv
-----------------------------------
* clean_column_names: 0.01604294776916504 sec
* remove_obvious_outliers: 0.008656978607177734 sec



# CSV Process: What Happened?
* input:  /Users/jameswilfong/Documents/Github/CitizenLabs/00-Data-World/04-add-small-version/data.world/raw-data/adopt-a-drain/grb_april_1_3_29_2019.csv
* input: 54091 observations
* input: columns ['FID' 'SUBTYPE' 'JURISDICTI' 'OWNER' 'SOURCE' 'LOCAL_ID' 'FACILITYID'
 'Lat' 'Long' 'Subbasin' 'GID']
* format: Apply a style of lowercase and underscores to column names.
* format: convert dr_facility_id column to int64
* outlier: Remove 0 observations too far west or east.
* outlier: Remove 0 observations too far north or south.

# Combined
* duplicates: dropped 10074 duplicate asset ids
* inter-output: columns ['dr_subtype' 'dr_jurisdiction' 'dr_owner' 'dr_lat' 'dr_lon'
 'dr_subwatershed' 'dr_asset_id' 'dr_type']
* inter-output: 44017 obs to /Users/jameswilfong/Documents/Github/CitizenLabs/00-Data-World/04-add-small-version/data.world/clean-data/adopt-a-drain/grb_drains-2019-04-05.csv

In [19]:
from shutil import copyfile
# Copy the dated clean file to its undated name, then write a small sample
# of the combined data for the test-version folder.
ifn, ofn = (
    '{}/{}'.format(helper.get_clean_data_folder(), file_name)
    for file_name in (OUTPUT_FILE_NAME, COPY_FILE_NAME)
)

print(ifn)
print(ofn)

copyfile(ifn, ofn)

# set up a smaller version of file: at most the first 5000 rows whose
# jurisdiction is 'City of Grand Rapids'
tfn = '{}/{}'.format(helper.get_test_version_folder(), COPY_FILE_NAME)
grand_rapids_rows = df_source.query("dr_jurisdiction == 'City of Grand Rapids'")
df_small = grand_rapids_rows.head(5000)
df_small.to_csv(tfn, index=False)

/Users/jameswilfong/Documents/Github/CitizenLabs/00-Data-World/04-add-small-version/data.world/clean-data/adopt-a-drain/grb_drains-2019-04-05.csv
/Users/jameswilfong/Documents/Github/CitizenLabs/00-Data-World/04-add-small-version/data.world/clean-data/adopt-a-drain/grb_drains.csv


# Appendix - Data.World Names

## Keeping the names straight

| CSV Name      | Table Name    | Title          | Dataset ID      | Restful |
| :------------ |:------------- | :------------- | :-------------  | :------------- |
| xxxx_xx       | xxxx_xx       | Xxxx Xx        | xxxx-xx         |    ?     | 
| xxxx_xx       | xxxx_xx       | Xxxx_Xx        | xxxxxx          |    ?            |
| xxxx_xx       | xxxx_xx       | Xxxx-Xx        | xxxx-xx         |    ?         |
| xxxx-xx       | xxxx_xx       | Xxxx Xx        | xxxx-xx         |    ?         |
| xxxx-xx       | xxxx_xx       | Xxxx_Xx        | xxxxxx          |    ?         |
| xxxx-xx       | xxxx_xx       | Xxxx-Xx        | xxxx-xx         |    ?         |

* CSV Name is root of Table name
* Title is root of Dataset ID
* a space in Title will be automatically converted to hyphen in dataset id
* an underscore in Title will be removed in Dataset ID
* a hyphen in CSV Name will be replaced with underscore in Table Name
