In [229]:
%%javascript
// Replace the output area's scroll check with one that always returns false,
// presumably so long cell outputs are shown in full instead of in a scroll box.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}



<IPython.core.display.Javascript object>

# Project: Adopt a Drain Data
 * Author: James Wilfong, wilfongjt@gmail.com

# Goals
* Merge drains from multiple communities across the watershed
* Resolve dataset conflicts, i.e., drain identifier uniqueness, data types, column names, and value ranges.

## Limits
 * Outputs from this script need to be manually pushed to the data.world repository (repo)
 * Doesn't handle deleted drains
 
## Assumptions
* Assume the community-dataset is a subset of all drains in the universe ;)
* Assume record identifiers are not unique across communities 
* Assume the community-dataset's column names are not the same across communities 
* Assume the community-dataset's data types are not the same across communities
* Assume the community-dataset's data-values ranges are not the same across communities
* Assume the community-dataset has duplicate records
* Assume the community-dataset's format is Comma Separated Values (CSV) or Excel.

## Inputs
 * The current live dataset is downloaded from data.world 
 * Updates are put into the raw-data/adopt-a-drain of the data.world repo.

## Process
  The process is initiated by running this Jupyter Notebook.
 * load current drains dataset from data.world
 * load dataset(s) from raw-data/adopt-a-drain folder
 * clean data: create a unique id (dr_asset_id) from the community's regional code and the facility id
 * clean data: replace non-digit characters in facility ids with 0
 * clean data: map input columns to expected output columns
 * clean data: fix common data problems
 * condense data: remove drains with no facility id
 * condense data: remove unneeded columns
 * condense data: remove outliers
 * condense: merge dataworld and new data 
 * condense data: remove duplicate drains (keep the first duplicate)
 * concat: Make one big dataset from one or many

## Outputs
 * Clean data is output to the clean-data/adopt-a-drain folder.
 * save big dataset to clean-data/adopt-a-drain/grb_drains.csv
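 * save a dated copy of the big dataset to clean-data/adopt-a-drain/grb_drains-<daystamp>.csv (e.g., grb_drains-2019-08-020.csv)
 * save a small test version (up to 5000 City of Grand Rapids drains) to the test-version folder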
 
## Next Steps
 * Push updates to repo
 * Sync data from github to dataworld
     * Sync Manually
     * or Wait for the weekly auto-sync
 * Update the Adopt a Drain database
     * Run Ruby rake process in Heroku 

In [230]:
# %matplotlib notebook
import settings
from IPython.display import display, HTML
from IPython.display import Markdown
from lib.p3_ProcessLogger import ProcessLogger

import helper

In [231]:
#cell_log = ProcessLogger() 
#cell_log.clear()
#import interface
#cell_log.collect('* Import third party packages')
import sys
import time
import numpy as np 
import pandas as pd
import csv # read and write csv files
from pprint import pprint
import os

import datadotworld as dw

# import subprocess

# convenience functions -- cleaning
# cell_log.collect('* Import custom packages')
from lib.p3_CellCounts import CellCounts
# import lib.p3_clean as clean
from lib.p3_configuration import get_configuration
import lib.p3_explore as explore
#import lib.p3_gather as gather # gathering functions
# import lib.p3_helper_functions as helper
import lib.p3_map as maps

# IMPORTANT
## Configuring the Data Transfer
Configure the settings below before choosing "Run All" in the Cell menu.

In [232]:


# Input:   CSV file(s) in the raw-data/ folder
# Process: clean (conform, condense)
# Output:  written to the clean-data/ folder
#
# metadata['output_file_name'] names the dated output file,
# metadata['copy_file_name'] names the undated copy of it, and
# metadata['table_name'] names the data.world table.
# Table names should start with a letter and may contain letters, numbers
# and underscores.
cell_log = ProcessLogger()
cell_log.clear()
table_name = 'grb_drains'

metadata = {
    # The template has two placeholders; the stray third argument ('csv')
    # that used to be passed here was silently ignored and has been dropped.
    'output_file_name': '{}-{}.csv'.format(table_name, helper.get_daystamp()),
    'copy_file_name': '{}.csv'.format(table_name),
    'gh_file_type': 'csv',
    'title': 'GRB Storm Drains',
    'desc': 'Storm Drains of the Grand River Basin, Michigan',
    'table_name': table_name
}

# Hand the same values to the maintainer configuration export.
helper.exportMaintainerConfig(metadata['output_file_name'],
                              metadata['gh_file_type'],
                              metadata['title'],
                              metadata['desc'],
                              metadata['table_name'])
print('Metadata =============')
pprint(metadata)


kill log:  ./maintainer/maintainer-config.json
{'copy_file_name': 'grb_drains.csv',
 'desc': 'Storm Drains of the Grand River Basin, Michigan',
 'gh_file_type': 'csv',
 'output_file_name': 'grb_drains-2019-08-024.csv',
 'table_name': 'grb_drains',
 'title': 'GRB Storm Drains'}


In [233]:
'''
    Assemble Names of:
        Application,
        Raw data file,
        Clean data file
'''

# "local_clean" is assigned later by Wrangle.process(); "local_raw" is not
# assigned anywhere in the visible cells.
local_config = { 
                 "app_name": helper.get_app_name(),
                 "local_raw": None,
                 "local_clean": None
               }

'''
    ------------- configure outliers
'''
# Each entry is consumed by Condense.remove_obvious_outliers():
#   'column' - dataframe column to test
#   'range'  - (low, high); rows are kept when low <= value <= high
#   'reason' - message template; '{}' is replaced with the removed-row count
#   'count'  - overwritten with the number of rows removed
# _outliers = {
outlier_settings = {
  'outliers': [
    {'column':'dr_lon',
     'range':(-90.0, -80.0),
     'reason':'Remove {} observations too far west or east.',
     'count': 0
    },  
    {'column':'dr_lat',
     'range':(40.0, 50.0),
     'reason':'Remove {} observations too far north or south.',
     'count': 0
    }
  ]
}  
# NOTE(review): no environment check is visible in this notebook; the flag is
# simply set to False here, so the failure branches that test it never run.
ENV_ERROR=False
print("local_config ===========")
pprint(local_config)
print("outlier_settings ===========")
pprint(outlier_settings)
#print('Local_RAW_FILE: ', LOCAL_RAW_FILE)
#print('LOCAL_CLEAN_FILE: ',LOCAL_CLEAN_FILE )

{'app_name': 'adopt-a-drain', 'local_clean': None, 'local_raw': None}
{'outliers': [{'column': 'dr_lon',
               'count': 0,
               'range': (-90.0, -80.0),
               'reason': 'Remove {} observations too far west or east.'},
              {'column': 'dr_lat',
               'count': 0,
               'range': (40.0, 50.0),
               'reason': 'Remove {} observations too far north or south.'}]}


In [234]:
# Queue failure messages in the cell log when the environment flag is set.
if ENV_ERROR:
    cell_log.collect("# Script Failure!!")
    cell_log.collect("# !!! Missing Environment Variables !!!")
    cell_log.collect("### see [Environment Variable Setup](#env-setup)")

# Last expression of the cell: render whatever the log has collected as markdown.
Markdown('''{}'''.format(cell_log.getMarkdown()))



In [235]:
# common names from imported files and how they map to actual names
# NOTE(review): this assignment rebinds the name `maps`, which the imports cell
# bound to the lib.p3_map module (import lib.p3_map as maps).
maps ={
    # cleaned raw column name -> output column name (used by CleanDrains.inferName)
    "commonNameMap": { 
        "subtype": "dr_subtype",
        "jurisdicti": "dr_jurisdiction",
        "drain__owner": "dr_owner",
        "owner":"dr_owner",
        "local__id": "dr_local_id",
        "facilityid": "dr_facility_id",
        "drain__jurisdiction": "dr_jurisdiction",
        "subwatershed": "dr_subwatershed",
        "subbasin": "dr_subwatershed",
        "point__x":"dr_lon", 
        "long": "dr_lon",
        "point__y":"dr_lat",
        "lat":"dr_lat",
        # NOTE(review): "soure__id" looks like a typo for "source__id" -- confirm
        # against the raw file headers before changing.
        "soure__id": "del_source_id"
    },

    # owner name as it appears in the data -> regional code used as the
    # prefix of dr_asset_id (used by CleanDrains.regional_codes)
    "region_map": {
        "Kent County Road Commission": "KCRC",
        "KENT COUNTY ROAD COMMISSION":"KCRC",
        "City of East Grand Rapids": "EGR",
        "City of Grandville": "GRANDV",
        "City of Wyoming": "CWY",
        "City of Kentwood": "CK",
        "Grand Rapids Township": "GRTWP",
        "City of Walker": "CW",
        "CGR": "CGR",
        "City of Grand Rapids": "CGR",
        "Georgetown Township": "GTWP",
        "City of Hudsonville": "CHV",
        "Jamestown Township": "JTTWP",
        "Cascade Township": "CASTWP",
        "Algoma Township": "ALGTWP",
        "Grattan Township": "GRATWP",
        "Gaines Township": "GAITWP",
        "Vergennes Township": "VERTWP",
        "Lowell Township": "LOWTWP",
        "Oakfield Township": "OAKTWP",
        "Cannon Township": "CANTWP",
        "Sparta Township": "SPATWP",
        "Solon Township": "SOLTWP",
        "Ada Township": "ADATWP",
        "City of Lowell": "CLO",
        "Bowne Township": "BOWTWP",
        "Tyrone Township": "TYRTWP",
        "Caledonia Township": "CALTWP",
        "Courtland Township": "COUTWP",
        "Spencer Township": "SPETWP",
        "Village of Sparta": "VSP",
        "BYRON TOWNSHIP": "BYRTWP",
        # NOTE(review): the upper-case spelling maps to "CALETWP" while
        # "Caledonia Township" above maps to "CALTWP", so the same township can
        # yield two different asset-id prefixes. Confirm which is intended
        # before changing; existing ids may depend on it.
        "CALEDONIA TOWNSHIP": "CALETWP",
        "City of Rockford": "CRF",
        "Alpine Township": "ALPTWP",
        "Plainfield Township": "PLATWP",
        "Byron Township": "BYRTWP",
        "OCWRC": "OCWRC",
        "City of Grand Haven DPW":"CGH",
        "Village of Spring Lake DPW": "VSL",
        "Ottawa County Road Commission": "OCRC",
        "OCRC": "OCRC"
    }
}
# columns dropped (when present) by Condense.removeExtraColumns
extraColumns = ['del_source',
              'del_fid',
              'del_gid',
              'source_code',
              'dr_local_id',
              'dr_facility_id',
              'dr_location']
# NOTE(review): not referenced by any visible cell; Wrangle.validateOutputColumns
# carries its own copy of this list.
# expected_columns_list = ['facility_prefix', 
expected_process_columns_list = ['facility_prefix', 
                         'dr_subtype', 
                         'dr_jurisdiction', 
                         'dr_owner', 
                         'dr_subwatershed', 
                         'dr_lon', 
                         'dr_lat', 
                         'dr_asset_id', 
                         'dr_type', 
                         'dr_sync_id']


# columns that must be present in the combined dataframe (checked by
# Condense.validateColumns)
expected_output_columns_list=['dr_subtype',
                              'dr_jurisdiction',
                              'dr_owner',
                              'dr_lat',
                              'dr_lon',
                              'dr_subwatershed',
                              'dr_asset_id', 
                              'dr_type']


'''
def inferName(col_name):
    # select a column name based on previous names found in file
    
    names = { 
        "subtype": "dr_subtype",
        "jurisdicti": "dr_jurisdiction",
        "drain__owner": "dr_owner",
        "owner":"dr_owner",
        "local__id": "dr_local_id",
        "facilityid": "dr_facility_id",
        "drain__jurisdiction": "dr_jurisdiction",
        "subwatershed": "dr_subwatershed",
        "subbasin": "dr_subwatershed",
        "point__x":"dr_lon", 
        "long": "dr_lon",
        "point__y":"dr_lat",
        "lat":"dr_lat",
        "soure__id": "del_source_id"}
    
    if not col_name in names:
        # mark madeup names for easy id later
        return 'del_{}'.format(col_name)
    return names[col_name]

print('owner maps to {}'.format(inferName('owner')))
'''

'\ndef inferName(col_name):\n    # select a column name based on previous names found in file\n    \n    names = { \n        "subtype": "dr_subtype",\n        "jurisdicti": "dr_jurisdiction",\n        "drain__owner": "dr_owner",\n        "owner":"dr_owner",\n        "local__id": "dr_local_id",\n        "facilityid": "dr_facility_id",\n        "drain__jurisdiction": "dr_jurisdiction",\n        "subwatershed": "dr_subwatershed",\n        "subbasin": "dr_subwatershed",\n        "point__x":"dr_lon", \n        "long": "dr_lon",\n        "point__y":"dr_lat",\n        "lat":"dr_lat",\n        "soure__id": "del_source_id"}\n    \n    if not col_name in names:\n        # mark madeup names for easy id later\n        return \'del_{}\'.format(col_name)\n    return names[col_name]\n\nprint(\'owner maps to {}\'.format(inferName(\'owner\')))\n'

In [236]:
class Process():
    '''
    Base class for one step of the pipeline.

    Subclasses implement process(); callers invoke run(), which executes
    process() and returns the instance so calls can be chained, e.g.
    SomeStep(...).run().get_dataframe().
    '''

    def getClassName(self):
        '''Return the name of the concrete class of this instance.'''
        return self.__class__.__name__

    def process(self):
        '''
        Do the step's work. Must be overridden by every subclass.

        Raises NotImplementedError when not overridden. NotImplementedError
        is a subclass of Exception, so existing "except Exception" handlers
        still catch it.
        '''
        raise NotImplementedError('Overload process() in {}'.format(self.getClassName()))

    def run(self):
        '''Execute process() and return self to allow method chaining.'''
        self.process()
        return self
    
class Load(Process):
    '''Base class for loaders: remembers the source and holds the loaded result.'''

    def __init__(self, import_file_name):
        # Result slot; stays None until a subclass's process() fills it.
        self.dataframe = None
        # Full local file name, or url, of the source.
        self.import_file_name = import_file_name
        
class LoadDrains(Load):
    '''Loads one raw drains CSV file into a pandas dataframe.'''

    def get_app_name(self):
        '''
        returns application name: the last '/'-separated part of the
        current working directory
        '''
        return os.getcwd().split('/')[-1]

    def get_repo_folder(self):
        '''
        returns path to the repo folder: the current working directory with
        the '/<app name>' and '/scripts' parts removed
        '''
        without_app = os.getcwd().replace('/' + self.get_app_name(), '')
        return without_app.replace('/scripts', '')

    def get_raw_data_folder(self):
        '''
        returns path to the raw data folder for this application
        '''
        return self.get_repo_folder() + '/raw-data/' + self.get_app_name()

    def filename(self, in_f):
        '''
        returns the part of in_f after the last '/'
        '''
        return in_f.split('/')[-1]

    def get_dataframe(self):
        '''
        returns the loaded dataframe (None until run() has been called)
        '''
        return self.dataframe

    def process(self):
        '''
        import_file_name is the full path and name of import file
        stores the original raw data as pandas dataframe
        '''
        print('* Load Drains', self.filename(self.import_file_name))
        self.dataframe = pd.read_csv(self.import_file_name)
    

class LoadDataWorld(Load):
    '''
    creates a dataframe with a fresh copy of the data.world dataset

    import_file_name (see Load) is used as the data.world dataset key,
    e.g. 'citizenlabs/grb-storm-drains-2019-04-03'.
    dont forget to run
    '''

    def get_dataframe(self):
        '''returns the loaded dataframe (None until run() has been called)'''
        return self.dataframe

    def process(self):
        '''
        refreshes the local data.world cache for the dataset, then reads the
        cached grb_drains.csv into a pandas dataframe
        '''
        print('* Load Data.World')
        # Called for its side effect only:
        # download to ~/.dw/cache/{}/latest/data/grb_drains.csv
        dw.load_dataset(self.import_file_name, auto_update=True)
        # Read the cache of the dataset that was just refreshed. The key used
        # to be hard-coded here, so any other import_file_name would download
        # one dataset and read a different one.
        # NOTE(review): assumes import_file_name is in 'owner/dataset-id' form
        # (not a url) and that the table file is named grb_drains.csv.
        fstr = '~/.dw/cache/{}/latest/data/grb_drains.csv'.format(self.import_file_name)
        self.dataframe = pd.read_csv(fstr)
        
#test_import_file_name = '/Users/jameswilfong/Documents/Github/Wilfongjt/01-AAD-data-world/01-In-Progress/00-03-load-spring-lake/data.world/raw-data/adopt-a-drain/CatchBasins_7_17_2019.xls.csv'
# assert Load(testfile).get_app_name() == 'adopt-a-drain'
#print(Load().get_repo_folder())
#print(Load().get_raw_data_folder())
#print(helper.get_raw_files('csv'))
# assert Load().get_repo_folder() == 'adopt-a-drain'
#
#df=LoadCSV(test_import_file_name).run().get_dataframe()
#df.info
#df.head


#df=LoadDW('citizenlabs/grb-storm-drains-2019-04-03').run().get_dataframe()

#print(df)
#df.info
#df.head


In [237]:

class Condense(Process):
    '''
    Reduces the combined drains dataframe: drops helper columns, checks that
    the required columns are present, removes out-of-range rows and removes
    duplicate drains.
    '''

    def __init__(self, dataframe, expected_output_columns_list, extraColumns, outlier_settings):
        '''
        :param dataframe: pandas dataframe to condense
        :param expected_output_columns_list: column names that must be present
        :param extraColumns: column names to drop when present
        :param outlier_settings: dict with an 'outliers' list, see
                                 remove_obvious_outliers()
        '''
        self.dataframe = dataframe
        self.expected_output_columns_list = expected_output_columns_list
        self.extraColumns = extraColumns
        self.outlier_settings = outlier_settings

    def validateColumns(self):
        '''
        check the dataframe's column names for the expected column names

        Raises Exception naming the first expected column that is missing.
        '''
        print(' - validate columns')
        present = self.get_dataframe().columns.values
        for nm in self.expected_output_columns_list:
            if nm not in present:
                # The original message was built from an undefined bare
                # get_dataframe() call, so a missing column surfaced as a
                # NameError instead of this message.
                raise Exception('{} is missing from the dataframe columns'.format(nm))

    def removeExtraColumns(self):
        '''drop every column listed in extraColumns that the dataframe has'''
        print(' - Remove extra columns')
        for colname in self.extraColumns:
            if colname in self.get_dataframe().columns.values:
                print(' -- drop column: ', colname)
                self.set_dataframe(self.get_dataframe().drop([colname], axis=1))

    def remove_obvious_outliers(self):
        '''
        remove individual observations
        remove range of observation

        outlier_settings is
        {
          'outliers': [
            {'column':'scheduled_day',
             'range':(pd.to_datetime('2016-01-01'), pd.to_datetime('2017-01-01')),
             'reason':'Remove {} observations outside 2016.'},
            {'column':'lon',
             'range':(-50.0,-35.0),
             'reason':'Remove {} observations with bad longitudes.'},
            {'column':'kind',
             'categories':['a', 'b'],
             'reason':'Remove {} observations of other kinds.'}
          ]
        }

        'range' keeps rows where low <= value <= high; 'categories' keeps rows
        whose value is in the list. Each processed entry gets 'count' set to
        the number of rows removed, and '{}' in 'reason' is replaced by that
        count. The settings dict is updated in place, so on a second pass the
        already-formatted 'reason' text is left as it is.
        '''
        print(' - remove outliers')

        for outlier in self.outlier_settings['outliers']:
            col_name = outlier['column']
            current = self.get_dataframe()
            size_before = len(current)

            if 'range' in outlier:
                low = outlier['range'][0]
                high = outlier['range'][1]
                values = current[col_name]

                # Date bounds: convert the column before comparing. The
                # original called Series.to_datetime(), which does not exist,
                # and tested only for np.datetime64, which a pd.Timestamp
                # bound (what pd.to_datetime returns) is not.
                if isinstance(low, (np.datetime64, pd.Timestamp)):
                    values = pd.to_datetime(values)

                self.set_dataframe(current[(values >= low) & (values <= high)])
                outlier["count"] = size_before - len(self.get_dataframe())

            elif 'categories' in outlier:
                _list = outlier['categories']
                self.set_dataframe(current[current[col_name].isin(_list)])
                outlier["count"] = size_before - len(self.get_dataframe())

            if "reason" in outlier:
                # .get(): an entry with neither 'range' nor 'categories' and no
                # preset 'count' reports 0 instead of raising KeyError.
                outlier["reason"] = outlier["reason"].format(str(outlier.get("count", 0)))

    def remove_duplicates(self):
        '''keep the first row for each dr_asset_id and drop the rest'''
        print(' - drop duplicates')
        self.set_dataframe(self.get_dataframe().drop_duplicates('dr_asset_id', keep='first'))

    def get_dataframe(self):
        '''return the current dataframe'''
        return self.dataframe

    def set_dataframe(self, dataframe):
        '''replace the current dataframe'''
        self.dataframe = dataframe

    def process(self):
        '''
        run the condense steps in order: drop extra columns, validate
        columns, remove outliers, remove duplicates
        '''
        print('* Condense')
        self.removeExtraColumns()
        self.validateColumns()
        self.remove_obvious_outliers()
        self.remove_duplicates()
         

In [238]:
class Clean(Process):
    '''Base class for cleaning steps: holds the dataframe and the two lookup maps.'''

    def __init__(self, df_source, commonNameMap, region_map):
        # lookup maps supplied by the caller
        self.commonNameMap = commonNameMap
        self.region_map = region_map
        # dataframe to be cleaned
        self.dataframe = df_source
        
class CleanDrains(Clean):
    '''
    Cleans one raw drains dataframe: normalizes column names, maps them to
    the output names, and derives source_code, dr_asset_id and dr_type.
    '''

    def clean_column_names(self):
        '''
        convert each column name to lowercase with underscore separation

        Spaces and hyphens become underscores. Every upper-case letter is
        lower-cased, and an underscore is inserted before it unless it is the
        first character or directly follows another upper-case letter.

        e.g., ID to id
        e.g., FACILITYID to facilityid
        e.g., FacilityId to facility_id
        e.g., Drain_Owner to drain__owner
        e.g., County ID to county__id
        e.g., County-ID to county__id

        The doubled underscore in the last three is what the keys of
        commonNameMap (e.g. "drain__owner", "point__x") rely on.

        :return: copy of the dataframe with the renamed columns
        '''
        renamed = {}
        for cn in self.dataframe.columns:
            # Apply both replacements cumulatively. The original restarted
            # from cn for the hyphen step, so a name holding both a space and
            # a hyphen kept its spaces.
            ncn = cn.replace(' ', '_').replace('-', '_')

            cleaned = ''
            # Starts True so the first character never gets a leading '_'.
            prev_upper = True
            for c in ncn:
                if c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
                    cleaned += c.lower() if prev_upper else '_' + c.lower()
                    prev_upper = True
                else:
                    cleaned += c
                    prev_upper = False

            renamed[cn] = cleaned

        return self.dataframe.rename(columns=renamed)

    def inferName(self, col_name):
        '''
        select the output column name for col_name using commonNameMap

        Names that are not in the map are returned with a 'del_' prefix so
        they are easy to identify later.
        '''
        if col_name not in self.commonNameMap:
            # mark madeup names for easy id later
            return 'del_{}'.format(col_name)

        return self.commonNameMap[col_name]

    def getColumnDict(self):
        '''return {current column name: inferred output column name}'''
        return {nm: self.inferName(nm) for nm in self.dataframe.columns.values}

    def remove_char(self, columnList):
        '''
        some facility ids have characters mixed with numbers
        we need just the number part
        this function replaces every non-digit character with '0'
        (it does not drop the character), e.g. 'A12' becomes '012'

        NOTE(review): each item is passed through str() first, so a missing
        value (nan) becomes '000', and a float such as 123.0 becomes '12300'
        because '.' is replaced as well. Left unchanged here because asset
        ids already produced by this notebook depend on it -- confirm before
        changing.

        :param columnList: iterable of facility id values
        :return: list of digit-only strings, one per input item
        '''
        digits = '0123456789'
        return [''.join(ch if ch in digits else '0' for ch in str(item))
                for item in columnList]

    def regional_codes(self, df_source , _owner):
        '''
        regional codes identify the data's source community
        codes are added over time. this method checks each owner and raises
        an Exception when one is not found.
        fix by adding the new owner and code to region_map

        :param df_source: not used; the instance's own dataframe is read
        :param _owner: name of the column holding the owner names
        :return: list of regional codes, one per row
        '''
        rc = []

        # look at data in the _owner column
        for jur in self.dataframe[_owner]:
            if jur not in self.region_map:
                print('bad name', )
                raise Exception('Regional-Code for ({}) is not available... add new '.format(jur))
            rc.append(self.region_map[jur])

        return rc

    def get_dataframe(self):
        '''return the current dataframe'''
        return self.dataframe

    def process(self):
        '''
        clean up the dataframe given to the constructor
        '''
        self.dataframe = self.clean_column_names()
        self.dataframe = self.dataframe.rename(columns=self.getColumnDict())

        # patch up bad owner and jurisdiction names
        self.dataframe['dr_jurisdiction'] = self.dataframe['dr_owner'] # is what it is

        # mark all empties with nan (x != x is True only for nan)
        self.dataframe['dr_facility_id'] = self.dataframe['dr_facility_id'].apply(lambda x:  np.nan if x != x or x == '' or x == ' ' or x == None else x)

        # some dr_facilities have alpha numeric values ... clean up
        self.dataframe['dr_facility_id'] = self.remove_char(self.dataframe['dr_facility_id'])

        # add column to id the source of data records
        self.dataframe['source_code'] = self.regional_codes( self.dataframe , 'dr_owner')

        # convert type to integer
        self.dataframe['dr_facility_id'] = self.dataframe['dr_facility_id'].astype('int64')

        # create final id aka dr_asset_id
        self.dataframe['dr_asset_id'] = self.dataframe['source_code'] + '_'+ self.dataframe['dr_facility_id'].astype(str)

        # every row gets the same drain type
        self.dataframe['dr_type'] = 'Storm Water Inlet Drain'
        

In [239]:
class Wrangle(Process):
    '''
    Runs the whole pipeline: convert excel files, load data.world and raw
    files, clean, combine, condense and save the result as csv.

    process() reads these notebook globals, which must be defined before
    run() is called: dw_source, expected_output_columns_list, extraColumns,
    outlier_settings, metadata, local_config, cell_log and helper.
    '''

    def __init__(self, maps):
        '''
        :param maps: dict with the keys 'commonNameMap' and 'region_map'
        '''
        self.maps = maps
        # Defined up front so get_dataframe() returns None instead of
        # raising AttributeError when called before run().
        self.dataframe = None

    def get_dataframe(self):
        '''return the current dataframe (None until run() has been called)'''
        return self.dataframe

    def set_dataframe(self, dataframe):
        '''replace the current dataframe'''
        self.dataframe = dataframe

    def xls2csv(self, xls_name):
        '''
        generates a csv file from the first sheet in an excel file

        The csv is written next to the source as '<xls_name>.csv'.
        '''
        import xlrd

        wb = xlrd.open_workbook(xls_name)
        sh = wb.sheet_by_index(0)
        # 'with' closes the file even when a row fails to write; newline=''
        # is what the csv module asks for so it controls line endings itself.
        with open('{}.csv'.format(xls_name), 'w', encoding='utf8', newline='') as your_csv_file:
            wr = csv.writer(your_csv_file, quoting=csv.QUOTE_ALL)
            for rownum in range(sh.nrows):
                wr.writerow(sh.row_values(rownum))

    def validateOutputColumns(self):
        '''
        Raises Exception for the first dataframe column that is not one of
        the expected clean-data columns.
        '''
        expected_lst = ['facility_prefix', 'dr_subtype', 'dr_jurisdiction', 'dr_owner', 'dr_subwatershed', 'dr_lon', 'dr_lat', 'dr_asset_id', 'dr_type', 'dr_sync_id']

        # self. was missing here, so any call raised NameError.
        for nm in self.get_dataframe().columns.values:
            if nm not in expected_lst:
                raise Exception('{} is unexpected output for clean data'.format(nm))

    def filename(self, in_f):
        '''returns the part of in_f after the last "/"'''
        return in_f.split('/')[-1]

    def conversions(self):
        '''convert every xls file in the raw data folder to csv'''
        for xls in helper.get_raw_files('xls'):
            print(' -- convert ', xls)
            self.xls2csv(xls)

    def finalCSV(self):
        '''write the dataframe to local_config["local_clean"] without the index'''
        self.get_dataframe().to_csv(local_config["local_clean"], index=False)

    def process(self):
        '''
        * convert xls to csv
        * load the data.world dataset and every raw csv file
        * clean each raw file (column names, ids, regional codes)
        * combine, condense and save
        '''
        print('* Wrangle')
        print(' - raw folder ', helper.get_raw_data_folder())

        concat_list = []

        self.conversions() # convert excel files to csv

        # load these up first
        concat_list.append(LoadDataWorld(dw_source).run().get_dataframe())

        # load up the files in the raw data folder
        for in_f in helper.get_raw_files('csv'):

            # LOAD
            self.set_dataframe(LoadDrains(in_f).run().get_dataframe())

            # CLEAN
            self.set_dataframe(CleanDrains(self.get_dataframe(),
                                           self.maps['commonNameMap'],
                                           self.maps['region_map']).run().get_dataframe())

            # COMPILE
            concat_list.append(self.get_dataframe())

        # --------------------------------- combine datasets
        self.set_dataframe(pd.concat(concat_list))

        # --------------------------------- Condense dataset (cols, rows)
        self.set_dataframe(Condense(self.get_dataframe(),
                                    expected_output_columns_list,
                                    extraColumns,
                                    outlier_settings).run().get_dataframe())

        # --------------------------------- save csv
        # assume new file and remove old one
        local_config["local_clean"] = '{}/{}'.format(helper.get_clean_data_folder(), metadata['output_file_name'])

        if os.path.isfile(local_config["local_clean"]):
            os.remove(local_config['local_clean'])
            cell_log.collect('* deleted {} '.format(local_config['local_clean']))

        # NOTE(review): validateOutputColumns() is intentionally not called
        # here, as in the original, where the call was commented out.
        self.finalCSV()

## Wrangling Script

In [240]:
# NEW CELL
# testing 
# current dataset from dataworld
# dw_source is read as a global by Wrangle.process(), so it has to be defined
# before wrangle.run() is called.
dw_source = 'citizenlabs/grb-storm-drains-2019-04-03'
wrangle=Wrangle(maps)
cell_log.clear()
if ENV_ERROR:
    cell_log.collect("# Script Failure!!")
    cell_log.collect("# !!! Missing Environment Variables !!!")
    cell_log.collect("### see [Environment Variable Setup](#env-setup)")
else:
    # run the whole pipeline; the result stays available via wrangle.get_dataframe()
    wrangle.run()
    

* Wrangle
 - raw folder  /Users/jameswilfong/Documents/Github/Wilfongjt/01-AAD-data-world/01-In-Progress/04-notebook-grb-drains-validation/data.world/raw-data/adopt-a-drain
 -- convert  /Users/jameswilfong/Documents/Github/Wilfongjt/01-AAD-data-world/01-In-Progress/04-notebook-grb-drains-validation/data.world/raw-data/adopt-a-drain/CatchBasins_7_17_2019.xls
* Load Data.World
* Load Drains CatchBasins_7_17_2019.xls.csv
* Condense
 - Remove extra columns
 -- drop column:  del_source
 -- drop column:  del_fid
 -- drop column:  source_code
 -- drop column:  dr_local_id
 -- drop column:  dr_facility_id
 -- drop column:  dr_location
 - validate columns
 - remove outliers
 - drop duplicates


In [241]:
'''
# NEW CELL
# testing 
# current dataset from dataworld
dw_source = 'citizenlabs/grb-storm-drains-2019-04-03'

cell_log.clear()
if ENV_ERROR:
    cell_log.collect("# Script Failure!!")
    cell_log.collect("# !!! Missing Environment Variables !!!")
    cell_log.collect("### see [Environment Variable Setup](#env-setup)")
else:
    # get list of raw data files
    print('raw folder ', helper.get_raw_data_folder())
    # print(helper.get_raw_files('csv'))
    raw_folder = helper.get_raw_data_folder()
    clean_folder = helper.get_clean_data_folder()
    
    concat_list = []
    #* load data
    #* convert xls to csv
    #* fix column names
    #* map expected colums to raw-data columns
    #* drop drains without a facility id
    #* fix column types
    
    for xls in helper.get_raw_files('xls'):
        print('--------')
        print(xls)
        xls2csv(xls)
    

    # load these up first
    concat_list.append(LoadDataWorld(dw_source).run().get_dataframe())
    
    # load up the files in the raw data folder
    for in_f in helper.get_raw_files('csv'):
        print('-----------------------------------')
        print('raw: ', in_f)
        print('-----------------------------------')
        ##################
        # LOAD
        ######
        df_source = LoadDrains(in_f).run().get_dataframe()
        ##################
        # CLEAN
        ######
        df_source = CleanDrains(df_source, commonNameMap, region_map)\
            .run()\
            .get_dataframe()
        
        
        #--------------------------------- Compile a list of cleaned datasets
        
        concat_list.append(df_source)
    print('done loading files')
    #removeList, 
    
    cell_log.collect("")
    cell_log.collect("# Combined")
    
    #--------------------------------- combine datasets
          
    df_source = pd.concat(concat_list)
    
    
    #--------------------------------- Condense dataset (cols, rows)
    
    scnt = len(df_source)
    df_source = Condense(df_source, expected_output_columns_list, extraColumns, outlier_settings).process()
    ecnt = len(df_source)
    cell_log.collect('* duplicates: dropped {} duplicate asset ids'.format(scnt - ecnt))
   

    
    #--------------------------------- save csv 
    
    # assume new file and remove old one
    local_config["local_clean"]='{}/{}'.format(helper.get_clean_data_folder(),metadata['output_file_name'])

    if os.path.isfile(local_config["local_clean"]):
        os.remove(local_config['local_clean'])
        cell_log.collect('* deleted {} '.format(local_config['local_clean']))

    cell_log.collect("* inter-output: columns {}".format(df_source.columns.values))
    cell_log.collect('* inter-output: {} obs to {}'.format(len(df_source) , local_config["local_clean"]))

    # stop if columns are not expected
    validateOutputColumns(df_source)

    df_source.to_csv(local_config["local_clean"], index=False)
'''
#Markdown('''{}'''.format(cell_log.getMarkdown()))

'\n# NEW CELL\n# testing \n# current dataset from dataworld\ndw_source = \'citizenlabs/grb-storm-drains-2019-04-03\'\n\ncell_log.clear()\nif ENV_ERROR:\n    cell_log.collect("# Script Failure!!")\n    cell_log.collect("# !!! Missing Environment Variables !!!")\n    cell_log.collect("### see [Environment Variable Setup](#env-setup)")\nelse:\n    # get list of raw data files\n    print(\'raw folder \', helper.get_raw_data_folder())\n    # print(helper.get_raw_files(\'csv\'))\n    raw_folder = helper.get_raw_data_folder()\n    clean_folder = helper.get_clean_data_folder()\n    \n    concat_list = []\n    #* load data\n    #* convert xls to csv\n    #* fix column names\n    #* map expected colums to raw-data columns\n    #* drop drains without a facility id\n    #* fix column types\n    \n    for xls in helper.get_raw_files(\'xls\'):\n        print(\'--------\')\n        print(xls)\n        xls2csv(xls)\n    \n\n    # load these up first\n    concat_list.append(LoadDataWorld(dw_source).run

# Output new CSV File
* replacement for data.world and the production db
* the small version for the test db

In [242]:
# NOTE(review): consider moving this import to the imports cell at the top.
from shutil import copyfile
# OUTPUT_FILE_NAME
#/Users/jameswilfong/Documents/Github/CitizenLabs/00-Data-World/03-april-data/data.world/clean-data/adopt-a-drain/grb_drains-2019-04-02.csv

# ifn = '{}/{}'.format(helper.get_clean_data_folder(), OUTPUT_FILE_NAME)


# ifn: the dated file written by Wrangle; ofn: the undated copy of it
ifn = '{}/{}'.format(helper.get_clean_data_folder(), metadata['output_file_name'])
ofn = '{}/{}'.format(helper.get_clean_data_folder(), metadata['copy_file_name'])

print(ifn)
print(ofn)

copyfile(ifn, ofn)
# set up a smaller version of file
tfn = '{}/{}'.format(helper.get_test_version_folder(), metadata['copy_file_name'])
#df_small = df_source.query("dr_jurisdiction = 'City of Grand Rapids'")
# first 5000 rows whose jurisdiction is exactly 'City of Grand Rapids'
df_small=wrangle.get_dataframe().query("dr_jurisdiction == 'City of Grand Rapids'").head(5000)
df_small.to_csv( tfn, index=False)

/Users/jameswilfong/Documents/Github/Wilfongjt/01-AAD-data-world/01-In-Progress/04-notebook-grb-drains-validation/data.world/clean-data/adopt-a-drain/grb_drains-2019-08-024.csv
/Users/jameswilfong/Documents/Github/Wilfongjt/01-AAD-data-world/01-In-Progress/04-notebook-grb-drains-validation/data.world/clean-data/adopt-a-drain/grb_drains.csv


# Appendix - Data.World Names

## Keeping the names straight

| CSV Name      | Table Name    | Title          | Dataset ID      | Restful |
| :------------ |:------------- | :------------- | :-------------  | :------------- |
| xxxx_xx       | xxxx_xx       | Xxxx Xx        | xxxx-xx         |    ?     | 
| xxxx_xx       | xxxx_xx       | Xxxx_Xx        | xxxxxx          |    ?            |
| xxxx_xx       | xxxx_xx       | Xxxx-Xx        | xxxx-xx         |    ?         |
| xxxx-xx       | xxxx_xx       | Xxxx Xx        | xxxx-xx         |    ?         |
| xxxx-xx       | xxxx_xx       | Xxxx_Xx        | xxxxxx          |    ?         |
| xxxx-xx       | xxxx_xx       | Xxxx-Xx        | xxxx-xx         |    ?         |

* CSV Name is root of Table name
* Title is root of Dataset ID
* a space in Title will be automatically converted to hyphen in dataset id
* an underscore in Title will be removed in Dataset ID
* a hyphen in CSV Name will be replaced with underscore in Table Name
