In [6]:
%%javascript
// Replace OutputArea's _should_scroll method with a stub that returns false
// regardless of the line count it is given.
// NOTE(review): presumably this keeps long cell outputs from being collapsed
// into a scrolling box; it relies on a global `IPython` object being present
// in the browser -- confirm on the notebook front end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}



<IPython.core.display.Javascript object>

# Project: Adopt a Drain Data
 * Author: James Wilfong, wilfongjt@gmail.com

# Goals
* Check the data.world drains dataset for duplicate `dr_asset_id` values

## Limits
 * Read-only check: the dataset is not updated; duplicates are only reported
 
## Assumptions
* Assumes the data.world dataset contains a `grb_drains` table with a `dr_asset_id` column


## Inputs
 * The current live dataset is downloaded from data.world 


## Process
  The process is initiated by running this Jupyter Notebook.
 * load current drains dataset from data.world
 * look at field dr_asset_id for duplicates

## Outputs
 * No files are written; duplicate `dr_asset_id` values and their counts are printed in the notebook
 
 
## Next Steps
 

In [7]:
# %matplotlib notebook
import settings
from IPython.display import display, HTML
from IPython.display import Markdown
from lib.p3_ProcessLogger import ProcessLogger

import helper

In [8]:

import csv  # read and write csv files
import os
import sys
import time
from collections import Counter
from pprint import pprint

import numpy as np
import pandas as pd

import datadotworld as dw

# import subprocess

# convenience functions -- cleaning
# cell_log.collect('* Import custom packages')
#from lib.p3_CellCounts import CellCounts
# import lib.p3_clean as clean
#from lib.p3_configuration import get_configuration
#import lib.p3_explore as explore
#import lib.p3_gather as gather # gathering functions
# import lib.p3_helper_functions as helper
#import lib.p3_map as maps

# IMPORTANT
## Configuring the Data Transfer
Configure the values in the next cell before choosing "Run All" from the Cell menu.

In [9]:
# Run configuration for this notebook (notes carried over from the original):
#   Input:   CSV file in raw_data/ folder
#   Process: clean (conform, condense)
#   Output:  is directed to the clean-data/ folder
#
# metadata['output_file_name'] names the dated output file and
# metadata['table_name'] names the data.world table.
# Table names should start with a letter and may contain letters, numbers
# and underscores.
cell_log = ProcessLogger()
cell_log.clear()
table_name = 'grb_drains'

metadata = {
    # The template has two placeholders; the stray third 'csv' argument that
    # str.format silently ignored has been dropped.
    # NOTE(review): the recorded run shows a day stamp of '2019-08-024' --
    # confirm that helper.get_daystamp() produces the intended format.
    'output_file_name': '{}-{}.csv'.format(table_name, helper.get_daystamp()),
    'copy_file_name': '{}.csv'.format(table_name),
    'gh_file_type': 'csv',
    'title': 'GRB Storm Drains',
    'desc': 'Storm Drains of the Grand River Basin, Michigan',
    'table_name': table_name
}

# Disabled: export of the maintainer configuration.
# helper.exportMaintainerConfig(metadata['output_file_name'],
#                               metadata['gh_file_type'],
#                               metadata['title'],
#                               metadata['desc'],
#                               metadata['table_name'])

print('Metadata =============')
pprint(metadata)


{'copy_file_name': 'grb_drains.csv',
 'desc': 'Storm Drains of the Grand River Basin, Michigan',
 'gh_file_type': 'csv',
 'output_file_name': 'grb_drains-2019-08-024.csv',
 'table_name': 'grb_drains',
 'title': 'GRB Storm Drains'}


In [10]:
# Placeholder cell for assembling the names of the application, the raw data
# file and the clean data file, and for configuring outliers. Nothing is
# assembled here at present.

# Failure flag consulted by the reporting cell and by the wrangling script;
# hard-wired to "no error".
ENV_ERROR = False

In [11]:
# When the environment flag is set, queue the failure banner in the cell log.
if ENV_ERROR:
    for banner_line in (
        "# Script Failure!!",
        "# !!! Missing Environment Variables !!!",
        "### see [Environment Variable Setup](#env-setup)",
    ):
        cell_log.collect(banner_line)

# Final expression of the cell: render the log's markdown as the cell output.
Markdown('{}'.format(cell_log.getMarkdown()))



In [37]:
class Process:
    '''
    Base class for a processing step.

    Subclasses implement process(); callers invoke run(), which executes
    process() and returns the step itself so that calls can be chained,
    e.g. Step(...).run().some_getter().
    '''

    def getClassName(self):
        '''Return the name of the concrete class of this instance.'''
        return self.__class__.__name__

    def process(self):
        '''
        Do the work of the step. Must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base implementation.
        '''
        # NotImplementedError is the conventional signal for an abstract
        # method; it derives from Exception, so handlers written for the
        # previous bare Exception still catch it.
        raise NotImplementedError('Overload process() in {}'.format(self.getClassName()))

    def run(self):
        '''Execute process() and return self.'''
        self.process()
        return self
    
class Load(Process):
    '''
    Base for steps that load data from a named source.

    The constructor records the source and starts with no dataframe;
    filling self.dataframe is left to subclasses.
    '''

    def __init__(self, import_file_name):
        '''
        import_file_name: full local file name or url to source
        '''
        self.dataframe = None
        self.import_file_name = import_file_name
        
class LoadDrains(Load):
    '''
    Loads a CSV file into a pandas dataframe and derives project folder
    names from the current working directory.
    '''

    def get_app_name(self):
        '''
        returns application name: the last component of the current
        working directory
        '''
        # basename honours the platform's path separator, unlike split('/')
        return os.path.basename(os.getcwd())

    def get_repo_folder(self):
        '''
        returns path to the repo folder: the current working directory
        without its last component (the application folder) and, when the
        remaining path ends in 'scripts', without that folder as well
        '''
        # Only trailing components are removed. The earlier str.replace
        # approach removed every occurrence of '/<app>' and '/scripts'
        # anywhere in the path, corrupting paths such as
        # '/home/app/repo/scripts/app'.
        repo_folder = os.path.dirname(os.getcwd())
        if os.path.basename(repo_folder) == 'scripts':
            repo_folder = os.path.dirname(repo_folder)
        return repo_folder

    def get_raw_data_folder(self):
        '''
        returns path to raw data: <repo folder>/raw-data/<application name>
        '''
        return os.path.join(self.get_repo_folder(), 'raw-data', self.get_app_name())

    def get_dataframe(self):
        '''
        returns the dataframe stored by process(), or None before run()
        '''
        return self.dataframe

    def process(self):
        '''
        reads the CSV named by self.import_file_name into self.dataframe;
        nothing is returned -- use get_dataframe() to fetch the result
        '''
        self.dataframe = pd.read_csv(self.import_file_name)
    

class LoadDataWorld(Load):
    '''
    creates a dataframe with a fresh copy of a data.world dataset;
    import_file_name is the dataset key passed to dw.load_dataset
    (e.g. 'citizenlabs/grb-storm-drains-2019-04-03')
    dont forget to run
    '''

    # Name of the table (CSV file without extension) read from the
    # downloaded dataset; override on a subclass or instance to read another.
    table_name = 'grb_drains'

    def get_dataframe(self):
        '''
        returns the dataframe stored by process(), or None before run()
        '''
        return self.dataframe

    def process(self):
        '''
        refreshes the local copy of the dataset, reads the table's CSV from
        the local cache into self.dataframe and prints the dataframe summary;
        nothing is returned -- use get_dataframe() to fetch the result
        '''
        # The call is made for its download side effect only; the returned
        # dataset object is deliberately not stored in self.dataframe.
        dw.load_dataset(self.import_file_name, auto_update=True)
        # Build the cache path from the requested dataset key rather than a
        # hard-coded one, so the file read always matches the dataset loaded.
        cached_csv = '~/.dw/cache/{}/latest/data/{}.csv'.format(
            self.import_file_name, self.table_name)
        self.dataframe = pd.read_csv(cached_csv)
        self.dataframe.info()
        

class UniqueList(Process):
    '''
    Reports the values that occur more than once in a collection.

    After run(), self.duplicates maps each repeated value to the number of
    times it occurs, in order of first appearance. The same pairs are
    printed, one "value count" line each.
    '''

    def __init__(self, _list):
        '''
        _list: iterable of hashable values to check
        '''
        self.list = _list
        # repeated value -> occurrence count; filled in by process()
        self.duplicates = {}

    def process(self):
        '''
        counts the values, stores the repeated ones in self.duplicates and
        prints each repeated value with its count
        '''
        # iter() makes Counter count the items produced by iteration even if
        # a mapping is passed (a mapping would otherwise be read as counts).
        counts = Counter(iter(self.list))
        self.duplicates = {
            value: count for value, count in counts.items() if count > 1
        }
        for value, count in self.duplicates.items():
            print(value, count)
    
    

## Wrangling Script

In [38]:
# Duplicate check against the current drains dataset on data.world.
dw_file = 'citizenlabs/grb-storm-drains-2019-04-03'
# Local cache path of the downloaded table; not used in this cell, kept in
# case other cells refer to it.
in_f = '~/.dw/cache/citizenlabs/grb-storm-drains-2019-04-03/latest/data/grb_drains.csv'
cell_log.clear()
if ENV_ERROR:
    cell_log.collect("# Script Failure!!")
    cell_log.collect("# !!! Missing Environment Variables !!!")
    cell_log.collect("### see [Environment Variable Setup](#env-setup)")
    # Show the collected failure messages; previously they were collected
    # but never displayed, so a failed run looked like an empty success.
    display(Markdown('''{}'''.format(cell_log.getMarkdown())))
else:
    ##################
    # LOAD
    ######
    df_source = LoadDataWorld(dw_file).run().get_dataframe()

    ##################
    # check dups: prints every dr_asset_id that occurs more than once
    ######
    UniqueList(df_source['dr_asset_id']).run()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45527 entries, 0 to 45526
Data columns (total 9 columns):
dr_asset_id        45527 non-null object
dr_jurisdiction    45527 non-null object
dr_lat             45527 non-null float64
dr_lon             45527 non-null float64
dr_owner           45527 non-null object
dr_subtype         45527 non-null float64
dr_subwatershed    45405 non-null object
dr_type            45527 non-null object
dr_location        45527 non-null object
dtypes: float64(3), object(6)
memory usage: 3.1+ MB
GRANDV_40159311 2
GRANDV_40159299 2


# Appendix - Data.World Names