In [1]:
%%javascript
// Override the output area's _should_scroll hook so it always answers false,
// whatever line count it is given (presumably this keeps long cell outputs
// from being collapsed into a scroll box -- confirm against the notebook UI).
IPython.OutputArea.prototype._should_scroll = (lines) => false;


<IPython.core.display.Javascript object>

In [2]:
# ProcessLogger is a project-local helper (lib/p3_ProcessLogger). The cells below use it
# to reset a message buffer (clear), append Markdown lines (collect) and fetch the
# collected text for rendering (getMarkdown).
from lib.p3_ProcessLogger import ProcessLogger
# One shared logger instance that every later cell clears and refills.
cell_log = ProcessLogger() 

# Import from GitHub
* the final table name comes from the original file name
* input data comes from GitHub
* output data goes to data.world
# Project: Adopt a Drain
## Table of Contents
* [Introduction](#intro)

* [Data Wrangling](#wrangling_steps)


<a id='intro'></a>
## Introduction
* why adopt a drain


<a id='prerequisites'></a>
## Prerequisites
* create [Github](#github) repository to hold raw data
* create [Data World](#data-world) account
* [Notebook Config](#notebook-config)
* [Environment Variable Setup](#env-setup)

<a id='data-world'></a>
## Data.world
* Set up an account
* The DW_AUTH_TOKEN value comes from your [data.world](https://data.world/) account, under Settings > Advanced > Admin.
* Application data is stored in data.world
* A data.world dataset is mostly read-only
* A data.world dataset is updated via file replacement


<a id='github'></a>
## Github

* raw-data is loaded from the remote source-data repo on Github
* raw-data is stored in the /raw-data folder of the source-data repo
* raw-data is pushed to the remote source-data repo before running this notebook

<a id='env-setup'></a>
## Environment Variable Setup
* Create a file named .env and put it in the /notebook folder
* .env must not be included in the GitHub repository; exclude it by listing .env in the .gitignore file
* Add the following environment variables to the .env file
    * DW_USER=your-data-world-user-name
    * GH_URL=https://raw.githubusercontent.com/Wilfongjt/source-data/master/raw-data/
    * DW_DB_URL=https://api.data.world/v0/datasets/wilfongjt/
    * DW_DB_RW_TOKEN=dataworld-token
    * DW_ADM_TOKEN=dataworld-adm-token


In [3]:
# Load every package the notebook needs and log each stage so the cell can
# render a short Markdown summary of what was loaded.
cell_log.clear()
cell_log.collect('## Load Packages')
# import dotenv
cell_log.collect('* Load environment variables')
# NOTE(review): the wildcard import hides which names this notebook takes from
# settings; presumably DW_USER, GH_URL and DW_DB_URL (used by getTableDef) come
# from here -- confirm and import them explicitly.
from settings import *
cell_log.collect('* Import third party packages')
# from exceptions import ApiException
# NOTE(review): datadotworld is imported here, but the pip install for it lives in
# a later cell, so on a fresh environment this cell fails before the install runs.
from datadotworld.client import _swagger
import datadotworld as dw

import numpy as np 
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import csv # read and write csv files
from IPython.display import display, HTML
from IPython.display import Markdown
from pprint import pprint
import time
import os

# convenience functions -- cleaning
cell_log.collect('* Import custom packages')
from lib.p3_CellCounts import CellCounts
import lib.p3_clean as clean
from lib.p3_configuration import get_configuration
import lib.p3_explore as explore
import lib.p3_gather as gather # gathering functions
import lib.p3_helper_functions as helper
import lib.p3_map as maps

# Last expression of the cell: render the collected log lines as Markdown output.
Markdown('''{}'''.format(cell_log.getMarkdown()))

settings


## Load Packages
* Load environment variables
* Import third party packages
* Import custom packages

In [4]:
# ------------ environment variable magic
# NOTE(review): a bare %env lists the environment variables of the kernel process;
# if the data.world tokens are loaded into the environment this may expose them in
# the saved notebook output -- confirm before sharing or committing.
%env

cell_log.clear()
cell_log.collect("<a id='notebook-config'></a>")
cell_log.collect("## Notebook Config")
# ------------ environment variable magic

# Install pip packages with the interpreter that runs the current Jupyter kernel
# (sys.executable), so they land in the kernel's own environment.
# ------------ Python-dotenv
cell_log.collect("* python-dotenv")
import sys
!{sys.executable} -m pip install python-dotenv
# ------------ data.world API 
cell_log.collect("* datadotworld")
# NOTE(review): neither install pins a version, so reruns may pick up different releases.
!{sys.executable} -m pip install datadotworld[pandas]

# Last expression of the cell: render the collected log lines as Markdown output.
Markdown('''{}'''.format(cell_log.getMarkdown()))

[33mYou are using pip version 9.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 9.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


<a id='notebook-config'></a>
## Notebook Config
* python-dotenv
* datadotworld

# Process
## Prepare Data
* download github repo with data
* put new file in raw-data/
* make copy of this jupyter notebook 
* configure to transform raw-data/ into clean-data/
* put clean data into clean/ folder
* push final changes to github
## Load Data
* 

<a id='wrangling_steps'></a>
# Data Wrangling


<a id='wrangle-process'></a>
## Process

# Download Data

In [5]:
def sibling_folder(folder_name, cwd=None):
    """Return the path of a folder that sits next to the working directory.

    The repository layout described in this notebook keeps ``notebook/``,
    ``raw-data/`` and ``clean-data/`` side by side, so the data folders are
    resolved relative to the parent of the directory the notebook runs in.

    The previous implementation used ``os.getcwd().replace('notebook', ...)``,
    which rewrote *every* occurrence of the text "notebook" in the path
    (for example a parent directory called ``notebooks``), and silently
    returned the working directory itself when the text was absent, making
    the raw and clean folders identical.

    Parameters
    ----------
    folder_name : str
        Name of the sibling folder, e.g. ``'raw-data'``.
    cwd : str, optional
        Directory to resolve against; defaults to ``os.getcwd()``.

    Returns
    -------
    str
        The sibling folder path with a trailing ``'/'`` so a file name can be
        appended by plain string concatenation.
    """
    base = os.path.normpath(os.getcwd() if cwd is None else cwd)
    return os.path.join(os.path.dirname(base), folder_name) + '/'


LOCAL_RAW_FOLDER = sibling_folder('raw-data')
LOCAL_CLEAN_FOLDER = sibling_folder('clean-data')
print('LOCAL_RAW_FOLDER: ', LOCAL_RAW_FOLDER)
print('LOCAL_CLEAN_FOLDER: ', LOCAL_CLEAN_FOLDER)

LOCAL_RAW_FOLDER:  /Users/jameswilfong/Documents/Github/Wilfongjt/source-data/raw-data/
LOCAL_CLEAN_FOLDER:  /Users/jameswilfong/Documents/Github/Wilfongjt/source-data/clean-data/


In [6]:
def getTableDef(table_name, ext='csv'):
    """Build the definition dict that describes one source table.

    Parameters
    ----------
    table_name : str
        Base name of the data file, without extension; also used as the
        dataset title.
    ext : str, optional
        File extension without the leading dot (default ``'csv'``).

    Returns
    -------
    dict
        Keys ``owner_id``, ``title``, ``gh_url``, ``visibility``, ``license``,
        ``files``, ``dw_url``, ``local_raw`` and ``local_clean``. The values are
        built from the module-level settings DW_USER, GH_URL, DW_DB_URL,
        LOCAL_RAW_FOLDER and LOCAL_CLEAN_FOLDER.
    """
    # Build the file name once so every entry agrees on the extension. The
    # "files" key used to hard-code 'csv' while its URL used ``ext``.
    file_name = table_name + '.' + ext
    return {
        "owner_id": DW_USER,
        "title": table_name,
        "gh_url": GH_URL + table_name,
        "visibility": "OPEN",
        "license": "Public Domain",
        "files": {file_name: {"url": GH_URL + file_name}},
        "dw_url": DW_DB_URL + file_name,
        "local_raw": LOCAL_RAW_FOLDER + file_name,
        "local_clean": LOCAL_CLEAN_FOLDER + file_name,
    }

def loadDataWorld(tbl_def):
    '''
        Create a data.world dataset from a table definition.

        tbl_def is a dict shaped like the one returned by getTableDef; the keys
        read here are:
            "owner_id":   data.world account that will own the dataset
            "title":      dataset title
            "visibility": e.g. "OPEN"
            "license":    e.g. "Public Domain"
            "files":      {file_name: {"url": source_url}}

        The call is made through the module-level ``api_client``.
        NOTE(review): ``api_client`` is not defined in the visible cells;
        presumably it is provided by ``from settings import *`` or should be
        created with the datadotworld package -- confirm.
    '''
    # The body previously read from an undefined name ``d_dic`` instead of the
    # ``tbl_def`` parameter, which raised NameError on a fresh kernel.
    api_client.create_dataset(
        owner_id=tbl_def["owner_id"],
        title=tbl_def["title"],
        visibility=tbl_def["visibility"],
        license=tbl_def["license"],
        files=tbl_def["files"]
    )
# def renameColumns(df,):    
#    df = df.rename(columns=clean_column_names)    

## Configure Process

In [12]:
# ------------- configure source csv
# Base name (no extension) of the raw file to process.
table_name = 'accidents'

# ------------- configure source csv
# One table definition per source file; the processing cell works from this list.
tables = [getTableDef(table_name)]

# ------------- configure outliers
# Each rule names a column, a (low, high) range and a human-readable reason; the
# rules are handed to clean.remove_obvious_outliers and the reasons are logged.
_outliers = {
    'outliers': [
        dict(column='trk_crnt_x_cord',
             range=(-90.0, -80.0),
             reason='Remove observations too far west or east.'),
        dict(column='trk_crnt_y_cord',
             range=(40.0, 50.0),
             reason='Remove observations too far north or south.'),
    ]
}

In [42]:
# Clean the configured raw CSV, write it to the clean-data folder, and log the
# manual git steps that follow. The collected log is rendered as Markdown at the end.
cell_log.clear()

cell_log.collect("# CSV Process")

# --------------------------------- input
for tbl in tables:
    cell_log.collect("* input:  {}".format(tbl["local_raw"]))

# --------------------------------- load data
# NOTE(review): every configured table is logged above, but only the first one
# is actually loaded and processed below.
tbl = tables[0]
df_source = pd.read_csv(tbl["local_raw"])

# --------------------------------- clean column names
cell_log.collect('* clean: Apply a style of lowercase and underscores to column names.')
df_source = clean.clean_column_names(df_source)

# --------------------------------- outliers
df_source = clean.remove_obvious_outliers(_outliers, df_source)
for r in _outliers['outliers']:
    cell_log.collect('* outlier: {}'.format(r['reason']))

# --------------------------------- save csv
# Assume a new file: remove any previous clean copy first so the removal is logged.
if os.path.isfile(tbl["local_clean"]):
    os.remove(tbl['local_clean'])
    cell_log.collect('* remove: ' + tbl['local_clean'])
cell_log.collect('* output: ' + tbl["local_clean"])
df_source.to_csv(tbl["local_clean"], index=False)

# --------------------------------- GIT Process
# These entries only record the git commands in the log; nothing here runs git.
cell_log.collect('')
cell_log.collect('# GIT Process')

# --------------------------------- input
cell_log.collect('* input: ' + tbl["local_clean"])

# --------------------------------- git add
cell_log.collect('* git add raw-data/ -A' )
cell_log.collect('* XXXXXXX git add clean-data/ -A' )
cell_log.collect('* XXXXXXX git add notebook/ -A' )

# --------------------------------- git commit
cell_log.collect('* XXXXXXX git commit -m "update raw-data, clean-data, and notebook files "' )

# --------------------------------- git push
# NOTE(review): this logs the clean file's local path where `git push origin`
# expects a branch name -- confirm the intended branch and correct the message.
cell_log.collect('* XXXXXXX git push origin {}'.format(tbl["local_clean"]) )

# --------------------------------- Data World Process
cell_log.collect('')
cell_log.collect('# Data.World Process')

# DataFrame.info() writes its report to stdout and returns None, so it is called
# on its own instead of being passed to print(), which used to emit "info:  None".
print('info: ')
df_source.info()
print('head: ',df_source.head())

# Last expression of the cell: render the collected log lines as Markdown output.
Markdown('''{}'''.format(cell_log.getMarkdown()))


* clean_column_names: 0.0005130767822265625 sec
* remove_obvious_outliers: 0.002479076385498047 sec
<class 'pandas.core.frame.DataFrame'>
Int64Index: 848 entries, 0 to 867
Data columns (total 29 columns):
trk_rec_type              848 non-null int64
trk_crsh_id               848 non-null int64
trk_unit_num              848 non-null int64
trk_carr_city             848 non-null object
trk_carr_st               845 non-null object
trk_carr_zip              847 non-null object
trk_carr_name_src_cd      841 non-null object
trk_intr_inta_ind         848 non-null int64
trk_gvwr                  843 non-null float64
trk_cdl_grp_cd            830 non-null float64
trk_cdl_exmp_cd           46 non-null float64
trk_med_card_ind          829 non-null float64
trk_trk_type_cd           829 non-null float64
trk_frst_unit_axle        843 non-null object
trk_scnd_unit_axle        352 non-null object
trk_crgo_body_type_cd     832 non-null float64
trk_hzrd_matl_plrd_ind    841 non-null float64
trk_hzrd_ma

# CSV Process
* input:  /Users/jameswilfong/Documents/Github/Wilfongjt/source-data/raw-data/accidents.csv
* clean: Apply a style of lowercase and underscores to column names.
* outlier: Remove observations too far west or east.
* outlier: Remove observations too far north or south.
* remove: /Users/jameswilfong/Documents/Github/Wilfongjt/source-data/clean-data/accidents.csv
* output: /Users/jameswilfong/Documents/Github/Wilfongjt/source-data/clean-data/accidents.csv

# GIT Process
* input: /Users/jameswilfong/Documents/Github/Wilfongjt/source-data/clean-data/accidents.csv
* git add raw-data/ -A
* git add clean-data/ -A
* git add notebook/ -A
* XXXXXXX git commit -m "update raw-data, clean-data, and notebook files "
* XXXXXXX git push origin /Users/jameswilfong/Documents/Github/Wilfongjt/source-data/clean-data/accidents.csv

# Data.World Process