# Data Downloading

Download the data using openClean

In [1]:
import gzip
import humanfriendly
import os

from openclean.data.source.socrata import Socrata

# Handle to the Socrata dataset 'ic3t-wcy2' and the local gzip file that caches it.
dataset = Socrata().dataset('ic3t-wcy2')
datafile = './ic3t-wcy2.tsv.gz'

# Download only when no local copy exists. The data is written to a temporary
# '.part' file and renamed once the write completes, so an interrupted download
# can never leave a truncated file that later runs would mistake for a full one.
if not os.path.isfile(datafile):
    partial_file = datafile + '.part'
    print('Downloading ...\n')
    try:
        with gzip.open(partial_file, 'wb') as f:
            dataset.write(f)
        os.replace(partial_file, datafile)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        if os.path.exists(partial_file):
            os.remove(partial_file)


# Report which dataset is in use and how large the cached file is.
fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print("Using '{}' in file {} of size {}".format(dataset.name, datafile, fsize))

Using 'DOB Job Application Filings' in file ./ic3t-wcy2.tsv.gz of size 257.98 MB


# Data Loading

Load the data into a pandas DataFrame and an openClean dataset stream object.

In [2]:
import pandas as pd
from openclean.pipeline import stream

# Two views of the same file: the openclean stream used for profiling, and a
# pandas DataFrame in which every column is read with dtype 'object'.
ds = stream(datafile)
df = pd.read_csv(
    datafile,
    sep='\t',
    dtype='object',
)

# Data Profiling for datetime columns

Find format problems and outliers in all datetime columns,
using openclean's sklearn modules to detect them.

In [7]:
from openclean.profiling.anomalies.sklearn import DBSCANOutliers

def findDateOutliers(column_name, eps_setting = 0.05, top_n = 10):
    """Print a frequency and outlier profile for one column of the stream ``ds``.

    The report consists of the ``top_n`` most frequent values with their
    counts, the number of distinct values, and two DBSCAN outlier lists: one
    from ``DBSCANOutliers()`` with its default settings and one created with
    ``eps=eps_setting``.

    Args:
        column_name: Name of the column in ``ds`` to profile.
        eps_setting: ``eps`` value passed to the second DBSCAN run.
        top_n: How many of the most frequent values to list (default 10).

    Returns:
        None. Everything is written to standard output.
    """
    datetime_data = ds.distinct(column_name)
    print("Column: ",column_name)

    for rank, (value, freq) in enumerate(datetime_data.most_common(top_n), start=1):
        print('{:<3} {:>8}  {:>10}'.format('{}.'.format(rank), value, '{:,}'.format(freq)))

    print('\nTotal number of distinct values in {} is {}'.format(column_name, len(datetime_data)))
    # First run uses the detector's defaults, the second the caller's eps, so
    # the two result lists can be compared side by side.
    print(DBSCANOutliers().find(datetime_data))
    print(DBSCANOutliers(eps = eps_setting).find(datetime_data))
    print('\n==================================')

In [8]:
# Treat every column whose name contains 'Date' or 'DATE' as a datetime column.
date_cols = [name for name in ds.columns if 'Date' in name or 'DATE' in name]

print("Datetime Data columns:\n")
for col in date_cols:
    print(col)

print("----------------------------\n")

# Profile each detected column, passing 0.02 as the eps setting.
for col in date_cols:
    findDateOutliers(col, 0.02)

Datetime Data columns:

Latest Action Date
Pre- Filing Date
DOBRunDate
SIGNOFF_DATE
SPECIAL_ACTION_DATE
----------------------------

Column:  Latest Action Date
1.  10/13/2017         793
2.  05/18/2017         775
3.  01/25/2017         744
4.  02/12/2018         739
5.  11/30/2016         734
6.  07/11/2016         716
7.  11/29/2016         700
8.  04/30/2018         699
9.  05/04/2017         686
10. 04/20/2016         674

Total number of distinct values in Latest Action Date is 7373
['06//1403']
['09/15/2003', '01/22/2020', '11/30/2016', '02/22/2000', '02/20/2002', '03/16/2019', '02/22/2002', '11/10/2011', '06/18/2003', '05/18/2017', '02/22/2020', '02/27/2020', '02/12/2018', '05/09/2017', '05/04/2017', '02/05/2020', '06//1403', '02/04/2020', '2020-02-20 ', '04/25/2016', '02/02/2020', '01/25/2017', '06/24/2019', '03/26/2019', '02/02/2000', '08/30/2016', '2018-12-10 ', '07/11/2016', '02/20/2020', '04/16/2019', '02/11/2020', '11/29/2016', '02/02/2002', '2019-05-24 ', '2019-05-17 ',

# Analysis

The results above reveal the following problems to address in the data cleaning task:
    
### Latest Action Date
outliers: '06//1403'
format: 'yyyy-mm-dd' and 'mm/dd/yyyy'

### Pre- Filing Date
no problem found

### DOBRunDate
format: 'yyyy-mm-dd' and 'mm/dd/yyyy 00:00:00'

### SIGNOFF_DATE
outliers: empty value

### SPECIAL_ACTION_DATE
outliers: empty value and '11//2006'

# Data Cleaning for datetime columns

* How to deal with empty values has not been decided yet.

In [70]:
# Clean an independent copy so the raw frame `df` stays untouched. A plain
# `new_df = df` only binds a second name to the same object, so every cleaning
# step would also rewrite `df` and the file would have to be reloaded from disk
# to get the raw data back.
new_df = df.copy()

In [71]:
# Blank out the malformed value '06//1403', then rewrite the column as
# mm/dd/yyyy strings.
latest_action = new_df['Latest Action Date'].replace('06//1403', '')
new_df['Latest Action Date'] = pd.to_datetime(latest_action).dt.strftime('%m/%d/%Y')

In [72]:
# Parse DOBRunDate and rewrite the column as mm/dd/yyyy strings.
dob_run_date = pd.to_datetime(new_df['DOBRunDate'])
new_df['DOBRunDate'] = dob_run_date.dt.strftime('%m/%d/%Y')

In [73]:
# Blank out the malformed value '11//2006', then rewrite the column as
# mm/dd/yyyy strings.
special_action = new_df['SPECIAL_ACTION_DATE'].replace('11//2006', '')
new_df['SPECIAL_ACTION_DATE'] = pd.to_datetime(special_action).dt.strftime('%m/%d/%Y')