In [1]:
file_name = 'nyc_parking_tickets_extract.csv'

In [4]:
with open(file_name) as f:
    for _ in range(10):
        print(next(f))

Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Violation Description

4006478550,VAD7274,VA,PAS,10/5/2016,5,4D,BMW,BUS LANE VIOLATION

4006462396,22834JK,NY,COM,9/30/2016,5,VAN,CHEVR,BUS LANE VIOLATION

4007117810,21791MG,NY,COM,4/10/2017,5,VAN,DODGE,BUS LANE VIOLATION

4006265037,FZX9232,NY,PAS,8/23/2016,5,SUBN,FORD,BUS LANE VIOLATION

4006535600,N203399C,NY,OMT,10/19/2016,5,SUBN,FORD,BUS LANE VIOLATION

4007156700,92163MG,NY,COM,4/13/2017,5,VAN,FRUEH,BUS LANE VIOLATION

4006687989,MIQ600,SC,PAS,11/21/2016,5,VN,HONDA,BUS LANE VIOLATION

4006943052,2AE3984,MD,PAS,2/1/2017,5,SW,LINCO,BUS LANE VIOLATION

4007306795,HLG4926,NY,PAS,5/30/2017,5,SUBN,TOYOT,BUS LANE VIOLATION



In [10]:
with open(file_name) as f:
    column_headers = next(f).strip('\n').split(',')
    sample_data = next(f).strip('\n').split(',')

In [12]:
column_headers

['Summons Number',
 'Plate ID',
 'Registration State',
 'Plate Type',
 'Issue Date',
 'Violation Code',
 'Vehicle Body Type',
 'Vehicle Make',
 'Violation Description']

In [14]:
sample_data

['4006478550',
 'VAD7274',
 'VA',
 'PAS',
 '10/5/2016',
 '5',
 '4D',
 'BMW',
 'BUS LANE VIOLATION']

In [16]:
column_names = [header.replace(' ','_').lower()
                for header in column_headers]

In [18]:
column_names

['summons_number',
 'plate_id',
 'registration_state',
 'plate_type',
 'issue_date',
 'violation_code',
 'vehicle_body_type',
 'vehicle_make',
 'violation_description']

In [22]:
list(zip(column_names,sample_data))

[('summons_number', '4006478550'),
 ('plate_id', 'VAD7274'),
 ('registration_state', 'VA'),
 ('plate_type', 'PAS'),
 ('issue_date', '10/5/2016'),
 ('violation_code', '5'),
 ('vehicle_body_type', '4D'),
 ('vehicle_make', 'BMW'),
 ('violation_description', 'BUS LANE VIOLATION')]

'summons_number',  --int
'plate_id', --str
'registration_state',--str
'plate_type',--str
'issue_date',--date
'violation_code',--int
'vehicle_body_type',--str
'vehicle_make', --str
'violation_description' --str

In [23]:
from collections import namedtuple

In [126]:
Ticket = namedtuple('Ticket',column_names)

In [127]:
with open(file_name) as f:
    next(f)
    raw_data_row = next(f)

In [102]:
raw_data_row

'4006478550,VAD7274,VA,PAS,10/5/2016,5,4D,BMW,BUS LANE VIOLATION\n'

In [103]:
def read_data():
    with open(file_name) as f:
        next(f)
        yield from f

In [104]:
raw_data = read_data()
for _ in range(5):
    print(next(raw_data))

4006478550,VAD7274,VA,PAS,10/5/2016,5,4D,BMW,BUS LANE VIOLATION

4006462396,22834JK,NY,COM,9/30/2016,5,VAN,CHEVR,BUS LANE VIOLATION

4007117810,21791MG,NY,COM,4/10/2017,5,VAN,DODGE,BUS LANE VIOLATION

4006265037,FZX9232,NY,PAS,8/23/2016,5,SUBN,FORD,BUS LANE VIOLATION

4006535600,N203399C,NY,OMT,10/19/2016,5,SUBN,FORD,BUS LANE VIOLATION



In [105]:
def parse_int(value,*,default = None):
    try:
        return int(value)
    except ValueError:
        return default

In [106]:
parse_int('test',default='not an integer')

'not an integer'

In [107]:
parse_int(10,default = 'not an integer')

10

In [108]:
from datetime import datetime


def parse_date(value,*,default = None):
    date_format = '%m/%d/%Y'
    try:
        return datetime.strptime(value,date_format).date()
    except ValueError:
        return default

In [109]:
parse_int('hello',default ='N/A')

'N/A'

In [110]:
parse_date('3/28/2020')

datetime.date(2020, 3, 28)

In [111]:
parse_date('234234',default='N/A')

'N/A'

In [112]:
def parse_string(value,*,default=None):
    try:
        cleaned = value.strip()
        if not cleaned:
            return default
        else:
            return cleaned
        
    except ValueError:
        return default

In [113]:
parse_string('   hello    ')

'hello'

In [114]:
parse_string('     ',default = 'N/A')

'N/A'

In [115]:
column_names

['summons_number',
 'plate_id',
 'registration_state',
 'plate_type',
 'issue_date',
 'violation_code',
 'vehicle_body_type',
 'vehicle_make',
 'violation_description']

In [82]:
from functools import partial

column_parsers = (parse_int,
                 parse_string,
                 lambda x : parse_string(x,default=''),
                 partial(parse_string,default =''),
                 parse_date,
                 parse_int,
                 partial(parse_string,default=''),
                 parse_string,
                 lambda x :parse_string(x,default='')
                 )

In [117]:
def parse_row(row):
    fields = row.strip('\n').split(',')
    parsed_data = (func(field) for func, field in zip(column_parsers,fields))
    return parsed_data

In [118]:
rows = read_data()
for _ in range(5):
    row = next(rows)
    parsed_data = parse_row(row)
    print(list(parsed_data))

[4006478550, 'VAD7274', 'VA', 'PAS', datetime.date(2016, 10, 5), 5, '4D', 'BMW', 'BUS LANE VIOLATION']
[4006462396, '22834JK', 'NY', 'COM', datetime.date(2016, 9, 30), 5, 'VAN', 'CHEVR', 'BUS LANE VIOLATION']
[4007117810, '21791MG', 'NY', 'COM', datetime.date(2017, 4, 10), 5, 'VAN', 'DODGE', 'BUS LANE VIOLATION']
[4006265037, 'FZX9232', 'NY', 'PAS', datetime.date(2016, 8, 23), 5, 'SUBN', 'FORD', 'BUS LANE VIOLATION']
[4006535600, 'N203399C', 'NY', 'OMT', datetime.date(2016, 10, 19), 5, 'SUBN', 'FORD', 'BUS LANE VIOLATION']


In [119]:
all([10,'hello'])

True

In [120]:
all([10,'hello',None])

False

In [90]:
all([10,'hello',''])

False

In [121]:
l = [10,'',0]
[item is not None for item in l]

[True, True, True]

In [122]:
all(item is not None for item in l)

True

In [130]:
def parse_row(row,*,default= None):
    fields = row.strip('\n').split(',')
    parsed_data = [func(field) for func, field in zip(column_parsers,fields)]
    if all(item is not None for item in parsed_data):
        return Ticket(*parsed_data)
    else:
        return default

In [131]:
row = read_data()
for _ in range(5):
    row = next(rows)
    parsed_data = parse_row(row)
    print(parsed_data)

Ticket(summons_number=4007306795, plate_id='HLG4926', registration_state='NY', plate_type='PAS', issue_date=datetime.date(2017, 5, 30), violation_code=5, vehicle_body_type='SUBN', vehicle_make='TOYOT', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4007124590, plate_id='T715907C', registration_state='NY', plate_type='OMT', issue_date=datetime.date(2017, 4, 3), violation_code=5, vehicle_body_type='SUBN', vehicle_make='TOYOT', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=5096061966, plate_id='HRC9475', registration_state='NY', plate_type='PAS', issue_date=datetime.date(2017, 4, 18), violation_code=7, vehicle_body_type='SUBN', vehicle_make='CADIL', violation_description='FAILURE TO STOP AT RED LIGHT')
Ticket(summons_number=5094070400, plate_id='DYP8042', registration_state='NY', plate_type='PAS', issue_date=datetime.date(2016, 10, 26), violation_code=7, vehicle_body_type='SUBN', vehicle_make='CHEVR', violation_description='FAILURE TO STOP AT RED LIG

In [136]:
for row in read_data():
    parsed_row = parse_row(row)
    if parsed_row is None:
        print(list(zip(column_names,row.strip('\n').split(' '))),end = '\n\n')

[('summons_number', '1413358512,54295PC,NY,APP,8/9/2016,19,BUS,,')]

[('summons_number', '1418425369,JYW5248,PA,PAS,3/21/2017,21,SDN,,')]

[('summons_number', '1406925068,19358JU,99,COM,8/23/2016,46,DELV,,')]

[('summons_number', '8546468965,37489BB,NY,OMR,6/12/2017,46,BUS,,46A-Double'), ('plate_id', 'Parking'), ('registration_state', '(Non-COM)')]

[('summons_number', '1406927442,BLANKPLATE,99,999,8/20/2016,46,SDN,,')]



In [138]:
def parsed_data():
    for row in read_data():
        parsed = parse_row(row)
        if parsed:
            yield parsed

In [140]:
parsed_rows = parsed_data()
for _ in range(5):
    print(next(parsed_rows))

Ticket(summons_number=4006478550, plate_id='VAD7274', registration_state='VA', plate_type='PAS', issue_date=datetime.date(2016, 10, 5), violation_code=5, vehicle_body_type='4D', vehicle_make='BMW', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4006462396, plate_id='22834JK', registration_state='NY', plate_type='COM', issue_date=datetime.date(2016, 9, 30), violation_code=5, vehicle_body_type='VAN', vehicle_make='CHEVR', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4007117810, plate_id='21791MG', registration_state='NY', plate_type='COM', issue_date=datetime.date(2017, 4, 10), violation_code=5, vehicle_body_type='VAN', vehicle_make='DODGE', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4006265037, plate_id='FZX9232', registration_state='NY', plate_type='PAS', issue_date=datetime.date(2016, 8, 23), violation_code=5, vehicle_body_type='SUBN', vehicle_make='FORD', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4