# Project for Generators

- Create a lazy iterator that will return a named tuple of the data in each row. 
- The data types should be appropriate - i.e. if the column is a date, you should be storing dates in the named tuple, if the field is an integer, then it should be stored as an integer, etc.
- Calculate the number of violations by car make.

In [1]:
file_name = "nyc_parking_tickets_extract.csv"

In [2]:
# Preview the first ten lines of the file: the header plus nine records.
with open(file_name) as f:
    for _ in range(10):
        # Each line keeps its trailing '\n', so print() leaves a blank line
        # after every row in the output.
        print(next(f))

Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Violation Description

4006478550,VAD7274,VA,PAS,10/5/2016,5,4D,BMW,BUS LANE VIOLATION

4006462396,22834JK,NY,COM,9/30/2016,5,VAN,CHEVR,BUS LANE VIOLATION

4007117810,21791MG,NY,COM,4/10/2017,5,VAN,DODGE,BUS LANE VIOLATION

4006265037,FZX9232,NY,PAS,8/23/2016,5,SUBN,FORD,BUS LANE VIOLATION

4006535600,N203399C,NY,OMT,10/19/2016,5,SUBN,FORD,BUS LANE VIOLATION

4007156700,92163MG,NY,COM,4/13/2017,5,VAN,FRUEH,BUS LANE VIOLATION

4006687989,MIQ600,SC,PAS,11/21/2016,5,VN,HONDA,BUS LANE VIOLATION

4006943052,2AE3984,MD,PAS,2/1/2017,5,SW,LINCO,BUS LANE VIOLATION

4007306795,HLG4926,NY,PAS,5/30/2017,5,SUBN,TOYOT,BUS LANE VIOLATION



In [4]:
# Grab the header line and the first record as lists of raw string fields.
with open(file_name) as f:
    # First line of the file: the column titles.
    column_headers = next(f).strip('\n').split(',')
    # Second line of the file: one record, used as a sample for choosing types.
    sample_data = next(f).strip('\n').split(',')

In [5]:
column_headers

['Summons Number',
 'Plate ID',
 'Registration State',
 'Plate Type',
 'Issue Date',
 'Violation Code',
 'Vehicle Body Type',
 'Vehicle Make',
 'Violation Description']

In [6]:
sample_data

['4006478550',
 'VAD7274',
 'VA',
 'PAS',
 '10/5/2016',
 '5',
 '4D',
 'BMW',
 'BUS LANE VIOLATION']

In [7]:
# Turn the CSV titles into namedtuple-friendly field names:
# lowercase, with spaces replaced by underscores.
column_names = [title.lower().replace(' ', '_') for title in column_headers]

In [8]:
column_names

['summons_number',
 'plate_id',
 'registration_state',
 'plate_type',
 'issue_date',
 'violation_code',
 'vehicle_body_type',
 'vehicle_make',
 'violation_description']

In [9]:
list(zip(column_names, sample_data))

[('summons_number', '4006478550'),
 ('plate_id', 'VAD7274'),
 ('registration_state', 'VA'),
 ('plate_type', 'PAS'),
 ('issue_date', '10/5/2016'),
 ('violation_code', '5'),
 ('vehicle_body_type', '4D'),
 ('vehicle_make', 'BMW'),
 ('violation_description', 'BUS LANE VIOLATION')]

### Data Type

- 'summons_number': `int`
- 'plate_id': `str`
- 'registration_state': `str`
- 'plate_type': `str`
- 'issue_date': `date`
- 'violation_code': `int`
- 'vehicle_body_type': `str`
- 'vehicle_make': `str`
- 'violation_description': `str`

In [10]:
from collections import namedtuple

In [11]:
Ticket = namedtuple('Ticket', column_names)

In [12]:
issubclass(Ticket, tuple)

True

In [13]:
with open(file_name) as f:
    next(f)
    raw_data_row = next(f)

In [14]:
raw_data_row

'4006478550,VAD7274,VA,PAS,10/5/2016,5,4D,BMW,BUS LANE VIOLATION\n'

In [15]:
def read_data(path=None):
    """Lazily yield the raw data lines of a tickets CSV, skipping the header.

    Args:
        path: CSV file to read. When omitted, the notebook-level ``file_name``
            is used, so existing ``read_data()`` calls behave as before.

    Yields:
        str: each line after the header, trailing newline included.
    """
    with open(file_name if path is None else path) as f:
        # Giving next() a default keeps an empty file from raising
        # StopIteration inside the generator, which Python 3.7+ converts
        # into a RuntimeError (PEP 479). An empty file now yields nothing.
        next(f, None)
        yield from f

In [16]:
raw_data = read_data()

In [17]:
raw_data

<generator object read_data at 0x00000160560ECAC0>

In [15]:
for _ in range(5):
    print(next(raw_data))

4006478550,VAD7274,VA,PAS,10/5/2016,5,4D,BMW,BUS LANE VIOLATION

4006462396,22834JK,NY,COM,9/30/2016,5,VAN,CHEVR,BUS LANE VIOLATION

4007117810,21791MG,NY,COM,4/10/2017,5,VAN,DODGE,BUS LANE VIOLATION

4006265037,FZX9232,NY,PAS,8/23/2016,5,SUBN,FORD,BUS LANE VIOLATION

4006535600,N203399C,NY,OMT,10/19/2016,5,SUBN,FORD,BUS LANE VIOLATION



In [19]:
def parse_int(value, *, default=None):
    """Convert *value* to ``int``, falling back to *default* on failure.

    Args:
        value: The raw field, normally a string read from the CSV.
        default: Returned when *value* cannot be converted.

    Returns:
        The integer value, or *default* if conversion is not possible.
    """
    try:
        return int(value)
    except (ValueError, TypeError):
        # ValueError: text that is not an integer literal ('' or 'hello').
        # TypeError: an object int() cannot accept at all, such as None.
        return default

In [20]:
from datetime import datetime

def parse_date(value, *, default=None, date_format='%m/%d/%Y'):
    """Convert *value* to a ``datetime.date``, falling back to *default*.

    Args:
        value: The raw field, normally a string such as ``'10/5/2016'``.
        default: Returned when *value* cannot be parsed.
        date_format: ``strptime`` format to parse with. Defaults to the
            month/day/year layout this function has always used.

    Returns:
        The parsed date, or *default* if parsing is not possible.
    """
    try:
        return datetime.strptime(value, date_format).date()
    except (ValueError, TypeError):
        # ValueError: text that does not match date_format.
        # TypeError: a non-string value such as None.
        return default

In [21]:
parse_int('hello', default='N/A')

'N/A'

In [21]:
parse_date('3/28/2018')

datetime.date(2018, 3, 28)

In [22]:
parse_date('2324234', default='N/A')

'N/A'

In [23]:
def parse_string(value, *, default=None):
    """Return *value* stripped of surrounding whitespace, or *default*.

    Args:
        value: The raw field, normally a string read from the CSV.
        default: Returned when *value* is blank after stripping, or when it
            has no ``strip`` method (for example ``None`` or an ``int``).

    Returns:
        The stripped text, or *default*.
    """
    try:
        cleaned = value.strip()
    except AttributeError:
        # Catch only the "not string-like" case. A bare ``except:`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return default
    return cleaned if cleaned else default

In [24]:
parse_string('    himanshu   ')

'himanshu'

In [26]:
parse_string('    ', default='N/A')

'N/A'

In [27]:
from functools import partial

# One parser per CSV column, in file order. Parsers left at their default of
# None mark a blank or invalid field with None; the string columns given
# default='' return an empty string for a blank field instead.
column_parsers = (
    parse_int,                          # summons_number
    parse_string,                       # plate_id
    partial(parse_string, default=''),  # registration_state
    partial(parse_string, default=''),  # plate_type
    parse_date,                         # issue_date
    parse_int,                          # violation_code
    partial(parse_string, default=''),  # vehicle_body_type
    parse_string,                       # vehicle_make
    partial(parse_string, default=''),  # violation_description
)

In [28]:
def parse_row(row):
    """Split one raw CSV line and pair each field with its column parser.

    Returns a generator, so each field is converted only as the caller
    consumes the result.
    """
    raw_values = row.strip('\n').split(',')
    return (convert(raw) for convert, raw in zip(column_parsers, raw_values))

In [29]:
rows = read_data()
for _ in range(5):
    row = next(rows)
    parsed_data = parse_row(row)
    print(list(parsed_data))

[4006478550, 'VAD7274', 'VA', 'PAS', datetime.date(2016, 10, 5), 5, '4D', 'BMW', 'BUS LANE VIOLATION']
[4006462396, '22834JK', 'NY', 'COM', datetime.date(2016, 9, 30), 5, 'VAN', 'CHEVR', 'BUS LANE VIOLATION']
[4007117810, '21791MG', 'NY', 'COM', datetime.date(2017, 4, 10), 5, 'VAN', 'DODGE', 'BUS LANE VIOLATION']
[4006265037, 'FZX9232', 'NY', 'PAS', datetime.date(2016, 8, 23), 5, 'SUBN', 'FORD', 'BUS LANE VIOLATION']
[4006535600, 'N203399C', 'NY', 'OMT', datetime.date(2016, 10, 19), 5, 'SUBN', 'FORD', 'BUS LANE VIOLATION']


In [40]:
def parse_row(row, *, default=None):
    """Parse one raw CSV line into a ``Ticket``.

    Args:
        row: A single line of the file, with or without its trailing newline.
        default: Returned when the row cannot be turned into a full ``Ticket``.

    Returns:
        A ``Ticket`` whose fields were converted by ``column_parsers``, or
        *default* when the row has too few fields or any parser returned None.
    """
    fields = row.strip('\n').split(',')
    # A short row would leave Ticket(*...) without enough arguments and raise
    # TypeError, so treat it like any other unparseable row. Surplus fields
    # are still ignored by zip(), exactly as before.
    if len(fields) < len(column_parsers):
        return default
    parsed_data = [func(field) for func, field in zip(column_parsers, fields)]
    if any(item is None for item in parsed_data):
        return default
    return Ticket(*parsed_data)

In [41]:
rows = read_data()
for _ in range(5):
    row = next(rows)
    parsed_data = parse_row(row)
    print(parsed_data)

Ticket(summons_number=4006478550, plate_id='VAD7274', registration_state='VA', plate_type='PAS', issue_date=datetime.date(2016, 10, 5), violation_code=5, vehicle_body_type='4D', vehicle_make='BMW', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4006462396, plate_id='22834JK', registration_state='NY', plate_type='COM', issue_date=datetime.date(2016, 9, 30), violation_code=5, vehicle_body_type='VAN', vehicle_make='CHEVR', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4007117810, plate_id='21791MG', registration_state='NY', plate_type='COM', issue_date=datetime.date(2017, 4, 10), violation_code=5, vehicle_body_type='VAN', vehicle_make='DODGE', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4006265037, plate_id='FZX9232', registration_state='NY', plate_type='PAS', issue_date=datetime.date(2016, 8, 23), violation_code=5, vehicle_body_type='SUBN', vehicle_make='FORD', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4

In [42]:
# Show every row that parse_row rejected (returned None for), with each raw
# field paired with its column name so the offending value is easy to spot.
for row in read_data():
    parsed_row = parse_row(row)
    if parsed_row is None:
        # end='\n\n' leaves a blank line between the rejected rows.
        print(list(zip(column_names, row.strip('\n').split(','))), end='\n\n')

[('summons_number', '1413358512'), ('plate_id', '54295PC'), ('registration_state', 'NY'), ('plate_type', 'APP'), ('issue_date', '8/9/2016'), ('violation_code', '19'), ('vehicle_body_type', 'BUS'), ('vehicle_make', ''), ('violation_description', '')]

[('summons_number', '1418425369'), ('plate_id', 'JYW5248'), ('registration_state', 'PA'), ('plate_type', 'PAS'), ('issue_date', '3/21/2017'), ('violation_code', '21'), ('vehicle_body_type', 'SDN'), ('vehicle_make', ''), ('violation_description', '')]

[('summons_number', '1406925068'), ('plate_id', '19358JU'), ('registration_state', '99'), ('plate_type', 'COM'), ('issue_date', '8/23/2016'), ('violation_code', '46'), ('vehicle_body_type', 'DELV'), ('vehicle_make', ''), ('violation_description', '')]

[('summons_number', '8546468965'), ('plate_id', '37489BB'), ('registration_state', 'NY'), ('plate_type', 'OMR'), ('issue_date', '6/12/2017'), ('violation_code', '46'), ('vehicle_body_type', 'BUS'), ('vehicle_make', ''), ('violation_description'

In [43]:
def parsed_data():
    """Lazily yield the parsed result of each data row, skipping rejected rows.

    Rows for which ``parse_row`` returns a falsy value (its ``None`` default)
    are dropped.
    """
    for line in read_data():
        ticket = parse_row(line)
        if not ticket:
            continue
        yield ticket

In [44]:
parsed_rows = parsed_data()

In [45]:
for _ in range(5):
    print(next(parsed_rows))

Ticket(summons_number=4006478550, plate_id='VAD7274', registration_state='VA', plate_type='PAS', issue_date=datetime.date(2016, 10, 5), violation_code=5, vehicle_body_type='4D', vehicle_make='BMW', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4006462396, plate_id='22834JK', registration_state='NY', plate_type='COM', issue_date=datetime.date(2016, 9, 30), violation_code=5, vehicle_body_type='VAN', vehicle_make='CHEVR', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4007117810, plate_id='21791MG', registration_state='NY', plate_type='COM', issue_date=datetime.date(2017, 4, 10), violation_code=5, vehicle_body_type='VAN', vehicle_make='DODGE', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4006265037, plate_id='FZX9232', registration_state='NY', plate_type='PAS', issue_date=datetime.date(2016, 8, 23), violation_code=5, vehicle_body_type='SUBN', vehicle_make='FORD', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4

In [46]:
# Tally violations per vehicle make with a plain dict.
makes_counts = {}

for data in parsed_data():
    # .get() supplies 0 the first time a make is seen.
    makes_counts[data.vehicle_make] = makes_counts.get(data.vehicle_make, 0) + 1

print(makes_counts)

{'BMW': 34, 'CHEVR': 76, 'DODGE': 45, 'FORD': 104, 'FRUEH': 44, 'HONDA': 106, 'LINCO': 12, 'TOYOT': 112, 'CADIL': 9, 'CHRYS': 12, 'FIR': 1, 'GMC': 35, 'HYUND': 35, 'JAGUA': 3, 'JEEP': 22, 'LEXUS': 26, 'ME/BE': 38, 'MERCU': 4, 'MITSU': 11, 'NISSA': 70, 'HIN': 6, 'NS/OT': 18, 'WORKH': 2, 'ACURA': 12, 'AUDI': 12, 'INTER': 25, 'ISUZU': 10, 'KENWO': 5, 'KIA': 8, 'OLDSM': 1, 'SUBAR': 18, 'VOLVO': 12, 'SATUR': 2, 'SMART': 3, 'INFIN': 13, 'PETER': 1, 'CITRO': 1, 'ROVER': 5, 'BUICK': 5, 'GEO': 1, 'MAZDA': 5, 'PORSC': 3, 'VOLKS': 8, 'YAMAH': 1, 'BSA': 1, 'MINI': 1, 'PONTI': 1, 'SPRI': 1, 'PLYMO': 1, 'SCION': 2, 'UPS': 1, 'FIAT': 1, 'UD': 1, 'UTILI': 1, 'GMCQ': 1, 'SAAB': 2, 'HINO': 2, 'STAR': 1, 'AM/T': 1, 'MI/F': 1}


In [47]:
# List the makes from most to fewest violations. sorted() is stable, so makes
# with equal counts keep the order in which they were first counted.
for make, cnt in sorted(makes_counts.items(), key=lambda t: t[1], reverse=True):
    print(make, cnt)

TOYOT 112
HONDA 106
FORD 104
CHEVR 76
NISSA 70
DODGE 45
FRUEH 44
ME/BE 38
GMC 35
HYUND 35
BMW 34
LEXUS 26
INTER 25
JEEP 22
NS/OT 18
SUBAR 18
INFIN 13
LINCO 12
CHRYS 12
ACURA 12
AUDI 12
VOLVO 12
MITSU 11
ISUZU 10
CADIL 9
KIA 8
VOLKS 8
HIN 6
KENWO 5
ROVER 5
BUICK 5
MAZDA 5
MERCU 4
JAGUA 3
SMART 3
PORSC 3
WORKH 2
SATUR 2
SCION 2
SAAB 2
HINO 2
FIR 1
OLDSM 1
PETER 1
CITRO 1
GEO 1
YAMAH 1
BSA 1
MINI 1
PONTI 1
SPRI 1
PLYMO 1
UPS 1
FIAT 1
UD 1
UTILI 1
GMCQ 1
STAR 1
AM/T 1
MI/F 1


In [48]:
from collections import defaultdict

# Same tally as before, but defaultdict(int) starts every unseen make at 0,
# so no membership test is needed.
makes_counts = defaultdict(int)

for data in parsed_data():
    makes_counts[data.vehicle_make] += 1

# Print the makes ordered by descending violation count.
for make, cnt in sorted(makes_counts.items(), key=lambda t: t[1], reverse=True):
    print(make, cnt)

TOYOT 112
HONDA 106
FORD 104
CHEVR 76
NISSA 70
DODGE 45
FRUEH 44
ME/BE 38
GMC 35
HYUND 35
BMW 34
LEXUS 26
INTER 25
JEEP 22
NS/OT 18
SUBAR 18
INFIN 13
LINCO 12
CHRYS 12
ACURA 12
AUDI 12
VOLVO 12
MITSU 11
ISUZU 10
CADIL 9
KIA 8
VOLKS 8
HIN 6
KENWO 5
ROVER 5
BUICK 5
MAZDA 5
MERCU 4
JAGUA 3
SMART 3
PORSC 3
WORKH 2
SATUR 2
SCION 2
SAAB 2
HINO 2
FIR 1
OLDSM 1
PETER 1
CITRO 1
GEO 1
YAMAH 1
BSA 1
MINI 1
PONTI 1
SPRI 1
PLYMO 1
UPS 1
FIAT 1
UD 1
UTILI 1
GMCQ 1
STAR 1
AM/T 1
MI/F 1


In [49]:
def violation_count_by_make():
    """Count violations per vehicle make.

    Returns:
        A list of ``(make, count)`` tuples ordered from the highest count to
        the lowest.
    """
    tally = defaultdict(int)
    for ticket in parsed_data():
        tally[ticket.vehicle_make] += 1

    ranked = list(tally.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [50]:
violation_count_by_make()

[('TOYOT', 112),
 ('HONDA', 106),
 ('FORD', 104),
 ('CHEVR', 76),
 ('NISSA', 70),
 ('DODGE', 45),
 ('FRUEH', 44),
 ('ME/BE', 38),
 ('GMC', 35),
 ('HYUND', 35),
 ('BMW', 34),
 ('LEXUS', 26),
 ('INTER', 25),
 ('JEEP', 22),
 ('NS/OT', 18),
 ('SUBAR', 18),
 ('INFIN', 13),
 ('LINCO', 12),
 ('CHRYS', 12),
 ('ACURA', 12),
 ('AUDI', 12),
 ('VOLVO', 12),
 ('MITSU', 11),
 ('ISUZU', 10),
 ('CADIL', 9),
 ('KIA', 8),
 ('VOLKS', 8),
 ('HIN', 6),
 ('KENWO', 5),
 ('ROVER', 5),
 ('BUICK', 5),
 ('MAZDA', 5),
 ('MERCU', 4),
 ('JAGUA', 3),
 ('SMART', 3),
 ('PORSC', 3),
 ('WORKH', 2),
 ('SATUR', 2),
 ('SCION', 2),
 ('SAAB', 2),
 ('HINO', 2),
 ('FIR', 1),
 ('OLDSM', 1),
 ('PETER', 1),
 ('CITRO', 1),
 ('GEO', 1),
 ('YAMAH', 1),
 ('BSA', 1),
 ('MINI', 1),
 ('PONTI', 1),
 ('SPRI', 1),
 ('PLYMO', 1),
 ('UPS', 1),
 ('FIAT', 1),
 ('UD', 1),
 ('UTILI', 1),
 ('GMCQ', 1),
 ('STAR', 1),
 ('AM/T', 1),
 ('MI/F', 1)]