# Data Quality

## Example Using Our Blueprint

[OpenStreetMap data](https://mapzen.com/data/metro-extracts/)

In [4]:
# Audit street names: count how often each street type (the last word of the name) occurs

import xml.etree.cElementTree as ET
from collections import defaultdict
import re

# Handle on the OSM extract being audited; opened when this cell runs and
# consumed by audit(). NOTE(review): it is never explicitly closed.
osm_file = open("data/charlotte.osm", "r")

# Matches the last run of non-whitespace characters in a street name
# (e.g. "St", "Ave.", "#4"); that trailing token is treated as the street type.
street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
# Running tally of street type -> occurrence count; unseen keys start at 0.
street_types = defaultdict(int)

def audit_street_type(street_types, street_name):
    """Tally the street type found at the end of ``street_name``.

    The module-level ``street_type_re`` pattern is searched against the name.
    When it matches, the matched text becomes the key whose count in
    ``street_types`` is incremented in place; names without a match are
    ignored.

    ``street_types`` must supply a default for unseen keys (for example
    ``defaultdict(int)``), because the count is updated with ``+=``.
    """
    match = street_type_re.search(street_name)
    if match is None:
        return
    street_types[match.group()] += 1

def print_sorted_dict(d):
    """Print each ``key: value`` pair of ``d``, one per line.

    Keys are ordered case-insensitively (by their lower-cased form) and each
    value is formatted as an integer.
    """
    for name in sorted(d, key=lambda key: key.lower()):
        print("%s: %d" % (name, d[name]))

def is_street_name(elem):
    """Return True when ``elem`` is a ``tag`` element keyed ``addr:street``.

    The ``k`` attribute is only inspected for elements whose tag is ``tag``,
    so other element types never need to carry that attribute.
    """
    if elem.tag != "tag":
        return False
    return elem.attrib['k'] == "addr:street"

def audit():
    """Count street types across the OSM file and print the sorted tally.

    Streams the module-level ``osm_file`` with ``ET.iterparse``. For every
    element recognised by ``is_street_name`` its ``v`` attribute (the street
    name) is passed to ``audit_street_type``, which updates the module-level
    ``street_types`` counter. Once parsing finishes the counter is printed
    with ``print_sorted_dict``.
    """
    for _event, elem in ET.iterparse(osm_file):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])
        # iterparse still builds the tree as it goes; drop this element's
        # attributes and children once handled so a large extract does not
        # keep all of its data in memory. Children always finish (and are
        # audited) before their parent, so this cannot change the counts.
        elem.clear()
    print_sorted_dict(street_types)



# Run the audit: parse the OSM file and print the street-type counts shown below.
audit()

#4: 1
Ardsley: 1
Ave: 2
Ave.: 2
Avenue: 30
Blvd: 4
Boulevard: 34
Dr: 2
Drive: 27
Lane: 4
Place: 16
Road: 56
St: 2
Street: 48
Way: 2


## Correcting Validity

In [3]:
"""
Your task is to check the "productionStartYear" of the DBPedia autos datafile for valid values.
The following things should be done:
- check if the field "productionStartYear" contains a year
- check if the year is in range 1886-2014
- convert the value of the field to be just a year (not full datetime)
- the rest of the fields and values should stay the same
- if the value of the field is a valid year in the range as described above,
  write that line to the output_good file
- if the value of the field is not a valid year as described above, 
  write that line to the output_bad file
- discard rows (neither write to good nor bad) if the URI is not from dbpedia.org
- you should use the provided way of reading and writing data (DictReader and DictWriter)
  They will take care of dealing with the header.
"""
import csv
import pprint

# Paths handed to process_file() at the bottom of this cell.
INPUT_FILE = 'data/autos.csv'  # source CSV to validate
OUTPUT_GOOD = 'data/autos-valid.csv'  # receives rows with an in-range production start year
OUTPUT_BAD = 'data/FIXME-autos.csv'  # receives rows that failed the year check

def _write_rows(path, fieldnames, rows):
    """Write ``rows`` (a list of dicts) to ``path`` as CSV with a header row."""
    # newline="" lets the csv module control line endings itself, as the csv
    # documentation requires; otherwise Windows output gets blank lines.
    with open(path, "w", newline="") as out:
        writer = csv.DictWriter(out, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def process_file(input_file, output_good, output_bad):
    """Split a CSV into rows with a valid ``productionStartYear`` and the rest.

    Rows whose ``URI`` does not contain ``dbpedia.org`` are discarded. For the
    remaining rows the first four characters of ``productionStartYear`` are
    parsed as an integer year:

    - if parsing succeeds, the field is replaced by that integer; the row goes
      to ``output_good`` when the year is within 1886-2014 (inclusive) and to
      ``output_bad`` otherwise;
    - if parsing fails (``NULL``, malformed values, ...), the row goes to
      ``output_bad`` with the field left unchanged.

    Both output files get the input's header and are always (re)written, even
    when they end up with no data rows.

    Args:
        input_file: path of the CSV to read; must have ``URI`` and
            ``productionStartYear`` columns.
        output_good: path of the CSV that receives valid rows.
        output_bad: path of the CSV that receives invalid rows.
    """
    data_good = []
    data_bad = []

    with open(input_file, "r", newline="") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames

        for row in reader:
            # Only rows that reference dbpedia.org are in scope.
            if 'dbpedia.org' not in row['URI']:
                continue

            # Keep just the leading "YYYY" of a full datetime value.
            ps_year = row['productionStartYear'][:4]
            try:
                ps_year = int(ps_year)
            except ValueError:
                # Any non-numeric value is an invalid year and must be
                # reported, not silently dropped.
                data_bad.append(row)
                continue

            row['productionStartYear'] = ps_year
            if 1886 <= ps_year <= 2014:
                data_good.append(row)
            else:
                data_bad.append(row)

    _write_rows(output_good, header, data_good)
    _write_rows(output_bad, header, data_bad)
            

# Validate the autos CSV and write the good/bad row files.
process_file(INPUT_FILE, OUTPUT_GOOD, OUTPUT_BAD)