### Correcting Validity

# In [14]:
"""
Your task is to check the "productionStartYear" of the DBPedia autos datafile for valid values.
The following things should be done:
- OK check if the field "productionStartYear" contains a year
- OK check if the year is in range 1886-2014
- OK convert the value of the field to be just a year (not full datetime)
- the rest of the fields and values should stay the same
- if the value of the field is a valid year in the range as described above,
  write that line to the output_good file
- if the value of the field is not a valid year as described above, 
  write that line to the output_bad file
- OK discard rows (neither write to good nor bad) if the URI is not from dbpedia.org
- you should use the provided way of reading and writing data (DictReader and DictWriter)
  They will take care of dealing with the header.

You can write helper functions for checking the data and writing the files, but we will call only
the 'process_file' function, with 3 arguments (inputfile, output_good, output_bad).
"""
import csv
import pprint
import re
from datetime import datetime

# Default file names that test() hands to process_file.
INPUT_FILE = 'autos.csv'  # CSV file that is read
OUTPUT_GOOD = 'autos-valid.csv'  # receives rows whose production start year is valid
OUTPUT_BAD = 'FIXME-autos.csv'  # receives rows whose production start year is not valid

# A year is the leading four digits of the value, followed either by the end
# of the string (bare year) or by "-" (start of a date / datetime).
_YEAR_PREFIX = re.compile(r"([0-9]{4})(?:-|\Z)")


def _extract_year(value):
    """Return the leading four-digit year of *value* as an int, or None."""
    found = _YEAR_PREFIX.match(value or "")
    return int(found.group(1)) if found else None


def _write_rows(filename, fieldnames, rows):
    """Write *rows* (dicts) to *filename* as CSV under the given header."""
    # newline="" lets the csv module control line endings itself.
    with open(filename, "w", newline="") as g:
        writer = csv.DictWriter(g, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def process_file(input_file, output_good, output_bad, min_year=1886, max_year=2014):
    """Split the rows of a DBPedia autos CSV by validity of "productionStartYear".

    Rows whose "URI" does not start with ``http://dbpedia.org/`` are discarded.
    For every remaining row the field "productionStartYear" is inspected:

    * if it starts with a four-digit year (a bare year, or a date/datetime
      with any time zone offset), the field is replaced by that year as an
      int; the row goes to *output_good* when the year lies within
      ``min_year``..``max_year`` (inclusive) and to *output_bad* otherwise;
    * if it holds no year (e.g. ``NULL``, or an empty field),
      the row goes to *output_bad* unchanged.

    Both output files always receive the header of the input file, in the
    input's column order, even when no row lands in them.

    Args:
        input_file: Path of the CSV file to read.
        output_good: Path of the CSV file that receives the valid rows.
        output_bad: Path of the CSV file that receives the invalid rows.
        min_year: Lowest year accepted as valid.
        max_year: Highest year accepted as valid.

    Raises:
        KeyError: If the input lacks a "URI" or "productionStartYear" column.
    """
    good_autos = []
    bad_autos = []

    with open(input_file, "r", newline="") as f:
        reader = csv.DictReader(f)
        # An empty input file has no header; fall back to no columns.
        header = reader.fieldnames or []

        for row in reader:
            # Short rows are padded with None by DictReader, hence "or ''".
            uri = row['URI'] or ''
            # discard rows if the URI is not from dbpedia.org
            if not uri.startswith('http://dbpedia.org/'):
                continue

            prod_year = _extract_year(row['productionStartYear'])
            if prod_year is None:
                # the field does not contain a year at all
                bad_autos.append(row)
                continue

            # convert the value of the field to be just a year
            row['productionStartYear'] = prod_year

            if min_year <= prod_year <= max_year:
                good_autos.append(row)
            else:
                bad_autos.append(row)

    _write_rows(output_bad, header, bad_autos)
    _write_rows(output_good, header, good_autos)

def write_output_file(filename, data, fieldnames=None):
    """Write a list of row dicts to *filename* as a comma-separated CSV file.

    Args:
        filename: Path of the file to create or overwrite.
        data: Sequence of dicts, one per output row.
        fieldnames: Optional column names in output order. When omitted, the
            keys of the first row are used.

    When *data* is empty and no *fieldnames* are given there is no header to
    derive, so an empty file is written instead of raising ``IndexError``.
    """
    if fieldnames is None:
        fieldnames = list(data[0]) if data else []

    # newline="" lets the csv module control line endings itself.
    with open(filename, "w", newline="") as g:
        if not data and not fieldnames:
            # Nothing to write, not even a header: leave the file empty.
            return
        writer = csv.DictWriter(g, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)


def test():
    """Run process_file on the module's default input and output paths."""
    default_paths = (INPUT_FILE, OUTPUT_GOOD, OUTPUT_BAD)
    process_file(*default_paths)


# Run the check on the default files only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    test()