## Using Csv Module

In [7]:
import csv
import os

DATADIR = ""
DATAFILE = "745090.csv"

def parse_file(datafile):
    """Read a station CSV export and return ``(station_name, rows)``.

    The station name is the second comma-separated field of the first line.
    The second line is read and discarded (presumably the column-header
    row -- confirm against the data file). Every remaining line is parsed
    with ``csv.reader`` and returned as a list of lists of strings.
    """
    with open(datafile, 'r') as source:
        station_fields = source.readline().strip().split(',')
        name = station_fields[1]
        source.readline()  # skip the line between the station line and the data
        data = [record for record in csv.reader(source)]
    # Do not change the line below
    return (name.replace('"',""), data)


def test():
    """Parse the configured weather file and check a few known values."""
    station, rows = parse_file(os.path.join(DATADIR, DATAFILE))

    assert station == "MOUNTAIN VIEW MOFFETT FLD NAS"
    assert rows[0][1] == "01:00"
    assert rows[2][0] == "01/01/2005"
    assert rows[2][5] == "2"


if __name__ == "__main__":
    test()  

## Using Xlrd Module

In [2]:
#!/usr/bin/env python
"""
Your task is as follows:
- read the provided Excel file
- find and return the min, max and average values for the COAST region
- find and return the time value for the min and max entries
- the time values should be returned as Python tuples

Please see the test function for the expected return format
"""

import xlrd
from zipfile import ZipFile
datafile = "2013_ERCOT_Hourly_Load_Data.xls"


def open_zip(datafile):
    """Extract every member of ``<datafile>.zip`` into the current directory."""
    archive_path = '{0}.zip'.format(datafile)
    with ZipFile(archive_path, 'r') as archive:
        archive.extractall()


def parse_file(datafile):
    """Summarise column 1 (the COAST region) of the workbook's first sheet.

    Row 0 is treated as the header row and column 0 as the Excel timestamp
    of each reading.

    Returns a dict with the keys:
        'maxtime'  -- date tuple of the row holding the largest value
        'maxvalue' -- largest value in the column
        'mintime'  -- date tuple of the row holding the smallest value
        'minvalue' -- smallest value in the column
        'avgcoast' -- arithmetic mean of the column
    """
    sheet = xlrd.open_workbook(datafile).sheet_by_index(0)

    # Other xlrd calls that are handy while exploring a sheet:
    #   sheet.nrows / sheet.ncols            -- sheet dimensions
    #   sheet.cell_type(r, c)                -- type code of one cell
    #   sheet.cell_value(r, c)               -- value of one cell
    #   sheet.col_values(c, start_rowx=1, end_rowx=4) -- slice of one column
    #   xlrd.xldate_as_tuple(excel_float, 0) -- Excel date -> Python tuple

    coast_loads = sheet.col_values(1, start_rowx=1, end_rowx=None)

    peak = max(coast_loads)
    trough = min(coast_loads)

    # The column slice starts at sheet row 1, so add 1 to get the sheet row.
    peak_row = coast_loads.index(peak) + 1
    trough_row = coast_loads.index(trough) + 1

    def timestamp_of(sheet_row):
        """Convert the Excel date in column 0 of the given row to a tuple."""
        return xlrd.xldate_as_tuple(sheet.cell_value(sheet_row, 0), 0)

    trough_time = timestamp_of(trough_row)
    peak_time = timestamp_of(peak_row)

    return {
        'maxtime': peak_time,
        'maxvalue': peak,
        'mintime': trough_time,
        'minvalue': trough,
        'avgcoast': sum(coast_loads) / float(len(coast_loads)),
    }


def test():
    """Check the COAST peak time and peak value against known answers."""
    # open_zip(datafile) can be run first if the workbook is still zipped.
    summary = parse_file(datafile)

    expected_peak_time = (2013, 8, 13, 17, 0, 0)
    assert summary['maxtime'] == expected_peak_time
    assert round(summary['maxvalue'], 10) == round(18779.02551, 10)


test()

parse_file(datafile)

{'avgcoast': 10976.933460679751,
 'maxtime': (2013, 8, 13, 17, 0, 0),
 'maxvalue': 18779.025510000003,
 'mintime': (2013, 2, 3, 4, 0, 0),
 'minvalue': 6602.113898999982}

In [8]:
# -*- coding: utf-8 -*-
'''
Find the time and value of max load for each of the regions
COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, WEST
and write the result out in a csv file, using pipe character | as the delimiter.

An example output can be seen in the "example.csv" file.
'''

import xlrd
import os
import csv
from zipfile import ZipFile

datafile = "2013_ERCOT_Hourly_Load_Data.xls"
outfile = "2013_Max_Loads.csv"


def open_zip(datafile):
    """Unpack the archive named ``<datafile>.zip`` into the working directory."""
    zip_name = '{0}.zip'.format(datafile)
    with ZipFile(zip_name, 'r') as bundle:
        bundle.extractall()


def parse_file(datafile):
    """Find the peak load and its timestamp for each region column.

    Column 0 holds the Excel timestamps and row 0 the column names. Columns
    1 .. ncols-2 are processed; the last column is skipped (presumably a
    system-wide total -- confirm against the workbook).

    Returns a list of dicts with the keys 'station_name', 'max_load' and
    'time', where 'time' is the tuple from xlrd.xldate_as_tuple(value, 0):
    (year, month, day, hour, minute, second).
    """
    sheet = xlrd.open_workbook(datafile).sheet_by_index(0)

    def peak_record(col):
        """Build the result entry for one region column."""
        record = {"station_name": sheet.cell_value(0, col)}

        loads = sheet.col_values(col, start_rowx=1, end_rowx=None)
        peak = max(loads)
        # loads starts at sheet row 1, hence the +1 to get the sheet row.
        peak_row = loads.index(peak) + 1
        peak_time = xlrd.xldate_as_tuple(sheet.cell_value(peak_row, 0), 0)

        record["max_load"] = peak
        record["time"] = peak_time
        return record

    return [peak_record(col) for col in range(1, sheet.ncols - 1)]
parse_file(datafile)

[{'max_load': 18779.025510000003,
  'station_name': 'COAST',
  'time': (2013, 8, 13, 17, 0, 0)},
 {'max_load': 2380.1654089999956,
  'station_name': 'EAST',
  'time': (2013, 8, 5, 17, 0, 0)},
 {'max_load': 2281.2722140000024,
  'station_name': 'FAR_WEST',
  'time': (2013, 6, 26, 17, 0, 0)},
 {'max_load': 1544.7707140000005,
  'station_name': 'NORTH',
  'time': (2013, 8, 7, 17, 0, 0)},
 {'max_load': 24415.570226999993,
  'station_name': 'NORTH_C',
  'time': (2013, 8, 7, 18, 0, 0)},
 {'max_load': 5494.157645,
  'station_name': 'SOUTHERN',
  'time': (2013, 8, 8, 16, 0, 0)},
 {'max_load': 11433.30491600001,
  'station_name': 'SOUTH_C',
  'time': (2013, 8, 8, 18, 0, 0)},
 {'max_load': 1862.6137649999998,
  'station_name': 'WEST',
  'time': (2013, 8, 7, 17, 0, 0)}]

In [9]:
data = parse_file(datafile)
def save_file(data, filename):
    """Write the per-station peak loads to a pipe-delimited CSV file.

    data     -- iterable of dicts with the keys 'station_name', 'time'
                (a date tuple whose last two items are dropped) and 'max_load'.
    filename -- path of the file to create or overwrite.

    Each written row is also printed so the notebook shows what was saved.
    """
    # newline='' is what the csv module expects for files it writes; without
    # it, platforms that translate '\n' end up with blank lines between rows.
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f, delimiter='|')
        writer.writerow(['Station', 'Year', 'Month', 'Day', 'Hour', 'Max Load'])

        for row in data:
            # Keep year, month, day, hour; drop the trailing minute and second.
            date_parts = list(row['time'])[:-2]
            record = [row['station_name']] + date_parts + [row['max_load']]
            print(record)
            writer.writerow(record)
                        
save_file(data, "example.csv")  

['COAST', 2013, 8, 13, 17, 18779.025510000003]
['EAST', 2013, 8, 5, 17, 2380.1654089999956]
['FAR_WEST', 2013, 6, 26, 17, 2281.2722140000024]
['NORTH', 2013, 8, 7, 17, 1544.7707140000005]
['NORTH_C', 2013, 8, 7, 18, 24415.570226999993]
['SOUTHERN', 2013, 8, 8, 16, 5494.157645]
['SOUTH_C', 2013, 8, 8, 18, 11433.30491600001]
['WEST', 2013, 8, 7, 17, 1862.6137649999998]


## Using Element Tree Module

In [10]:
#!/usr/bin/env python
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys
import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"


def get_root(fname):
    """Parse the XML file at ``fname`` and return its root element."""
    return ET.parse(fname).getroot()


def get_authors(root):
    """Collect author details from the ``fm/bibl/aug/au`` elements under root.

    Returns a list with one dict per author:
        'fnm', 'snm', 'email' -- text of the child tag of the same name, or
                                 None when the author has no such child
        'insr'                -- list of the 'iid' attribute of every <insr>
                                 child, in document order

    Raises KeyError if an <insr> element has no 'iid' attribute.
    """
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": None,
                "snm": None,
                "email": None,
                "insr": []
        }

        # find() returns None for a missing child; keep the None default
        # rather than failing on `.text`.
        for tag in ("fnm", "snm", "email"):
            node = author.find('./' + tag)
            if node is not None:
                data[tag] = node.text

        data["insr"] = [ref.attrib['iid'] for ref in author.findall('./insr')]

        authors.append(data)

    return authors


root = get_root(article_file)
get_authors(root)

[{'email': 'omer@extremegate.com',
  'fnm': 'Omer',
  'insr': ['I1'],
  'snm': 'Mei-Dan'},
 {'email': 'mcarmont@hotmail.com',
  'fnm': 'Mike',
  'insr': ['I2'],
  'snm': 'Carmont'},
 {'email': 'laver17@gmail.com',
  'fnm': 'Lior',
  'insr': ['I3', 'I4'],
  'snm': 'Laver'},
 {'email': 'nyska@internet-zahav.net',
  'fnm': 'Meir',
  'insr': ['I3'],
  'snm': 'Nyska'},
 {'email': 'kammarh@gmail.com',
  'fnm': 'Hagay',
  'insr': ['I8'],
  'snm': 'Kammar'},
 {'email': 'gideon.mann.md@gmail.com',
  'fnm': 'Gideon',
  'insr': ['I3', 'I5'],
  'snm': 'Mann'},
 {'email': 'barns.nz@gmail.com',
  'fnm': 'Barnaby',
  'insr': ['I6'],
  'snm': 'Clarck'},
 {'email': 'eukots@gmail.com', 'fnm': 'Eugene', 'insr': ['I7'], 'snm': 'Kots'}]

## Using Beautiful Soup Module

In [13]:
from bs4 import BeautifulSoup 

def options(soup,id):
    """Return the 'value' of every <option> inside the element with this id."""
    select_node = soup.find(id=id)
    return [entry['value'] for entry in select_node.find_all('option')]

def print_list(label,codes):
    """Print the label, then all codes on one line, each followed by a space."""
    print("\n%s:" %label)
    joined = ''.join(code + " " for code in codes)
    print (joined,"\n")
    
# Parse the saved page inside a context manager so the file handle is closed
# as soon as BeautifulSoup has consumed it.
with open("Data_Elements.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

codes = options(soup,'CarrierList')
print_list("Carriers",codes)

codes = options(soup,'AirportList')
#print_list("Airports",codes)



Carriers:
All AllUS AllForeign AS G4 AA 5Y DL MQ EV F9 HA B6 OO WN NK UA VX  



In [15]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Please note that the function 'make_request' is provided for your reference only.
# You will not be able to actually use it from within the Udacity web UI.
# Your task is to process the HTML using BeautifulSoup, extract the hidden
# form field values for "__EVENTVALIDATION" and "__VIEWSTATE" and set the appropriate
# values in the data dictionary.
# All your changes should be in the 'extract_data' function
from bs4 import BeautifulSoup
import requests
import json

html_page = "Data_Elements.html"


def extract_data(page):
    """Read the hidden ASP.NET form fields from a saved HTML page.

    Returns a dict with the keys 'eventvalidation' and 'viewstate', holding
    the 'value' attribute of the elements whose ids are "__EVENTVALIDATION"
    and "__VIEWSTATE". Both values are also printed.
    """
    with open(page, "r") as html:
        soup = BeautifulSoup(html,"lxml")
        data = {
            "eventvalidation": soup.find(id="__EVENTVALIDATION")['value'],
            "viewstate": soup.find(id="__VIEWSTATE")['value'],
        }
        for key in ("eventvalidation", "viewstate"):
            print (data[key])

    return data


def make_request(data):
    """POST the carrier/airport form (VX at BOS) and return the response text.

    data -- mapping with the keys 'eventvalidation' and 'viewstate', which
            are sent as the hidden form fields of the same purpose.
    """
    form_fields = {
        'AirportList': "BOS",
        'CarrierList': "VX",
        'Submit': 'Submit',
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__EVENTVALIDATION": data["eventvalidation"],
        "__VIEWSTATE": data["viewstate"],
    }
    response = requests.post(
        "http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
        data=form_fields,
    )
    return response.text

#extract_data(html_page)

In [42]:
import requests
from bs4 import BeautifulSoup

# One session object is used for both the GET and the POST.
s = requests.Session()
r = s.get("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2")
soup = BeautifulSoup(r.text,"lxml")

# Hidden form fields from the fetched page, echoed back in the POST below.
viewstate_element = soup.find(id="__VIEWSTATE")
viewsate = viewstate_element['value']
eventvalidation_element = soup.find(id="__EVENTVALIDATION")
eventvalidation = eventvalidation_element['value']

r = s.post(
    "https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
    data={
        'AirportList': "BOS",
        'CarrierList': "VX",
        '__EVENTTARGET': "",
        '__EVENTARGUMENT': "",
        '__EVENTVALIDATION': eventvalidation,
        '__VIEWSTATE': viewsate,
    },
)

# Write inside a context manager so the file is flushed and closed; the
# original left the handle open, which can leave the saved page incomplete.
with open('virgin_and_logan_airport.html', "w") as f:
    f.write(r.text)

428933

#### Carrier List

In [2]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task in this exercise is to modify 'extract_carriers()' to get a list of
all airlines. Exclude all of the combination values like "All U.S. Carriers"
from the data that you return. You should return a list of codes for the
carriers.

All your changes should be in the 'extract_carriers()' function. The
'options.html' file in the tab above is a stripped down version of what is
actually on the website, but should provide an example of what you should get
from the full file.

Please note that the function 'make_request()' is provided for your reference
only. You will not be able to actually use it from within the Udacity web UI.
"""

from bs4 import BeautifulSoup
html_page = "options.html"


def extract_carriers(page):
    """Return the carrier codes listed in the page's "CarrierList" element.

    The combination entries 'All', 'AllUS' and 'AllForeign' are left out.
    The number of codes found is printed before returning.
    """
    combined_entries = ['All','AllUS','AllForeign']

    with open(page, "r") as html:
        carrier_select = BeautifulSoup(html, "lxml").find(id="CarrierList")
        data = [
            option['value']
            for option in carrier_select.find_all('option')
            if option['value'] not in combined_entries
        ]
        print (len(data))
    return data


def make_request(data):
    """POST the carrier/airport form and return the response text.

    data -- dict with the keys 'eventvalidation', 'viewstate', 'airport'
            and 'carrier'; an optional 'viewstategenerator' key supplies the
            "__VIEWSTATEGENERATOR" field (an empty string is sent if absent).

    NOTE: this relies on a module-level requests session named `s`, which is
    created in another cell of the notebook; that cell must be run first.
    """
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    airport = data["airport"]
    carrier = data["carrier"]
    # The original referenced an undefined global `viewstategenerator`
    # (NameError at call time); read it from `data` instead.
    viewstategenerator = data.get("viewstategenerator", "")

    r = s.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
               data = (("__EVENTTARGET", ""),
                       ("__EVENTARGUMENT", ""),
                       ("__VIEWSTATE", viewstate),
                       ("__VIEWSTATEGENERATOR",viewstategenerator),
                       ("__EVENTVALIDATION", eventvalidation),
                       ("CarrierList", carrier),
                       ("AirportList", airport),
                       ("Submit", "Submit")))

    return r.text



extract_carriers(html_page)

16


['FL',
 'AS',
 'AA',
 'MQ',
 '5Y',
 'DL',
 'EV',
 'F9',
 'HA',
 'B6',
 'OO',
 'WN',
 'NK',
 'US',
 'UA',
 'VX']

### Airport List

In [4]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Complete the 'extract_airports()' function so that it returns a list of airport
codes, excluding any combinations like "All".

Refer to the 'options.html' file in the tab above for a stripped down version
of what is actually on the website. The test() assertions are based on the
given file.
"""

from bs4 import BeautifulSoup
html_page = "options.html"


def extract_airports(page):
    """Return the airport codes listed in the page's "AirportList" element.

    The combination entries 'All', 'AllMajors' and 'AllOthers' are left out.
    The number of codes found is printed before returning.
    """
    combined_entries = ['All','AllMajors','AllOthers']

    with open(page, "r") as html:
        airport_select = BeautifulSoup(html, "lxml").find(id="AirportList")
        data = [
            option['value']
            for option in airport_select.find_all('option')
            if option['value'] not in combined_entries
        ]
        print (len(data))
    return data



extract_airports(html_page)  

15


['ATL',
 'BWI',
 'BOS',
 'CLT',
 'MDW',
 'ORD',
 'DFW',
 'DEN',
 'DTW',
 'FLL',
 'IAH',
 'LAS',
 'LAX',
 'ABR',
 'ABI']

### Processing All Files

In [49]:
def process_file(f):
    """
    Extract the monthly flight counts from one saved carrier/airport page.

    The carrier and airport codes are taken from the first six characters of
    the file's base name, which must look like "FL-ATL". This is an example
    of the data structure that is returned:

    data = [{"courier": "FL",
             "airport": "ATL",
             "year": 2012,
             "month": 12,
             "flights": {"domestic": 100,
                         "international": 100}
            },
            {"courier": "..."}
    ]

    Note - year, month, and the flight data are integers.
    Rows that contain the TOTAL data for a year are skipped.
    """
    import os  # local import: this cell does not import os itself

    # Use the base name so a directory prefix (e.g. "data/FL-ATL.html") does
    # not end up inside the six-character "XX-YYY" slice.
    courier, airport = os.path.basename(f)[:6].split("-")

    data = []
    with open(f, "r") as html:
        soup = BeautifulSoup(html,"lxml")
        table_data = soup.find(id = "DataGrid1")
        for tr in table_data.find_all('tr',class_ = "dataTDRight"):
            cells = [td.text for td in tr.find_all('td')]
            if cells[1] == "TOTAL":
                continue
            # A fresh dict per row, so entries never share state.
            data.append({
                "courier": courier,
                "airport": airport,
                "year": int(cells[0]),
                "month": int(cells[1]),
                # Counts contain thousands separators, e.g. "815,489".
                "flights": {
                    "domestic": int(cells[2].replace(',','')),
                    "international": int(cells[3].replace(',','')),
                },
            })

    return data

process_file("FL-ATL.html")

[{'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 815489, 'international': 92565},
  'month': 10,
  'year': 2002},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 766775, 'international': 91342},
  'month': 11,
  'year': 2002},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 782175, 'international': 96881},
  'month': 12,
  'year': 2002},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 785651, 'international': 98053},
  'month': 1,
  'year': 2003},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 690750, 'international': 85965},
  'month': 2,
  'year': 2003},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 797634, 'international': 97929},
  'month': 3,
  'year': 2003},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 766639, 'international': 89398},
  'month': 4,
  'year': 2003},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 789857, 'international': 8767

### Splitting a File With Multiple XML Docs Into Multiple Files With a Single XML Doc Each

In [21]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# So, the problem is that the gigantic file is actually not a valid XML, because
# it has several root elements, and XML declarations.
# It is, as a matter of fact, a collection of a lot of concatenated XML documents.
# So, one solution would be to split the file into separate documents,
# so that you can process the resulting files as valid XML documents.

PATENTS = 'patent.data'

def split_file(filename):
    """
    Split the input file into separate files, each containing a single patent.
    Each patent document starts with a line beginning with '<?xml'.

    The new files are saved with filename in the following format:
    "{}-{}".format(filename, n) where n is a counter, starting from 0.

    Lines that appear before the first '<?xml' line are not written anywhere.
    """
    count = 0
    current = None  # output file of the document being copied, if any
    try:
        with open(filename,'r') as f:
            for line in f:
                if line.startswith('<?xml'):
                    # A new declaration ends the previous document.
                    if current is not None:
                        current.close()
                    current = open("{}-{}".format(filename, count), 'w')
                    count += 1
                # The original wrote only the declaration line; every line
                # of the document has to go into the current output file.
                if current is not None:
                    current.write(line)
    finally:
        if current is not None:
            current.close()
                
split_file(PATENTS)

## Data Validity Problem Solve

In [3]:
"""
Your task is to check the "productionStartYear" of the DBPedia autos datafile for valid values.
The following things should be done:
- check if the field "productionStartYear" contains a year
- check if the year is in range 1886-2014
- convert the value of the field to be just a year (not full datetime)
- the rest of the fields and values should stay the same
- if the value of the field is a valid year in the range as described above,
  write that line to the output_good file
- if the value of the field is not a valid year as described above, 
  write that line to the output_bad file
- discard rows (neither write to good nor bad) if the URI is not from dbpedia.org
- you should use the provided way of reading and writing data (DictReader and DictWriter)
  They will take care of dealing with the header.

You can write helper functions for checking the data and writing the files, but we will call only the 
'process_file' with 3 arguments (inputfile, output_good, output_bad).
"""
import csv
import pprint

INPUT_FILE = 'autos.csv'
OUTPUT_GOOD = 'autos-valid.csv'
OUTPUT_BAD = 'FIXME-autos.csv'

def process_file(input_file, output_good, output_bad):
    """Split the autos CSV into valid and invalid rows by productionStartYear.

    Rows whose 'URI' does not contain "dbpedia.org" are discarded. For the
    rest, the text before the first '-' of 'productionStartYear' is parsed
    as the year:
      * parsable and within 1886-2014 inclusive -> written to output_good
        with the field replaced by the integer year
      * parsable but out of range -> written to output_bad with the field
        replaced by the integer year
      * missing, empty, 'NULL' or not an integer -> written to output_bad
        unchanged

    Both output files are written with writeToFile using the input's header.
    """
    good_data = []
    bad_data = []

    with open(input_file, "r") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames

        for row in reader:
            if 'dbpedia.org' not in row['URI']:
                continue

            raw_year = row['productionStartYear']
            year = None
            if raw_year is not None and raw_year != '' and raw_year != 'NULL':
                try:
                    year = int(raw_year.split('-')[0])
                except ValueError:
                    # Not a year (e.g. a list value or free text).
                    year = None

            if year is None:
                bad_data.append(row)
                continue

            row['productionStartYear'] = year
            # The stated range is 1886-2014, so both end years are valid.
            if 1886 <= year <= 2014:
                good_data.append(row)
            else:
                bad_data.append(row)

    writeToFile(output_good,good_data,header)
    writeToFile(output_bad,bad_data,header)

    
def writeToFile(filename,YOURDATA,header):
    """Write a header row followed by the row dicts to a comma-delimited CSV.

    filename -- path of the file to create or overwrite
    YOURDATA -- iterable of dicts keyed by the names in header
    header   -- list of field names, used as column order and header row
    """
    # newline='' is what the csv module expects for files it writes; without
    # it, platforms that translate '\n' end up with blank lines between rows.
    with open(filename, "w", newline='') as g:
        writer = csv.DictWriter(g, delimiter=",", fieldnames= header)
        writer.writeheader()
        writer.writerows(YOURDATA)


process_file(INPUT_FILE, OUTPUT_GOOD, OUTPUT_BAD)

## Problem Set : Data Quality

###  1. Auditing Data Quality

In [13]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with cities infobox data, audit it, come up with a
cleaning idea and then clean it up. In the first exercise we want you to audit
the datatypes that can be found in some particular fields in the dataset.
The possible types of values can be:
- NoneType if the value is a string "NULL" or an empty string ""
- list, if the value starts with "{"
- int, if the value can be cast to int
- float, if the value can be cast to float, but CANNOT be cast to int.
   For example, '3.23e+07' should be considered a float because it can be cast
   as float but int('3.23e+07') will throw a ValueError
- 'str', for all other values

The audit_file function should return a dictionary containing fieldnames and a 
SET of the types that can be found in the field. e.g.
{"field1": set([type(float()), type(int()), type(str())]),
 "field2": set([type(str())]),
  ....
}
The type() function returns a type object describing the argument given to the 
function. You can also use examples of objects to create type objects, e.g.
type(1.1) for a float: see the test function below for examples.

Note that the first three rows (after the header row) in the cities.csv file
are not actual data points. The contents of these rows should not be included
when processing data types. Be sure to include functionality in your code to
skip over or detect these rows.
"""
import codecs
import csv
import json
import pprint

CITIES = 'cities.csv'

FIELDS = ["name", "timeZone_label", "utcOffset", "homepage", "governmentType_label",
          "isPartOf_label", "areaCode", "populationTotal", "elevation",
          "maximumElevation", "minimumElevation", "populationDensity",
          "wgs84_pos#lat", "wgs84_pos#long", "areaLand", "areaMetro", "areaUrban"]

def isNoneType(val):
    """Return True when val is the empty string or the literal 'NULL'."""
    return val == '' or val == 'NULL'
    
def isArray(val):
    """Return True when val starts with '{', the marker of a list value."""
    return val.startswith('{')
    
def isInt(val):
    """Return True when int(val) succeeds, False when it raises ValueError."""
    try:
        int(val)
    except ValueError:
        return False
    return True
    
def isFloat(val):
    """Return True when float(val) succeeds, False when it raises ValueError."""
    try:
        float(val)
    except ValueError:
        return False
    return True

def audit_file(filename, fields):
    """Collect the set of value types seen in each requested field.

    filename -- path of the CSV file to audit
    fields   -- names of the columns to inspect

    The three rows after the header are skipped before auditing. Each value
    is classified by the first check that matches: NoneType (isNoneType),
    list (isArray), int (isInt), float (isFloat), otherwise str.

    Returns a dict mapping each field name to a set of type objects.
    """
    fieldtypes = {field: set() for field in fields}

    # Read the file named by `filename` (the original always opened
    # 'cities.csv') and close it once the audit is done.
    with open(filename, 'r') as f:
        reader = csv.DictReader(f)

        # Skip the three non-data rows; tolerate a file shorter than that.
        for _ in range(3):
            next(reader, None)

        for row in reader:
            for field in fields:
                value = row[field]
                if isNoneType(value):
                    found = type(None)
                elif isArray(value):
                    found = list
                elif isInt(value):
                    found = int
                elif isFloat(value):
                    found = float
                else:
                    found = str
                fieldtypes[field].add(found)

    return fieldtypes

audit_file(CITIES, FIELDS)

{'areaCode': {int, str, NoneType},
 'areaLand': {list, NoneType, float},
 'areaMetro': {NoneType, float},
 'areaUrban': {NoneType, float},
 'elevation': {list, NoneType, float},
 'governmentType_label': {str, NoneType},
 'homepage': {str, NoneType},
 'isPartOf_label': {list, NoneType, str},
 'maximumElevation': {NoneType},
 'minimumElevation': {NoneType},
 'name': {str, NoneType, list},
 'populationDensity': {list, NoneType, float},
 'populationTotal': {int, NoneType},
 'timeZone_label': {str, NoneType},
 'utcOffset': {int, str, NoneType, list},
 'wgs84_pos#lat': {float},
 'wgs84_pos#long': {float}}

#### 2. Fixing The Area

In [34]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with cities infobox data, audit it, come up with a
cleaning idea and then clean it up.

Since in the previous quiz you made a decision on which value to keep for the
"areaLand" field, you now know what has to be done.

Finish the function fix_area(). It will receive a string as an input, and it
has to return a float representing the value of the area or None.
You have to change the function fix_area. You can use extra functions if you
like, but changes to process_file will not be taken into account.
The rest of the code is just an example on how this function can be used.
"""
import codecs
import csv
import json
import pprint

CITIES = 'cities.csv'

def countZeros(val):
    """Count the trailing decimal zeros of a number.

    val is repeatedly floor-divided by 10 while it is divisible by 10, e.g.
    1200 -> 2 and 101787000.0 -> 3. Returns 0 for zero itself; the original
    fell out of the loop and returned None, which made the comparison in
    mostSignificant fail.
    """
    count = 0
    while val != 0:
        if val % 10 != 0:
            break
        count += 1
        val = val // 10
    return count

def mostSignificant(areaList):
    """Return whichever of the first two values has fewer trailing zeros.

    When both have the same count, the second value is returned.
    """
    first, second = areaList[0], areaList[1]
    return first if countZeros(first) < countZeros(second) else second


def fix_area(area):
    """Convert a raw "areaLand" value to a float, or None when it is missing.

    None, '' and 'NULL' give None. A value starting with '{' is treated as a
    '|'-separated list: the braces are removed, each part is converted to
    float and mostSignificant picks the value to keep. Anything else is
    passed to float().
    """
    is_missing = area is None or area == '' or area == 'NULL'
    if is_missing:
        return None

    if not area.startswith('{'):
        return float(area)

    stripped = area.replace('{','').replace('}','')
    candidates = [float(part) for part in stripped.split('|')]
    return mostSignificant(candidates)

def process_file(filename):
    """Load the cities CSV and return its rows with "areaLand" cleaned.

    The three rows after the header are skipped. Every remaining row that
    has an "areaLand" key gets that value replaced by fix_area(value).
    """
    # CHANGES TO THIS FUNCTION WILL BE IGNORED WHEN YOU SUBMIT THE EXERCISE

    def with_fixed_area(record):
        """Apply fix_area to the record's areaLand value, if it has one."""
        if "areaLand" in record:
            record["areaLand"] = fix_area(record["areaLand"])
        return record

    with open(filename, "r") as f:
        reader = csv.DictReader(f)

        # skipping the extra metadata
        for _ in range(3):
            next(reader)

        # processing file
        data = [with_fixed_area(line) for line in reader]

    return data


def test():
    """Print the cleaned "areaLand" value of the first rows (at most nine)."""
    data = process_file(CITIES)

    # Slice instead of indexing so a short file does not raise IndexError,
    # and report the real number of rows shown (the old message said "three"
    # while nine were printed).
    sample = data[:9]
    print("Printing {} example results:".format(len(sample)))
    for row in sample:
        pprint.pprint(row["areaLand"])

if __name__ == "__main__":
    test()

Printing three example results:
None
None
None
None
None
None
101787000.0
31597900.0
55166700.0
