In [1]:
# importing modules needed for this section
import xml.etree.cElementTree as ET 
from collections import defaultdict
import re 
import os
import operator
from prettytable import PrettyTable as PT
from pprint import pprint

In [2]:
bigger_sample_file = "new_delhi_sample.osm" # sample file taken from the orignal

In [3]:
# checking the number tags named "tag" in the the sample file.
count= 0
for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "tag":
        count+=1
print(count)

82169


In [5]:
# Now counting the number of all the tags in the sample file
def count_tags(file):
    tags = {} # creating a dictionary for tag names and their count 
    for event, elem in ET.iterparse(file):
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
    print(tags)
count_tags(bigger_sample_file)

{'tag': 82169, 'node': 340762, 'nd': 421392, 'way': 69426, 'member': 2795, 'relation': 619, 'osm': 1}


In [6]:
# I am going to check attributes for all top level tags
for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "node" or elem.tag == "way":
        for tag in elem.attrib:
                print(tag)
        break

changeset
id
lat
lon
timestamp
uid
user
version


In [4]:
# checking for unique users in the dataset
unique_usr = set()
for event, elem in ET.iterparse(bigger_sample_file):
    if "uid" in elem.attrib:
        unique_usr.add(elem.attrib["uid"])
print(len(unique_usr))

824


In [7]:
# elem.attrib is dict object
for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "tag":
        print(type(elem.attrib))
        break

<class 'dict'>


In [7]:
# Next I want to check what does the tag elements contain
for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "tag":
        print(elem.attrib)
        break

{'k': 'admin_level', 'v': '2'}


In [9]:
# Now I know that element "tag" consists attributes "k" and "v". 
# I am going to explore these attributes further.
# I am interested in addr:street and addr:postcode. 
# Lets count how many times these occur in the dataset.
count = 0
for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "tag":
        attrib = elem.attrib["k"]
        if attrib == "addr:street" or attrib == "addr:postcode":
            count += 1
            print(count ,attrib)
# This sample data contains 39 entries with street addresses and postal codes

1 addr:street
2 addr:street
3 addr:postcode
4 addr:street
5 addr:postcode
6 addr:street
7 addr:street
8 addr:street
9 addr:street
10 addr:street
11 addr:street
12 addr:postcode
13 addr:street
14 addr:street
15 addr:street
16 addr:postcode
17 addr:street
18 addr:postcode
19 addr:street
20 addr:postcode
21 addr:street
22 addr:postcode
23 addr:street
24 addr:postcode
25 addr:street
26 addr:street
27 addr:street
28 addr:street
29 addr:street
30 addr:street
31 addr:street
32 addr:street
33 addr:street
34 addr:street
35 addr:street
36 addr:street
37 addr:street
38 addr:street
39 addr:street
40 addr:postcode
41 addr:street
42 addr:postcode
43 addr:street
44 addr:postcode
45 addr:street
46 addr:postcode
47 addr:street
48 addr:postcode
49 addr:street
50 addr:postcode
51 addr:street
52 addr:postcode
53 addr:street
54 addr:postcode
55 addr:street
56 addr:postcode
57 addr:street
58 addr:postcode
59 addr:street
60 addr:postcode
61 addr:street
62 addr:postcode
63 addr:street
64 addr:postcode
65 addr

In [8]:
for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "node" or elem.tag == "way":
        for tag in elem.iter("tag"):
            print(tag.attrib["k"])
            break

admin_level
source
source
source
source
highway
atm
created_by
name
highway
highway
name
created_by
created_by
created_by
created_by
created_by
created_by
highway
barrier
created_by
created_by
created_by
highway
name
highway
amenity
highway
amenity
name
addr:street
name
highway
name
amenity
name
ref
amenity
name
highway
highway
highway
name
ref
name
name
highway
highway
amenity
highway
name
name
amenity
highway
name
name
highway
highway
highway
amenity
highway
shop
power
power
power
power
power
power
power
power
power
power
power
power
power
power
power
power
power
power
power
power
power
power
power
power
power
power
name
highway
name
name
highway
traffic_calming
highway
barrier
name
power
name
power
power
power
power
power
power
power
power
name
name
name
name
highway
highway
amenity
name
shop
name
name
power
power
name
amenity
traffic_calming
barrier
power
highway
name
power
highway
highway
name
name
name
barrier
amenity
name
shop
highway
amenity
name
name
amenity
highway
highway
na

In [11]:
# Now I am going to check if the tags contain any problem. I am going to use regular expressions to check if the 
# tag contain lower, lower colon or problem characters.
lower = re.compile(r'^([a-z]|_)*$') # tags that contain only lowercase letters and are valid
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$') # for tags with a colon in their names
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]') # for tags with problematic characters

In [12]:
# counting all the values in attribute "k" that match the regex pattern
lower_c = 0 # counter for lower cased values
lower_col_c = 0 # counter of lower cased values with colon
problemchar_c = 0 # counter for problematic values
other = 0 # counter for the rest of the values that fall in the regex patterns   

for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "tag":
        if lower.search(elem.attrib['k']):
            lower_c += 1
        elif lower_colon.search(elem.attrib['k']):
            lower_col_c += 1
        elif problemchars.search(elem.attrib['k']):
            problemchar_c += 1
        else:
            other += 1

print("lower: {} lower_colon: {} problemchar: {} other: {}".format(lower_c, lower_col_c, problemchar_c, other))

lower: 80541 lower_colon: 1590 problemchar: 0 other: 38


In [13]:
# I am now going to put the patterns in a dictionary.
counted_patterns = {"lower" : 0, "lower_colon" : 0, "problemchars" : 0, "others" : 0}
for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "tag":
        if lower.search(elem.attrib["k"]):
            counted_patterns["lower"] += 1
        elif lower_colon.search(elem.attrib["k"]):
            counted_patterns["lower_colon"] += 1
        elif problemchars.search(elem.attrib["k"]):
            counted_patterns["problemchars"] += 1
        else:
            counted_patterns["others"] += 1
print(counted_patterns)

{'lower': 80541, 'lower_colon': 1590, 'problemchars': 0, 'others': 38}


In [8]:
# now I will check for the values corresponding to the 'v' attribute. 
# I am also going to check what type of object it is.
for event, elem in ET.iterparse(sample_file):
    if elem.tag == "tag":
        if elem.attrib["k"] == "addr:street" or elem.attrib["k"] == "addr:postcode":
            print(elem.attrib["v"], type(elem.attrib["v"]))
            break

Old Delhi Gurgaon Road <class 'str'>


In [10]:
# now lets make a dictionary that will contain a list of postcodes and street addresses 
# extracted from those attributes
street_and_post = {"street" : [], "post_code" : []}
for event, elem in ET.iterparse(bigger_sample_file):  
    if elem.tag == "tag":
        if elem.attrib["k"] == "addr:street":
            street_and_post["street"].append(elem.attrib["v"])
        if elem.attrib["k"] == "addr:postcode":
            if len(elem.attrib["v"]) == 6:
                street_and_post["post_code"].append(elem.attrib["v"])
            else:
                print("Bad postcode {}".format(elem.attrib["v"]))
print()
print(street_and_post["street"])
print()
print(street_and_post["post_code"])

# There is a post code in the data that does not match the right post code pattern.

Bad postcode 2013010
Bad postcode 110 021
Bad postcode 1100002

['Block A1', 'Old Delhi Gurgaon Road', 'Aurangzeb Road', 'Block A1', 'Block A1', 'Block A1', 'Block A1', 'Block A1', 'Bhavani Kunj, Vasant Kunj', 'Block A', 'Sector 46', 'Block B', 'South City 2', 'South City II', 'South City II', 'Sector 17C', 'Sector 46', 'Sector 46', 'Sector 46', 'Sector 46', 'Sector 46', 'Sector 46', 'Sector 46', 'Sector 46', 'Sector 46', 'Sector 46', 'Sector 46', 'Sector 46', 'Sector 46', 'Sector 46', 'S1', 'S1', 'S1', 'S1', 'S1', 'S1', 'Palam Vihar', 'Palam Vihar', 'Palam Vihar', 'Palam Vihar', 'Palam Vihar', 'Palam Vihar', 'Palam Vihar', 'Sector 46', 'Sector 46', 'Sector 46', 'janakpuri', 'NIT', 'Connaught Circus', '649-6th Floor, Tower A Spaze iTechPark', 'palika bazar road', 'shastri nagar', 'delhi', 'Chuna Mandi, Paharganj', 'South Extension', 'Chattarpur Main Road', 'Shiv Arcade,aacharya niketan,mayor vihar ph-1', 'Vikas Marg', 'Ansari Nagar', 'DCE College', 'Asaf Ali Road', 'MAMURA Road', 'Hanu

In [15]:
# Pin codes in Delhi start with 11. However, in this dataset a large number of entries 
# contain invalid pin code data.
# My first attempt to fix these anomalies in the data is to compile simple regular expressions
# that checks the first two strings of the pin code. If a corrupt entry is found 
# the first two strings will be resubstituted with 11.
# The other regex validates a pin code only if it is 6 digits long. 
compiler = re.compile(r'^10') 
pin_validator = re.compile(r'^[1-9][0-9]{5}$') 

invalid_pin_codes = [] # making a container for invalid_pin_codes
counter = 0 # a counter variable to count the number of pin codes in this dataset
index = 0 # an index to keep track of bad pin codes
for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "tag":
        if elem.attrib["k"] == "addr:postcode":
            counter += 1
            match = compiler.search(elem.attrib["v"]) # check for pin codes that dont start with 11
            pin_match = pin_validator.search(elem.attrib["v"]) # check for 6 digit pin code
            if match:
                index += 1 # increament index if match is True
                print("-->", elem.attrib["v"], index, "Found", match.group())
            elif not pin_match:
                index += 1 # increament index if pin_match is False
                print("-->", elem.attrib["v"], index, "Found")
            else:
                print(elem.attrib["v"])
print()
print("-" * 50)
print("{} Number of pin codes\n{} Invalid pin codes\n{} Valid pin codes".format(counter, index,(counter - index)))
print('{:.2f}% {}'.format(((index/counter)* 100), "Bad entries"))

122001
110011
110070
122002
122018
122018
122018
122001
--> 100006 1 Found 10
--> 100006 2 Found 10
--> 100006 3 Found 10
--> 100006 4 Found 10
--> 100006 5 Found 10
--> 100006 6 Found 10
122001
122001
122001
122001
122001
122001
122001
110007
201307
110001
122018
110085
110043
110055
110074
110091
110029
110042
110042
110002
--> 2013010 7 Found
201301
110092
122002
110077
201014
110092
201308
110016
250101
110017
--> 110 021 8 Found
110009
110044
110001
122413
110055
110024
110092
110092
110092
110092
110053
201301
110032
201014
110016
110070
110001
110075
121004
201001
122011
110001
110070
201301
--> 1100002 9 Found
110075
110058
110052
110008
110044
201002
110096
110075
110075
110075
110096
201001
110091
110085
110042
110085
110085
201301
110067
110070
122001
110006
110057
122011
110029
122001
201306
201301
110096
201314
110025
110034
122102
110087
110087
110087
110087
110087
110087
110087
110087
110063
124507
124507
110087
110087
110087
110087
110087
110087
110087
110087
110087
110

In [20]:
# I am going to work on the street names now. My immediate approach for getting all the bad street names
# is to first make a list of all the expected street names.
# Then iterate through the tags where the "k" attribute is "addr:street"
# If I find a street name, I split it and take its last word.
# If the last words in the street names dont match the street names in the expected list 
# I will add them to a dictionary set.

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE) # regex that looks for the last word in a line


expected = ['Delhi', 'Street', 'Nagar', 'Sadan', 'Marg', 'Road', 'Avenue', 'Circle', 'Mayur'
            'Place', 'Sector', 'Vihar', 'Enclave', 'Block', 'Chowk', 'Colony', 'Mandir', 'Market', 'Place', 'Lane',
            'Estate', 'Bazaar', 'Kunj', 'Circus', 'Extension', 'College']

counter = 0 # counter to check total number of street names
street_dict = defaultdict(set) # container for the unexpected values
for event, elem in ET.iterparse(bigger_sample_file, events=("start", )):
    if elem.tag == "node" or elem.tag == "way":
        for tag in elem.iter("tag"):
            if tag.attrib["k"] == "addr:street":
                counter += 1 # counter increments if street name is found
                match = street_type_re.search(tag.attrib["v"]) # Last name of street names
                # are referenced to a variable match
                if match: # if a pattern matches successfully
                    street_name = match.group() # the matched pattern is .
                    if street_name not in expected:
                        street_dict[street_name].add(tag.attrib["v"])
pprint(dict(street_dict))
print()
print("{:.2f}% {}".format((len(dict(street_dict))/counter)*100, "Unexpected street names"))

{'17C': {'Sector 17C'},
 '2': {'South City 2'},
 '210': {'Road 210'},
 '3': {'Gharoli Road, Mayur Vihar Phase - 3'},
 '4': {'DLF Phase 4'},
 '44': {'Sector 44'},
 '46': {'Sector 46'},
 '56': {'Sector 56'},
 '6': {'Sector 6'},
 'A': {'Block A'},
 'A-6': {'A-6'},
 'A1': {'Block A1'},
 'B': {'Block B'},
 'Delhi-110089': {'G-5, Sector-16, Rohini, New Delhi-110089'},
 'Flats': {'Street E, Munirka DDA Flats'},
 'Gurgaon': {'Bilaspur Khurd, Gurgaon'},
 'Harola': {'Hanuman Market Harola'},
 'II': {'South City II'},
 'Janakpuri': {'C Block, Janakpuri'},
 'NIT': {'NIT'},
 'No.1': {'Block No.1'},
 'No.10': {'Block No.10'},
 'No.11': {'Block No.11'},
 'No.12': {'Block No.12'},
 'No.13': {'Block No.13'},
 'No.14': {'Block No.14'},
 'No.2': {'Block No.2'},
 'No.3': {'Block No.3', 'Street No.3'},
 'No.5': {'Block No.5'},
 'No.6': {'Block No.6'},
 'No.7': {'Block No.7'},
 'No.8': {'Block No.8'},
 'No.9': {'Block No.9'},
 'Noida,': {'C - 99, Sector - 4, Sector 4 Noida, Block C, Sector 4, Noida,'},
 'Ok

In [18]:
# creating a nested dictionary containing top level tag attributes 
# and their corresponding values.
# before I start populating the dictionary, first thing to do is to 
# create a list of expected tags
CREATED = ["version", "changeset", "timestamp", "user", "uid"] # list of expected tags
node_lst = [] # The dictionary will be appended to this list as it keeps getting updated
for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "node" or elem.tag == "way":
        nodes = {"created": {}, "type" : elem.tag} # dictionary that will store attributes and their values
        for tag in elem.attrib: # Iterate over top level element tags
            if tag in CREATED: # If tags match the CREATED list
                nodes["created"][tag] = elem.attrib[tag] # add the tag to the dictionary
                                                         # with elem.attrib[tag] as it value
        node_lst.append(nodes) # append dict to the list

In [19]:
# checking the node_lst
node_lst[0]

{'created': {'changeset': '46299189',
  'timestamp': '2017-02-22T08:42:55Z',
  'uid': '3029661',
  'user': 'saikabhi',
  'version': '53'},
 'type': 'node'}

In [20]:
# check node_lst again
node_lst[1]

{'created': {'changeset': '505778',
  'timestamp': '2007-09-23T02:41:01Z',
  'uid': '5456',
  'user': 'H_S_Rai',
  'version': '1'},
 'type': 'node'}

In [22]:
# expanding on to the previous code block
# most of the code remains the same except for the addition of "lat" and "lon" attrib
CREATED = ["version", "changeset", "timestamp", "user", "uid"] # list of expected tags
node_lst = [] # append an upadated dictionary to this list
for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "node" or elem.tag == "way":
        nodes = {"created": {}, "type" : elem.tag} # dictionary that will store attributes and their values
        if "lat" in elem.attrib and "lon" in elem.attrib: # if "lat" and "lon" in top level attrib
            # add a new key to the dict and assign floating "lat"/"lon" attributes as its val. 
                nodes["pos"] = [float(elem.attrib["lat"]), float(elem.attrib["lon"])]
        for tag in elem.attrib:
            if tag == "lat" or tag == "lon": # ignore "lat"/"lon" attributes if they are in tag
                continue
            if tag in CREATED: # update dictionary as usual if tag found in CREATED
                nodes["created"][tag] = elem.attrib[tag]
        
        node_lst.append(nodes) # append dict to the list        

In [23]:
# check the updated list
node_lst[:3]

[{'created': {'changeset': '46299189',
   'timestamp': '2017-02-22T08:42:55Z',
   'uid': '3029661',
   'user': 'saikabhi',
   'version': '53'},
  'pos': [28.6138967, 77.2159562],
  'type': 'node'},
 {'created': {'changeset': '505778',
   'timestamp': '2007-09-23T02:41:01Z',
   'uid': '5456',
   'user': 'H_S_Rai',
   'version': '1'},
  'pos': [28.5426036, 77.1680173],
  'type': 'node'},
 {'created': {'changeset': '505778',
   'timestamp': '2007-09-23T02:39:15Z',
   'uid': '5456',
   'user': 'H_S_Rai',
   'version': '1'},
  'pos': [28.5421814, 77.1648674],
  'type': 'node'}]

In [25]:
# completing the code from the previous block. Adding the missing parts to the code to prepare the data model needed 
# before injected it to a database.

problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
CREATED = ["version", "changeset", "timestamp", "user", "uid"] # list of expected tags
nodes_lst = [] # This list will append updated dictionary

for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "node" or elem.tag == "way":
        nodes = {"created": {}, "type" : elem.tag} # dictionary that will store attributes and their values
        if "lat" in elem.attrib and "lon" in elem.attrib:
                nodes["pos"] = [float(elem.attrib["lat"]), float(elem.attrib["lon"])]
        
        for tag in elem.attrib:
            if tag == "lat" or tag == "lon":
                continue
            if tag in CREATED:
                nodes["created"][tag] = elem.attrib[tag]
            else:
                nodes[tag] = elem.attrib[tag]
        
        for data in elem.iter("tag"):
            match = problemchars.search(data.attrib["k"])
            if match:
                continue
            
            elif data.attrib["k"].startswith("addr:"):
                address = data.attrib["k"].split(":", 2)
                nodes["address"] = {}
                if len(address) == 2:
                    nodes["address"][address[1]] = data.attrib["v"]
            else:
                nodes[data.attrib["k"]] = data.attrib["v"]
        
        node_ref = []        
        for nd_tag in elem.iter("nd"):
            node_ref.append(nd_tag.attrib["ref"])
        
        if len(node_ref) > 0:
            nodes["node_ref"] = node_ref
        
        nodes_lst.append(nodes)            

In [26]:
# checking if "address" key is in the dictionary
for keys in nodes_lst:
    if "address" in keys.keys():
        print(keys["address"])

{'housenumber': '38'}
{'postcode': '122001'}
{'housenumber': '12'}
{'housenumber': '40'}
{'housenumber': '6'}
{'housenumber': '19'}
{'housenumber': '30'}
{'housenumber': '48'}
{'housenumber': '63B'}
{'housenumber': '37'}
{'housenumber': '53'}
{'housenumber': '35'}
{'housenumber': '24'}
{'housenumber': '6'}
{'housenumber': '8'}
{'housenumber': '75'}
{'housenumber': '114'}
{'housenumber': '1425'}
{'housenumber': '42'}
{'housenumber': '512'}
{'housenumber': '21'}
{'housenumber': '18'}
{'housenumber': '15'}
{'housenumber': 'todo'}
{'housenumber': '1359'}
{'housenumber': '1342'}
{'housenumber': '1292'}
{'housenumber': '1446'}
{'housenumber': '1678'}
{'housenumber': '1475'}
{'housenumber': '1496'}
{'housenumber': '1847'}
{'housenumber': '1566'}
{'housenumber': '1650'}
{'housenumber': '1900'}
{'housenumber': '2036'}
{'housenumber': '2427'}
{'housenumber': '2571'}
{'housenumber': '771'}
{'housenumber': '652'}
{'housenumber': '600'}
{'housenumber': '630'}
{'housenumber': '600'}
{'housenumber': 

In [28]:
# checking for cuisines
for keys in nodes_lst:
    if "cuisine" in keys.keys():
        print(keys["cuisine"])

South Indian
indian
burger
vegetarian
burger
pizza
indian
indian
burger
regional
indian;chinese;tea
regional
pizza


In [29]:
# checking if amenity key is there in the dictionary
for keys in nodes_lst:
    if "amenity" in keys.keys():
        print(keys["amenity"])

bank
bank
pharmacy
fast_food
cinema
fuel
pharmacy
atm
bank
atm
fuel
atm
fuel
fuel
bank
parking
fuel
school
police
post_box
fast_food
restaurant
parking
hospital
pharmacy
restaurant
bank
hospital
parking
bank
fuel
fuel
atm
atm
school
police
school
parking
fuel
bank
hospital
doctors
atm
hospital
bank
toilets
toilets
hospital
parking
restaurant
place_of_worship
restaurant
embassy
embassy
embassy
embassy
embassy
embassy
police
restaurant
school
cinema
school
atm
parking
post_box
cinema
atm
hospital
pharmacy
bar
restaurant
bank
parking
school
place_of_worship
restaurant
marketplace
cafe
post_office
car_rental
bank
atm
atm
pharmacy
bank
doctors
bank
fast_food
place_of_worship
place_of_worship
cinema
school
atm
fuel
atm
bank
school
cinema
restaurant
marketplace
restaurant
bar
fountain
fountain
atm
place_of_worship
restaurant
restaurant
bank
school
restaurant
school
hospital
school
school
restaurant
marketplace
place_of_worship
atm
fast_food
bank
cafe
police
fuel
bank
university
school
fuel
co

In [30]:
for keys in nodes_lst:
    if "name" in keys.keys():
        print(keys["name"])

New Delhi
ICICI, SBI, Citibank,
Dwarka Sector 13
Dwarka Mor
Standard Chartered Bank
PVR Priya
Indian Oil
Block B Market
IFFCO Chowk
Entry 8
HSBC
Unitech Adventure Island
Entry 14
State Bank of India
HP Petrol Pump
Petrol Pump Indian Oil
Indian Oil
Axis Bank
Sulpantpur Lake Parking
Infocity Phase 2
Mamura Chowk
CR Model Public School
Jharsa
Malibu Town
Qutab Institutional Area
Sushant Lok Police Station
Chander Nagar
Rajendra Place
Tilak Nagar
Janakpuri West
Claridges Hotel
DLF Mega Mall
Om Sweets
Naveidyam
0062
Bharat Sanchar Bhuvan - BSNL HQ
Ishwar Nagar
Artmemis
Nirula's
HDFC Bank
Gobind Hospital
Sector-14 Market
State Bank of India
Sarup Nagar
Indian Oil
Rai
Fadil Pur
Feel Good
Khidki Village
MGF Metropolitan Mall
Citibank
Secondary School
Tree British School
Moolchand Crossing
Patel Chowk
New Delhi
Jawaharlal Nehru Stadium
Shipra Police station
Cedar Crest
Ambedkar Nagar
Father Agnel School
Base Hospital Delhi Cantt
Shahdara
Dilshad Garden
Rohini East
DDA Booster Pump Station
Block

In [32]:
# checking for node_ref
for keys in nodes_lst:
    if "node_ref" in keys.keys():
        print(keys["node_ref"])
        break

['245765418', '2753154964', '2769850480']


In [36]:
# functions that deal with post codes
invalid_pins = []
def pin_validator(pin):
    """
    Return a pin if pin code is 6 digit valid pin. Else append invalid pin codes to a list.
    """
    pin_validator = re.compile(r'^[1-9][0-9]{5}$') 
    match = pin_validator.search(pin)
    if match:
        correct_pin = pin

        return correct_pin
    
    invalid_pins.append(pin)
    #return pin

def simple_pin_code_fix(pin_code):
    """
    Return a corrected pin code if a wrong pin code is found. Else return just the pin code.
    This is not a generalized solution. It fixes only the wrong pin codes of this dataset.
    """
    pin = list(pin_code)
    
    if len(pin) > 6 and pin_code.count("0") > 3:
        pin.remove("0")
        correct_pin = "".join(pin)

        return correct_pin

    elif len(pin) > 6 and pin_code.count(" ") > 0:
        pin.remove(" ")
        correct_pin = "".join(pin)

        return correct_pin

    elif pin_code.startswith("10", 0, 2):
        correct_pin = re.sub(r"^10", "11", pin_code)

        return correct_pin
    
    elif pin_code.endswith("0", -1) and pin_code.startswith("2", 0) and len(pin) > 6:
        del pin[-1]
        correct_pin = "".join(pin)
        
        return correct_pin
    
    return pin_code

def is_pin_code(elem):
    """Return True if the value of two operands are equal. Return False otherwise."""
    return elem.attrib["k"] == "addr:postcode"

In [35]:
# first pass at correcting pin codes
invalid_pin_list = []
for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "tag":
        if elem.attrib["k"] == "addr:postcode":
            pin = pin_validator(simple_postal_code_fix(elem.attrib["v"])) # check for 6 digit pin code
            if pin.startswith("Invalid pin code:"):
                invalid_pin_list.append(pin)
            else:
                print(pin)

print()
print(invalid_pin_list)
# One invalid entry
# I'll make a second pass at fixing this.

122001
110011
110070
122002
122018
122018
122018
122001
110006
110006
110006
110006
110006
110006
122001
122001
122001
122001
122001
122001
122001
110007
201307
110001
122018
110085
110043
110055
110074
110091
110029
110042
110042
110002
201301
110092
122002
110077
201014
110092
201308
110016
250101
110017
110021
110009
110044
110001
122413
110055
110024
110092
110092
110092
110092
110053
201301
110032
201014
110016
110070
110001
110075
121004
201001
122011
110001
110070
201301
110002
110075
110058
110052
110008
110044
201002
110096
110075
110075
110075
110096
201001
110091
110085
110042
110085
110085
201301
110067
110070
122001
110006
110057
122011
110029
122001
201306
201301
110096
201314
110025
110034
122102
110087
110087
110087
110087
110087
110087
110087
110087
110063
124507
124507
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087
110087

In [75]:
# Now that I know how many unexpected street names are there in the data I can 
# prepare a better data structure to contain them.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE) # regex that looks for the last word in a line

# creating a list of expected street names
expected = ['Delhi', 'Street', 'Nagar', 'Sadan', 'Marg', 'Road', 'Avenue', 'Circle', 'Mayur'
            'Place', 'Sector', 'Vihar', 'Enclave', 'Block', 'Chowk', 'Colony', 'Mandir', 'Market', 'Place', 'Lane',
            'Estate', 'Bazaar', 'Kunj', 'Circus', 'Extension', 'College']

# creating a dictionary to map problematic names with corrected names
mapping = {"delhi" : "Delhi",
           "road" : "Road",
           "block" : "Block",
           "sector": "Sector",
           "nagar" : "Nagar",
           "NAGAR" : "Nagar",
           "sadan" : "Sadan",
           "marg" : "Marg",
           "ave" : "Avenue",
           "ave." : "Avenue",
           "vihar" : "Vihar",
           "place" : "Place",
           "colony" : "Colony",
           "chowk" : "Chowk",
           "circle" : "Circle",
           "VIJAY" : "Vijay",
           "II" : "2",
           "mayor" : "Mayur",
           "–" : "",
           "north" : "North",
           "flat" : "Flat",
          "janakpuri" : "Janakpuri",
          "footpath" : "Footpath",
          "GK" : "Greater Kailash",
          "-" : " ",
          "bazar" : "Bazar",
          "Pahargan" : "Paharganj"}

counter = 0 # counter to check total number of street names
street_dict = defaultdict(set) # container for the unexpected values

for event, elem in ET.iterparse(bigger_sample_file, events=("start", )):
    if elem.tag == "node" or elem.tag == "way":
        
        for tag in elem.iter("tag"):
            if tag.attrib["k"] == "addr:street":
                counter += 1 # counter increments if street name is found
                match = street_type_re.search(tag.attrib["v"]) # Last name of street names
                # are referenced to a variable match
                if match: # if a pattern matches successfully
                    street_name = match.group() # group the matched patterns
                    if street_name not in expected: # if the group matches expected street names
                        street_dict[street_name].add(tag.attrib["v"]) # append them

pprint(dict(street_dict))
print()
print("{:.2f}% {}".format((len(dict(street_dict))/counter)*100, "Unexpected street names the sample dataset"))
print()
# To actually correct the unexpected street names
# I'll iterate over the key and values of the street_dict by calling the .item() method on it.
for k, v in street_dict.items():
    for name in v: # As v contain a set. I'll iterate over the items in the set.  
        for key in mapping.keys(): # I'll also iterate over the keys of the mapping dict
            match = re.search(key, name) # and then search for a pattern between the name and keys
            if match: # if a match is found 
                print("Match Found:", name, "*", match.group())
                change_name = re.sub(key, mapping[key], name) # I'll sub. the bad names with the good ones in mapping dict
                print("Correction:", name, "-->", change_name.title(),"\n")

{'17C': {'Sector 17C'},
 '2': {'South City 2'},
 '210': {'Road 210'},
 '3': {'Gharoli Road, Mayur Vihar Phase - 3'},
 '4': {'DLF Phase 4'},
 '44': {'Sector 44'},
 '46': {'Sector 46'},
 '56': {'Sector 56'},
 '6': {'Sector 6'},
 'A': {'Block A'},
 'A-6': {'A-6'},
 'A1': {'Block A1'},
 'B': {'Block B'},
 'Delhi-110089': {'G-5, Sector-16, Rohini, New Delhi-110089'},
 'Flats': {'Street E, Munirka DDA Flats'},
 'Gurgaon': {'Bilaspur Khurd, Gurgaon'},
 'Harola': {'Hanuman Market Harola'},
 'II': {'South City II'},
 'Janakpuri': {'C Block, Janakpuri'},
 'NIT': {'NIT'},
 'No.1': {'Block No.1'},
 'No.10': {'Block No.10'},
 'No.11': {'Block No.11'},
 'No.12': {'Block No.12'},
 'No.13': {'Block No.13'},
 'No.14': {'Block No.14'},
 'No.2': {'Block No.2'},
 'No.3': {'Street No.3', 'Block No.3'},
 'No.5': {'Block No.5'},
 'No.6': {'Block No.6'},
 'No.7': {'Block No.7'},
 'No.8': {'Block No.8'},
 'No.9': {'Block No.9'},
 'Noida,': {'C - 99, Sector - 4, Sector 4 Noida, Block C, Sector 4, Noida,'},
 'Ok

In [37]:
# funtions that deal with street names
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

def update_street_name(name, mapping):
    """
    Return correct name if name is found in mapping dictionary. 
    Return name otherwise.
    """
    for key in mapping.keys():
        match = re.search(key, name) 
        if match:
            correct_name = re.sub(key, mapping[key], name)
            
            return correct_name.title()
        
    return name
                
def audit_street_type(street_name):
    """
    Return updated street name if name is not in expected street name list. 
    Return street name otherwise.
    """
    match = street_type_re.search(street_name)
    if match:
        street_type = match.group()
        if street_type not in expected:
            return update_street_name(street_name, mapping)
    
    return street_name

def is_street_name(elem):
    """Return True if the value of two operands are equal. Return False otherwise."""
    return elem.attrib['k'] == "addr:street"


In [84]:
# phone number with problems
for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "node" or elem.tag == "way":
        for tag in elem.iter("tag"):
            if "phone" in tag.attrib["k"]:
                print("yes:", tag.attrib["k"], tag.attrib["v"])

yes: phone +91 11 3955 5000
yes: phone 91-11-2687-6564
yes: phone +91-120-3830000; +91-11-23582812 / 4658
yes: phone +911126801804
yes: phone +91 11 3218206
yes: phone 9990291418
yes: phone 08527450737
yes: phone +91 9958080618
yes: phone +918130770731
yes: phone 9910536692
yes: phone 911146082303
yes: phone 08287872020
yes: phone +911141677410
yes: phone +91 11 4995 9500
yes: phone +91 9172313994
yes: phone +911140047000
yes: phone +919911119039
yes: phone 0120 252 0242
yes: phone 8586986389
yes: phone +91 11 4309 0000
yes: phone +91 95407 59909


In [38]:
# functions to fix phone numbers
def is_phone_number(elem):
    """
    Return True if the value of the operands are equal. Return False otherwise.
    """
    return elem.attrib["k"] == "phone"

def phone_fixer(phone):
    """
    Return a phone number after striping its whitespaces or non-digit characters except "+" (plus symbol). 
    """
    phone_num = phone
    join_num = "".join(phone_num.strip().split())
    match_num = re.search(r"[^\d+]", join_num)
    if match_num:
        fix_num = re.sub(match_num.group(), "", join_num)
        
        return fix_num
    
    return join_num    

In [39]:
# code with all the pieces together. This code should now clean the street names, pin codes and phone numbers.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected = ['Delhi', 'Street', 'Nagar', 'Sadan', 'Marg', 'Road', 'Avenue', 'Circle', 'Mayur'
            'Place', 'Sector', 'Vihar', 'Enclave', 'Block', 'Chowk', 'Colony', 'Mandir', 'Market', 'Place', 'Lane',
            'Estate', 'Bazaar', 'Kunj', 'Circus', 'Extension', 'College', "Paharganj", "Janakpuri", "Flat", "Palika", 
            'Gate']

CREATED = ["version", "changeset", "timestamp", "user", "uid"] # list of expected tags

mapping = {"delhi" : "Delhi",
           "road" : "Road",
           "block" : "Block",
           "sector": "Sector",
           "nagar" : "Nagar",
           "NAGAR" : "Nagar",
           "sadan" : "Sadan",
           "marg" : "Marg",
           "ave" : "Avenue",
           "ave." : "Avenue",
           "vihar" : "Vihar",
           "place" : "Place",
           "colony" : "Colony",
           "chowk" : "Chowk",
           "circle" : "Circle",
           "VIJAY" : "Vijay",
           "II" : "2",
           "–" : "",
           "north" : "North",
           "flat" : "Flat",
          "janakpuri" : "Janakpuri",
          "footpath" : "Footpath",
          "GK" : "Greater Kailash",
          "-" : " ",
          "bazar" : "Bazar",
          "Pahargan" : "Paharganj",}

street_dict = defaultdict(set)

nodes_lst = [] # This list will append updated dictionary

for event, elem in ET.iterparse(bigger_sample_file):
    if elem.tag == "node" or elem.tag == "way":
        nodes = {"created": {}, "type" : elem.tag} # dictionary that will store attributes and their values
        if "lat" and "lon" in elem.attrib:
                nodes["pos"] = [float(elem.attrib["lat"]), float(elem.attrib["lon"])]
        
        for tag in elem.attrib:
            if tag == "lat" or tag == "lon":
                continue
            elif tag in CREATED:
                nodes["created"][tag] = elem.attrib[tag]
            else:
                nodes[tag] = elem.attrib[tag]
        
        for data in elem.iter("tag"):
            match = problemchars.search(data.attrib["k"])
            if match:
                continue
            elif data.attrib["k"].startswith("addr:"):
                address = data.attrib["k"].split(":", 2)  
                if len(address) == 2:
                    if "address" not in nodes:
                        nodes["address"] = {}
                    elif is_street_name(data):
                        nodes["address"][address[1]] = audit_street_type(data.attrib["v"])
                    elif is_pin_code(data):
                        nodes["address"][address[1]] = pin_validator(simple_pin_code_fix(data.attrib["v"]))
            else:
                if is_phone_number(data):
                    nodes[data.attrib["k"]] = phone_fixer(data.attrib["v"])
                else:
                    nodes[data.attrib["k"]] = data.attrib["v"]
        
        node_ref = []        
        for nd_tag in elem.iter("nd"):
            node_ref.append(nd_tag.attrib["ref"])
        
        if len(node_ref) > 0:
            nodes["node_ref"] = node_ref
        
        nodes_lst.append(nodes)            

In [40]:
# total number of entries
len(nodes_lst)

410188

In [42]:
# fixed phone numbers
for key in nodes_lst:
    if "phone" in key.keys():
        print(key["phone"])

+911139555000
911126876564
+911203830000;+911123582812/4658
+911126801804
+91113218206
9990291418
08527450737
+919958080618
+918130770731
9910536692
911146082303
08287872020
+911141677410
+911149959500
+919172313994
+911140047000
+919911119039
01202520242
8586986389
+911143090000
+919540759909


In [43]:
# fixed street names and postal address
for key in nodes_lst:
    if "address" in key.keys():
        for k, v in key["address"].items():
            print(k, v)
            break

street Old Delhi Gurgaon Road
street Aurangzeb Road
street Bhavani Kunj, Vasant Kunj
street Sector 46
postcode 122002
street South City 2
street South City 2
street South City 2
street Sector 17C
street Sector 46
street Sector 46
street Sector 46
street Sector 46
street Sector 46
street Sector 46
street Sector 46
street Sector 46
street Sector 46
street Sector 46
street Sector 46
street Sector 46
street Sector 46
street Sector 46
street S1
street S1
street S1
street S1
street S1
street S1
street Palam Vihar
street Palam Vihar
street Palam Vihar
street Palam Vihar
street Palam Vihar
street Palam Vihar
street Palam Vihar
postcode 110007
street Sector 46
street Sector 46
street Sector 46
street NIT
street Connaught Circus
street 649 6Th Floor, Tower A Spaze Itechpark
street Chuna Mandi, Paharganj
street South Extension
postcode 110074
street Shiv Arcade,Aacharya Niketan,Mayor Vihar Ph-1
street Ansari Nagar
postcode 110042
postcode 110042
street Asaf Ali Road
street MAMURA Road
street Hanu