The primary goal of this analysis is to determine the quality of OpenStreetMap address data in the Bergen, Norway region.

In [175]:
#importing classes from display and pretty print modules
from pprint import pprint
from IPython.display import HTML
from IPython.display import display

In [157]:
#Setting up MongoDB connection
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
db = client.osm
#Creating db.bergen as a variable for the sake of brewity
bergen = db.bergen

In [173]:
display_html('<b>Count of documents in database:</b>')

In [170]:
#Getting an initial overview of the data
display(HTML('<b>Count of documents in database:</b>'),bergen.count())
display(HTML('<b>First record:</b>'))
pprint(bergen.find_one())

681172

{'_id': ObjectId('58664d22fff9a2e7e4db1a23'),
 'created': {'changeset': '6007582',
             'timestamp': '2010-10-10T22:29:54Z',
             'uid': '114230',
             'user': 'danerikk',
             'version': '2'},
 'id': '358075',
 'pos': [60.5296371, 5.250335],
 'type': 'node'}


In [13]:
#Creating indexes

from pymongo import ASCENDING

bergen.create_index([('address', ASCENDING),('address.street', ASCENDING),('address.housenumber', ASCENDING)])


'address_1_address.street_1_address.housenumber_1'

In [14]:
#Getting count of documents with address field

address_query = { 'address' : {'$exists' : True } }
address_documents = bergen.find(address_query)
address_count = address_documents.count()

display(HTML('<b>Number of addresses in dataset:</b>'),address_count)


84625

In [15]:
#EXPERIMENTAL, WILL BE REMOVED
tmp_agg = db.bergen.aggregate([
         { "$group" : { "_id" : "$address.street",
                       "mset" : {
                         "$addToSet" : "$address.house_number"
                 } } },
         { "$unwind" : "$mset"},
         { "$group" : { "_id" : "$_id",
                       "count" : { "$sum" : 1} } },
         { "$sort" : { "count" : -1 } },
         { "$limit" : 10 } ] )

for doc in tmp_agg:
    print(doc)

In [177]:
#Getting counts for streetnames and addresses

aggregated = bergen.aggregate([  
        {'$match' : {'address': {'$exists' : True } } },
        { "$group" : { 
                "_id" : "$address.street","count" : { "$sum" : 1} } }
    ])

household_count = 0
unique_street_count = 0
addresses_on_street = {}

for doc in aggregated:
    household_count += doc['count']
    unique_street_count += 1
    
    addresses_on_street[doc['_id']] = doc['count']

print("total addresses in Bergen:", household_count)
print("number of streetnames:", unique_street_count)

total addresses in Bergen: 84625
number of streetnames: 2237


According to January 2016 data from Statistics Norway (SSB), there are 134,328 households in Bergen. The data used by Statistics Norway is collected from the National Registry, and the data include unit numbers for minimum 95% of the addresses where such a number exists. The available OSM data does not contain unit numbers. Several addresses in Bergen contain multiple home units, and although the OSM data also contains non-household addresses (businesses, public institutions etc.) the number of addresses in the OSM data seems reasonable.

In [17]:
#EXPERIMENTAL, WILL BE REMOVED
streetnames_sorted_dict = dict(sorted(addresses_on_street.items(), key=lambda x: x[1], reverse=True))

for rec in streetnames_sorted_dict:
    if streetnames_sorted_dict[rec] == 184:
        print (rec,streetnames_sorted_dict[rec],type(rec))

None 184 <class 'NoneType'>
Skjenlien 184 <class 'str'>
Olsvikmarken 184 <class 'str'>
Grimstadvegen 184 <class 'str'>


In [178]:
#Taking a look at the streets with the most addresses

from operator import itemgetter

streetnames_sorted_dict = dict(sorted(addresses_on_street.items(), key=lambda x: x[1], reverse=True)[:10])
streetnames_sorted_list = sorted(addresses_on_street.items(), key=lambda x: x[1], reverse=True)


print("Streets with most addresses on them:")

for street,count in streetnames_sorted_list[0:10]:
    print(street,count)

Streets with most addresses on them:
Myrdalskogen 442
Askvegen 397
Søråshøgda 377
Kringlebotn 304
Flaktveitvegen 293
Stongafjellsvegen 289
Hjellestadvegen 277
Hetlevikåsen 276
Langarinden 273
Nipedalen 250


### Question for 1:1 
Elaborate on this? Keep/remove?

In [183]:
#Checking for potential duplicate data due to misspelled street names

import difflib
from fuzzywuzzy import fuzz

def fuzzy_streets(ratio,house_count):
    
    fuzzy_matches = list()
    compare_count = 0
    
    for k1 in streetnames_sorted_list:

        if k1[0] is None:
            print("Addresses without street name:",k1[1])

        #Only comparing street names with less addresses than house_count
        elif k1[1] <= house_count:
            
            compare_count += 1

            for k2 in streetnames_sorted_list:

                if k2[0] is None:
                    pass

                elif k2[0] == k1[0]:
                    pass

                else:                    
                    
                    fuzz_ratio = fuzz.ratio(k1[0],k2[0])
                    
                    if fuzz_ratio >= ratio:
                        fuzzy_matches.append({k1: k2,"fuzz ratio": fuzz_ratio})

    print("Number of street names compared: {0} of {1}".format(compare_count,len(streetnames_sorted_list)))
    
    return fuzzy_matches

In [182]:
#Lower than 90 fuzzy ratio gives too many false positives. Same goes for higher than 10 addresses on the street.
potential_misspellings = fuzzy_streets(92,10)

Addresses without street name: 184
Number of street names compared: 532 of 2237


### Question for 1:1
Should I take a closer look at the addresses without street name?

In [221]:
#Printing out the potential misspellings

#Adding index to make it easier to sort out the items I need to investigate further
for index, item in enumerate(potential_misspellings):
    print(index,item)

0 {('Dreggsallmenningen', 10): ('Dreggsallmenning', 1), 'fuzz ratio': 94}
1 {('Herman Foss’ gate', 9): ("Herman Foss' gate", 1), 'fuzz ratio': 94}
2 {('Vilhelm Bjerknesvei', 8): ('Vilhelm Bjerknes’ vei', 111), 'fuzz ratio': 95}
3 {('Solhaugveien', 7): ('Solhaugvegen', 22), 'fuzz ratio': 92}
4 {('Flyplassveien', 7): ('Flyplassvegen', 27), 'fuzz ratio': 92}
5 {('Haakon Sheteligs plass', 6): ('Haakon Shetelings plass', 2), 'fuzz ratio': 98}
6 {('Austevågen', 5): ('Austrevågen', 21), 'fuzz ratio': 95}
7 {('Vestre Mulelvsmauet', 4): ('Østre Mulelvsmauet', 3), 'fuzz ratio': 92}
8 {('Østre Mulelvsmauet', 3): ('Vestre Mulelvsmauet', 4), 'fuzz ratio': 92}
9 {('Travparkvegen', 3): ('Travparkveien', 1), 'fuzz ratio': 92}
10 {('Nesttunveien', 2): ('Nesttunvegen', 59), 'fuzz ratio': 92}
11 {('C.Sundtsgate', 2): ('C. Sundts gate', 53), 'fuzz ratio': 92}
12 {('Haakon Shetelings plass', 2): ('Haakon Sheteligs plass', 6), 'fuzz ratio': 98}
13 {'fuzz ratio': 97, ('Torgalmenningen', 2): ('Torgallmenninge

In [218]:
#Ensuring corrected street names in cleaning script are in fact corrected in the database
for street,count in streetnames_sorted_list:
    
    if street is None:
        pass
    
    elif ('Thormøhlens' or 'Smøråshøgda 9' or 'Laguneveien 1') in street:
        #expecting 1 result
        print(street,count)

Thormøhlens gate 47


"Thormøhlens gate 47" is what I expect to get returned from the cell above, and nothing else. When I ran this previously I discovered that the street names had not been properly corrected, so I had to go back to the pre-import wrangling file and make the necessary edits.

### Question for 1:1
Should I remove the text explanation above? what about the import test?

In [222]:
#INCOMPLETE. I will use this to filter out what I need to take a closer look at
true_duplicates = []
investigate_further = []

# for index, item in enumerate(potential_misspellings):
#     if item[0] in (6,17,1,)

Above I have performed some QA on the street names from the Bergen OSM dataset. I have taken a closer look at the street names with less than 10 house numbers, and I have compared those street with the other street names to spot potential misspelled and duplicate street names.

I have manually reviewed the returned list of (fuzzy) matched street names, and I have added what I consider true duplicates (based on local knowledge) to a new list, `true_duplicates`. Some of the matched street names require further investigation, and I have therefore created a separate list for those items, called `investigate_further`.

In [25]:
#Creating functions for printing individual address search results

#without postal code
def search_one_address(street, housenumber):
    
    housenumber = str(housenumber)
    
    query = { 'address.street': street, 'address.housenumber': housenumber }

    for doc in bergen.find(query):
        pprint(doc)
    
    return
#with postal code
def search_one_address_with_postal_code(street, housenumber):
    query = { 'address.street': street, 'address.housenumber': housenumber }

    for doc in bergen.find(query):
        pprint(doc)
    
    return

In [26]:
#Searching for duplicates of Laguneveien 1

search_one_address('Laguneveien',1)

{'_id': ObjectId('58664d29fff9a2e7e4dd8f6b'),
 'address': {'city': 'Rådal',
             'housenumber': '1',
             'postcode': '5239',
             'street': 'Laguneveien'},
 'created': {'changeset': '26026343',
             'timestamp': '2014-10-12T14:10:49Z',
             'uid': '103253',
             'user': 'gormur',
             'version': '1'},
 'id': '3125931672',
 'pos': [60.2968652, 5.3311546],
 'type': 'node'}
{'_id': ObjectId('58664d39fff9a2e7e4e24ba1'),
 'address': {'city': 'Rådal',
             'floor': '1',
             'housenumber': '1',
             'postcode': '5239',
             'street': 'Laguneveien'},
 'contact': {'facebook': 'https://www.facebook.com/arnasomogstrikkas'},
 'created': {'changeset': '36459796',
             'timestamp': '2016-01-09T09:12:57Z',
             'uid': '1965308',
             'user': 'FredrikLindseth',
             'version': '1'},
 'id': '3935489347',
 'pos': [60.2968112, 5.3317375],
 'type': 'node'}
{'_id': ObjectId('58664d3ffff

### Question for 1:1
I need help interpreting why there are multiple documents for the same address. Are they duplicates, or is this ok? What about the node references?

In [27]:
pipeline = [
    { '$match': { 'address.street': 'Laguneveien' } },
    { '$group': { 
            '_id': '$address.postcode', 'count' : {'$sum': 1 } 
        } 
    },
    {'$sort' : {'count' : -1} }
    
]

for doc in bergen.aggregate(pipeline):
    pprint(doc)

{'_id': '5239', 'count': 19}
{'_id': '5235', 'count': 1}


In [225]:
query = { 'address.street': 'Laguneveien', 'address.housenumber': '1' }

for doc in bergen.find(query):
    pprint(doc)

{'_id': ObjectId('58664d29fff9a2e7e4dd8f6b'),
 'address': {'city': 'Rådal',
             'housenumber': '1',
             'postcode': '5239',
             'street': 'Laguneveien'},
 'created': {'changeset': '26026343',
             'timestamp': '2014-10-12T14:10:49Z',
             'uid': '103253',
             'user': 'gormur',
             'version': '1'},
 'id': '3125931672',
 'pos': [60.2968652, 5.3311546],
 'type': 'node'}
{'_id': ObjectId('58664d39fff9a2e7e4e24ba1'),
 'address': {'city': 'Rådal',
             'floor': '1',
             'housenumber': '1',
             'postcode': '5239',
             'street': 'Laguneveien'},
 'contact': {'facebook': 'https://www.facebook.com/arnasomogstrikkas'},
 'created': {'changeset': '36459796',
             'timestamp': '2016-01-09T09:12:57Z',
             'uid': '1965308',
             'user': 'FredrikLindseth',
             'version': '1'},
 'id': '3935489347',
 'pos': [60.2968112, 5.3317375],
 'type': 'node'}
{'_id': ObjectId('58664d3ffff

In [226]:
query = { 'address.street': 'Laguneveien', 'address.postcode': '5235', 'address.housenumber': 1 }

for doc in bergen.find(query):
    pprint(doc)

{'_id': ObjectId('58664d27fff9a2e7e4dccb33'),
 'address': {'city': 'Rådal',
             'housenumber': 1,
             'postcode': '5235',
             'street': 'Laguneveien'},
 'created': {'changeset': '39294271',
             'timestamp': '2016-05-13T14:59:10Z',
             'uid': '1965308',
             'user': 'FredrikLindseth',
             'version': '6'},
 'id': '1652908136',
 'pos': [60.2962144, 5.3301382],
 'type': 'node'}


According to The Norwegian Mapping Authority, the correct postal code for Laguneveien is 5239. The 5235 document is incorrect.

In [29]:
#Finding duplicate addresses

pipeline = [
    { '$group': { 
            '_id': { 
                'street': '$address.street', 'housenumber': '$address.housenumber' 
            }, 
            'count' : {'$sum': 1 } 
        } 
    },
    { '$match': {'count': {'$gt': 1} } },
    {'$sort' : {'count' : -1} } ]

duplicate_addresses = []

for doc in bergen.aggregate(pipeline):
    duplicate_addresses.append(doc)

print("Number of potential duplicate addresses:", len(duplicate_addresses))

Number of potential duplicate addresses: 901


In [227]:
#Printing out top 10 addresses with duplicates
pprint(duplicate_addresses[1:10])

[{'_id': {'housenumber': '66', 'street': 'Kanalveien'}, 'count': 13},
 {'_id': {'street': 'Lyngmarka'}, 'count': 12},
 {'_id': {'housenumber': '6', 'street': 'Valkendorfsgaten'}, 'count': 12},
 {'_id': {'housenumber': '37', 'street': 'Kalfarveien'}, 'count': 11},
 {'_id': {'housenumber': '64', 'street': 'Kanalveien'}, 'count': 10},
 {'_id': {'housenumber': '10', 'street': 'Sandslihaugen'}, 'count': 10},
 {'_id': {'housenumber': '3', 'street': 'Lilandsveien'}, 'count': 10},
 {'_id': {'housenumber': '62', 'street': 'Kanalveien'}, 'count': 9},
 {'_id': {'housenumber': '2', 'street': 'Vetrlidsallmenningen'}, 'count': 8}]


In [31]:
#Looking at all documents with the top duplicate address
search_one_address('Kanalveien','66')

{'_id': ObjectId('58664d40fff9a2e7e4e43c2e'),
 'address': {'city': 'Bergen',
             'housenumber': '66',
             'postcode': '5068',
             'street': 'Kanalveien'},
 'created': {'changeset': '40278280',
             'timestamp': '2016-06-25T09:11:13Z',
             'uid': '1965308',
             'user': 'FredrikLindseth',
             'version': '1'},
 'id': '4264196717',
 'pos': [60.3621861, 5.3469652],
 'type': 'node'}
{'_id': ObjectId('58664d40fff9a2e7e4e43c2f'),
 'address': {'city': 'Bergen',
             'housenumber': '66',
             'postcode': '5068',
             'street': 'Kanalveien'},
 'created': {'changeset': '40278280',
             'timestamp': '2016-06-25T09:11:13Z',
             'uid': '1965308',
             'user': 'FredrikLindseth',
             'version': '1'},
 'id': '4264196718',
 'pos': [60.3620529, 5.3469736],
 'type': 'node'}
{'_id': ObjectId('58664d40fff9a2e7e4e43c32'),
 'address': {'city': 'Bergen',
             'housenumber': '66',
     

In [32]:
#INCOMPLETE looking at one of the nodes

bergen.find_one({'id': '4264197029'})

{'_id': ObjectId('58664d40fff9a2e7e4e43d06'),
 'created': {'changeset': '40278280',
  'timestamp': '2016-06-25T09:11:20Z',
  'uid': '1965308',
  'user': 'FredrikLindseth',
  'version': '1'},
 'id': '4264197029',
 'pos': [60.3620325, 5.3469011],
 'type': 'node'}

In [39]:
#Converting list for dataframe usage
to_df = []

for dic in duplicate_addresses:
    dictionary = dict()
    
#     for key,val in dic.items():
#         dictionary[key] = val
        
#         if key == '_id':
    for key,val in dic['_id'].items():
        dictionary[key] = val
        
    dictionary['count'] = dic['count']
                
        
    to_df.append(dictionary)
    
to_df

[{'count': 596731},
 {'count': 13, 'housenumber': '66', 'street': 'Kanalveien'},
 {'count': 12, 'street': 'Lyngmarka'},
 {'count': 12, 'housenumber': '6', 'street': 'Valkendorfsgaten'},
 {'count': 11, 'housenumber': '37', 'street': 'Kalfarveien'},
 {'count': 10, 'housenumber': '64', 'street': 'Kanalveien'},
 {'count': 10, 'housenumber': '10', 'street': 'Sandslihaugen'},
 {'count': 10, 'housenumber': '3', 'street': 'Lilandsveien'},
 {'count': 9, 'housenumber': '62', 'street': 'Kanalveien'},
 {'count': 8, 'housenumber': '2', 'street': 'Vetrlidsallmenningen'},
 {'count': 8, 'housenumber': '2', 'street': 'Torget'},
 {'count': 7, 'housenumber': '4-10', 'street': 'Vilhelm Bjerknesvei'},
 {'count': 6, 'housenumber': '351', 'street': 'Lyderhornsveien'},
 {'count': 6, 'housenumber': '28', 'street': 'Inndalsveien'},
 {'count': 5, 'housenumber': '7', 'street': 'Østre Murallmenningen'},
 {'count': 5, 'housenumber': '13', 'street': 'Laguneveien'},
 {'count': 5, 'housenumber': '18', 'street': 'Stran

### Question for 1:1
OK to use Pandas?

In [41]:
#For easier review importing list to Pandas dataframe
import pandas as pd

df_duplicate_addresses = pd.DataFrame(to_df, columns=['street','housenumber','count'])

In [68]:
df_duplicate_addresses.sort_values('street')

#Counting number of duplicate housenumbers per street
df_duplicate_addresses.groupby('street').agg({'count':'count'}).sort_values('count',ascending=False)

Unnamed: 0_level_0,count
street,Unnamed: 1_level_1
Strandgaten,42
Storevarden,32
Kong Oscars gate,29
Marken,20
Storhaugen,20
Solåsen,18
Djupedalen,17
Fagerbakken,16
St. Hanshaugen,15
Liavegen,14


In [228]:
#Looking at the duplicate addresses of the street with the most duplicates
df_duplicate_addresses.where(df_duplicate_addresses['street'] == 'Strandgaten').dropna()

Unnamed: 0,street,housenumber,count
35,Strandgaten,18,4.0
42,Strandgaten,68,4.0
89,Strandgaten,74,3.0
198,Strandgaten,84,2.0
210,Strandgaten,77,2.0
213,Strandgaten,72,2.0
235,Strandgaten,3,2.0
295,Strandgaten,212,2.0
316,Strandgaten,201,2.0
332,Strandgaten,71,2.0


In [65]:
#Taking a look at one of the duplicate addresses of Standgaten
query = {'address.street': 'Strandgaten', 'address.housenumber': '74'}

for doc in bergen.find(query):
    pprint(doc)

{'_id': ObjectId('58664d29fff9a2e7e4dd6d59'),
 'address': {'city': 'Bergen',
             'housenumber': '74',
             'postcode': '5004',
             'street': 'Strandgaten'},
 'created': {'changeset': '36917201',
             'timestamp': '2016-01-31T13:57:43Z',
             'uid': '1965308',
             'user': 'FredrikLindseth',
             'version': '3'},
 'id': '2710063467',
 'pos': [60.3954429, 5.3178704],
 'type': 'node'}
{'_id': ObjectId('58664d2cfff9a2e7e4de77d9'),
 'address': {'city': 'Bergen',
             'housenumber': '74',
             'postcode': '5004',
             'street': 'Strandgaten'},
 'created': {'changeset': '26027382',
             'timestamp': '2014-10-12T15:17:26Z',
             'uid': '103253',
             'user': 'gormur',
             'version': '1'},
 'id': '3126092534',
 'pos': [60.395412, 5.3178227],
 'type': 'node'}
{'_id': ObjectId('58664d41fff9a2e7e4e4aa17'),
 'address': {'city': 'Bergen',
             'housenumber': '74',
             '

Looking at the 

Looking up the adress with the most duplicates (13), Kanalveien 66, on OpenStreetMap.org, it becomes apparent that the reason for the many duplicates is that there are multiple business located at that address, and each business seems to have gotten its own address. 

According to the [OSM wiki](http://wiki.openstreetmap.org/wiki/Addresses#How_to_map_addresses), the policy on duplicate addresses is unclear in such cases: "However, there is still some debate on that point (see for example Address information in POI *and* building? on help.openstreetmap.org). Also, the community in some countries has established their own rules."

According to the address page of the OSM wiki, in mid-2014 all Norwegian official addresses were released to the public. Efforts is being made by OSM volunteers to include the released data in OSM, and the progress is being tracked using a tool called [Beebeetle](http://osm.beebeetle.com/addrnodeimportstatus.php). As of January 7, 2017, the Bergen import is listed as 99.84% complete. 1 known address duplicate is listed on the site, for Solheimsgaten [SJEKK].

In [24]:
#INCOMPLETE Looking at contributors

tmp_agg = bergen.aggregate([  
        { "$group" : { 
                "_id" : { "uid": "$created.uid", "username": "$created.user" },"count" : { "$sum" : 1} } },
        { "$sort" : { "count" : -1 } },
         { "$limit" : 10 }
#         { "$project" : { "_id": 0, "user": "$created.user" } } 
    ])

for doc in tmp_agg:
    print(doc)

{'count': 140794, '_id': {'uid': '2114448', 'username': 'FredrikLindseth_import'}}
{'count': 133655, '_id': {'uid': '2836853', 'username': 'frokor_import'}}
{'count': 80243, '_id': {'uid': '103253', 'username': 'gormur'}}
{'count': 39789, '_id': {'uid': '992708', 'username': 'Christian Madsen'}}
{'count': 36440, '_id': {'uid': '722193', 'username': 'daviesp12'}}
{'count': 31427, '_id': {'uid': '170061', 'username': 'frokor'}}
{'count': 29969, '_id': {'uid': '1965308', 'username': 'FredrikLindseth'}}
{'count': 22168, '_id': {'uid': '715936', 'username': 'Gazer75'}}
{'count': 19287, '_id': {'uid': '3119148', 'username': 'cmeeren_import'}}
{'count': 16081, '_id': {'uid': '8313', 'username': 'gisle'}}


### Sources

SSB: https://ssb.no/befolkning/statistikker/familie/aar/2016-04-14  
SSB: https://www.ssb.no/befolkning/samordnet-statistikk-for-husholdninger-og-boliger  
Kartverket: http://www.seeiendom.no/

OSM resource links:  
http://wiki.openstreetmap.org/wiki/Addresses#How_to_map_addresses  
http://wiki.openstreetmap.org/wiki/Addresses#Norway  
http://osm.beebeetle.com/addrnodeimportstatus.php

OSM links:  
Kanalveien 66 http://www.openstreetmap.org/search?query=kanalveien%2066#map=19/60.36224/5.34696  
