In [1]:
import csv
import pandas as pd
import json
import time
import pyproj
import requests
from random import randint
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim

### 1. Get Addresses from geoportal.staedteregion

In [2]:
url = "https://geoportal.staedteregion-aachen.de/GnUrlProxy?url=32026040&btl=/json/search/strhsnr/strasse/2"
contents = requests.get(url)
aachen_streets = contents.json()
aachen_street_json_path = "aachen_street.json"
json.dump(aachen_streets, open(aachen_street_json_path, 'w')) # dont call it multiple times, so temporarily save data to json file

### 2. Grab Data from https://geoportal.staedteregion-aachen.de/GnUrlProxy?url=32026040&btl=/json/search/strhsnr/hausnummer/2/{stree_id}

In [6]:
# load the previously saved json
file = open(aachen_street_json_path, 'r', encoding='utf-8')
data = json.load(file)

csv_save_path = "aachen_house_no.csv"
csvfile = open(csv_save_path, 'w', newline='', encoding='utf-8')
writer = csv.writer(csvfile)
writer.writerow(["id", "street", "house_no", "geom"]) 

house_no_url_prefix = "https://geoportal.staedteregion-aachen.de/GnUrlProxy?url=32026040&btl=/json/search/strhsnr/hausnummer/2/"
for street in aachen_streets:
    time.sleep(1) # rest 1 seconds for each request
    house_no_contents = requests.get(house_no_url_prefix+street["id"])
    
    try:
      house_nos = house_no_contents.json()

    except json.JSONDecodeError:
      print(f'street id: {street["id"]} has no content')
      continue

    for item in house_nos:
      writer.writerow([street["id"], street["bez"], item["nr"], item["geom"]])

street id: =2550 has no content
street id: =2555 has no content
street id: =2564 has no content
street id: =2563 has no content
street id: =2552 has no content
street id: =2553 has no content


### 3. Get Longitude, Latitude, Zipcode

In [14]:
df = pd.read_csv("aachen_house_no.csv")
geolocator = Nominatim(user_agent="my_project")

# get the city's coordinates
def getLonLat(addr):
    location = geolocator.geocode(addr) # it returns timed out easily, so sleep 1~3 second(s) for each request
    time.sleep(randint(1, 3))
    if location == None:
        return 0, 0
    lat = location.latitude
    lon = location.longitude
    zipcode = location.raw.get("display_name").split(",")[-2].strip() # it changes from time to time
    return lon, lat, zipcode

# house_no "Anzeigen" or "0" are invalid
excluded_values = ["Anzeigen", "0"]
df = df[~df['house_no'].isin(excluded_values)]

df["full_addr"] = df["street"] + " " + df["house_no"]
lon_lat_zip = [getLonLat(addr) for addr in df["full_addr"]]
df["longitude"], df["latitude"], df["zip_code"]  = zip(*lon_lat_zip)  # unzip the lon_lat list and assign the list

### 4. Only Select One House No. per Zipcode

In [21]:
def get_unique_first_zip(zip_code):
    if zip_code not in exist_zips and zip_code != None and int(zip_code) > 0:
        exist_zips.append(zip_code)
        return 1
    return 0

exist_zips = []
final_df = pd.DataFrame()

df["is_selected"] = df['zip_code'].apply(get_unique_first_zip)

df = df[df['is_selected'] == 1]
df = df.drop(columns='is_selected')

# avoid .0 and sort by zip code
df["zip_code"] = df["zip_code"].astype(int)
df = df.sort_values("zip_code")

### (Optional, Only for Demo) Get Sample Data 

In [24]:
sample_num = 20

sample_df = df.sample(sample_num)
final_df = sample_df

### 5. Project Longitude, Latitude to X-Y Coordinates Using WGS 84 to Web Mercator

In [25]:
def lon_lat_to_xy(longitude, latitude):
    return projector.transform(longitude, latitude)

projector = pyproj.Transformer.from_crs("EPSG:4326", "EPSG:3857")  # WGS 84 to Web Mercator

# print the x, y coordinates
coordinates = final_df.apply(lambda col: lon_lat_to_xy(col['longitude'], col["latitude"]), axis=1)
x, y = zip(*coordinates)
final_df["x"] = x
final_df["y"] = y

### 6. Calculate Distance Using geopy.geocoders Lib

In [26]:
from geopy.distance import geodesic

def get_dis_from_DHL_geopy(lat, lon):
    coord1 = (50.9379349, 7.0579797)  # depot's latitude and longitude
    coord2 = (lat, lon)   # latitude and longitude of point 2
    return geodesic(coord1, coord2).km

final_df['distance_from_depot'] = final_df.apply(lambda x: get_dis_from_DHL_geopy(x['latitude'], x['longitude']), axis=1)
final_df = final_df.sort_values("zip_code")

In [27]:
final_df

Unnamed: 0,id,street,house_no,geom,full_addr,longitude,latitude,zip_code,x,y,distance_from_depot
835,2843,Akazienstraße,6,297794.56 5630904.517,Akazienstraße 6,13.323375,52.426767,12207,5836121.0,1496701.0,463.725469
2751,3639,Am Gutshof,21,297970.827 5631253.887,Am Gutshof 21,13.51615,52.572795,13059,5852377.0,1518763.0,481.402251
2900,1037,Am Hügel,3,293015.852 5629024.022,Am Hügel 3,13.315682,52.593958,13437,5854733.0,1495821.0,469.453742
787,1013,Ahornstraße,19,292993.955 5629586.007,Ahornstraße 19,13.684694,52.469807,15566,5840912.0,1538067.0,488.563164
2984,2703,Am Kreuz,27,293109.797 5633013.521,Am Kreuz 27,6.783467,51.3388,40489,5715009.0,756902.4,48.558992
830,2843,Akazienstraße,2 a,297841.043 5630881.203,Akazienstraße 2 a,6.853297,51.000667,50765,5677368.0,764731.2,15.981924
41,3200,Aachener Straße,209,298879.446 5622691.722,Aachener Straße 209,6.922277,50.936282,50931,5670201.0,772465.8,9.540684
1102,1015,Alexanderstraße,65,294917.159 5629244.6,Alexanderstraße 65,6.090822,50.778615,52062,5652650.0,679307.9,70.366874
2284,1029,Am Branderhof,7,296200.842 5627297.056,Am Branderhof 7,6.110228,50.761588,52066,5650754.0,681480.4,69.567405
597,2901,Adele-Weidtman-Straße,48,293310.153 5631750.087,Adele-Weidtman-Straße 48,6.066768,50.800572,52072,5655094.0,676615.1,71.429202


In [28]:
final_df.to_csv('aachen_zip_to_depot.csv')