# Falschparker in Aachen, Daten von 2022

Danke an die anonyme Person, die bei FragDenStaat die Nerven hat, die Stadt zu fragen: [Anfrage](https://fragdenstaat.de/anfrage/rohdaten-der-ordnungswidrigkeiten-im-ruhenden-verkehr-2022-1/)


In [1]:
import os
import pandas as pd
import numpy as np
from helper import clean_data, batch_geocode
from dotenv import load_dotenv
from tqdm import tqdm
# Load variables from a local .env file into the process environment (python-dotenv).
# NOTE(review): presumably provides credentials/config used by the geocoding helper — confirm.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# Load the semicolon-delimited raw export and pass it straight through the
# project's clean_data helper.
# NOTE(review): judging by the printed preview, clean_data appears to add the
# poi / street / number / clean_address columns — confirm in helper.py.
df = clean_data(pd.read_csv("data/statistik.csv", sep=";"))

# Preview: first rows followed by the (rows, columns) shape, one per line
print(df.head(), df.shape, sep="\n")

   Aktenzeichen      Tattag   Zeit                      Tatort 2  Tatb-Nr.  \
0      95026408  01.01.2022  10:40            Martelenberger Weg    112050   
1      95026419  01.01.2022  12:28                 Elsassplatz 8    141310   
2      95026418  01.01.2022  12:48                 Elsassplatz 4    141310   
3      95026465  01.01.2022  13:43           Lütticher Straße 23    142671   
4      95026410  01.01.2022  15:20  Aachen-Haaren, jüd. Ehrenmal    141164   

   Verwarn-/Bußgeld            poi              street number  \
0                50           None  Martelenberger Weg          
1                20           None         Elsassplatz      8   
2                20           None         Elsassplatz      4   
3                70           None    Lütticher Straße     23   
4                55  jüd. Ehrenmal       Aachen-Haaren          

         clean_address  
0   Martelenberger Weg  
1        Elsassplatz 8  
2        Elsassplatz 4  
3  Lütticher Straße 23  
4        Aachen

In [4]:
# Resumable batch geocoding of the cleaned records.
#
# Rows whose "Aktenzeichen" is already present in the pickle checkpoint or in
# a previously written result CSV are skipped. The remaining rows are sent to
# the project's batch_geocode helper in small batches, and the accumulated
# result is checkpointed after every batch so an interrupted run can resume.

# Upper bound on the number of batches (about 17 rows each for the full data
# set). Fewer batches are used when fewer rows remain, so that no empty
# batches are sent to the geocoder on a resumed run.
MAX_BATCHES = 10000

cache_path = "data/cached_results.pkl"
csv_path = "data/statistik_geocoded.csv"

# Rows without a cleaned address are excluded from geocoding. Reassigning
# (instead of inplace=True) keeps re-runs of this cell predictable.
df = df.dropna(subset=["clean_address"])

# check how many unique addresses there are
uni = df["clean_address"].nunique()
print(f"Unique addresses: {uni}")

# create a df to store the results and pick up where we left off
if not os.path.exists(cache_path):
    res_df = pd.DataFrame()
else:
    print("Found cached results. Reading from pickle to continue geocoding.")
    # NOTE: unpickling can execute arbitrary code; only load checkpoints that
    # this notebook wrote itself.
    res_df = pd.read_pickle(cache_path)
    print(f"Found {len(res_df)} results in cache.")
    df = df[~df["Aktenzeichen"].isin(res_df["Aktenzeichen"])]
    print(f"Found {len(df)} addresses that are not in the cache.")

# check if we already have a csv with the results
if os.path.exists(csv_path):
    print("Found a csv with the results. Reading from csv to continue geocoding.")
    csv_df = pd.read_csv(csv_path)
    print(f"Found {len(csv_df)} results in csv.")
    df = df[~df["Aktenzeichen"].isin(csv_df["Aktenzeichen"])]
    print(f"Found {len(df)} addresses that are not in the csv.")
    res_df = pd.concat([res_df, csv_df], ignore_index=True).drop_duplicates(subset="Aktenzeichen")

# create batches
# Split row positions rather than the frame itself: np.array_split applied to
# a DataFrame goes through DataFrame.swapaxes, which recent pandas releases
# deprecate. The resulting partition is the same for the same batch count.
n_batches = min(MAX_BATCHES, len(df))
if n_batches:
    parts = [df.iloc[idx] for idx in np.array_split(np.arange(len(df)), n_batches)]
    print(f'Split df into {len(parts)} batches of ~{len(parts[0])} elements.')
else:
    # np.array_split rejects zero sections, and there is nothing to do anyway.
    parts = []
    print("No addresses left to geocode.")

# Geocode data in batches
# Registers tqdm's progress_* methods on pandas objects.
# NOTE(review): presumably used inside batch_geocode — confirm in helper.py.
tqdm.pandas()
for part in tqdm(parts):
    geocoded = batch_geocode(part.copy())
    res_df = pd.concat([res_df, geocoded], ignore_index=True).drop_duplicates(subset="Aktenzeichen")
    # save results after each batch to continue where we left off
    res_df.to_pickle(cache_path)
res_df.to_csv(csv_path, index=False)

# remove cache when done (it may not exist if no batch had to be processed)
if os.path.exists(cache_path):
    os.remove(cache_path)

print(res_df.shape)
print(res_df[['Aktenzeichen', 'lat', 'lon']].head())

Unique addresses: 16155
Found cached results. Reading from pickle to continue geocoding.
Found 442 results in cache.
Found 160723 addresses that are not in the cache.
Found a csv with the results. Reading from csv to continue geocoding.
Found 102 results in csv.
Found 160723 addresses that are not in the csv.
Split df into 10000 batches of ~17 elements.


  0%|          | 2/10000 [00:30<42:55:11, 15.45s/it]