In [2]:
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="geoapi")
location = geolocator.reverse("40.7128, -74.0060", exactly_one=True)
print(location.raw['address']['city'])  # Or 'town', 'village', etc.


AttributeError: 'NoneType' object has no attribute 'raw'

In [1]:
import pandas as pd

# Define the required columns
required_columns = [
    "name",
    "sport",
    "longitude",
    "latitude",
    "opening_hours",
    "contact:phone",
    "website",
    "addr:street",
    "addr:city",
    "addr:housenumber",
    "addr:postcode",
    "addr:country"
]

# Load both CSV files
france_df = pd.read_csv("fitness_centers_france.csv", low_memory=False)
germany_df = pd.read_csv("fitness_centers_germany.csv", low_memory=False)

# Fill missing 'addr:country' with 'FR' and 'DE'
france_df['addr:country'] = france_df['addr:country'].fillna('FR')
germany_df['addr:country'] = germany_df['addr:country'].fillna('DE')

# Select only the required columns
france_clean = france_df[required_columns]
germany_clean = germany_df[required_columns]

# Combine both cleaned datasets
combined_df = pd.concat([france_clean, germany_clean], ignore_index=True)

# Save to new CSV file
combined_df.to_csv("fitness_centers.csv", index=False)

print("✅ Data cleaned and saved to fitness_centers.csv")


✅ Data cleaned and saved to fitness_centers.csv


In [5]:
import pandas as pd

# Load the cleaned data
df = pd.read_csv("fitness_centers.csv", low_memory=False)

# Drop rows where 'name' is null
df = df.dropna(subset=["name"])

# Optionally reset index
df.reset_index(drop=True, inplace=True)

# Save the updated data back to the same file (or a new one if you prefer)
df.to_csv("fitness_centers.csv", index=False)

print("✅ Rows with null 'name' removed and file updated.")


✅ Rows with null 'name' removed and file updated.


In [1]:
import pandas as pd

# Load the cleaned data
df = pd.read_csv("fitness_centers.csv", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15442 entries, 0 to 15441
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              9388 non-null   object 
 1   sport             10235 non-null  object 
 2   longitude         15442 non-null  float64
 3   latitude          15442 non-null  float64
 4   opening_hours     3726 non-null   object 
 5   contact:phone     773 non-null    object 
 6   website           3791 non-null   object 
 7   addr:street       15420 non-null  object 
 8   addr:city         15377 non-null  object 
 9   addr:housenumber  5294 non-null   object 
 10  addr:postcode     15441 non-null  float64
 11  addr:country      15442 non-null  object 
 12  processed         15442 non-null  bool   
dtypes: bool(1), float64(3), object(9)
memory usage: 1.4+ MB


In [11]:
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="geoapi")
location = geolocator.reverse("48.9625482, 2.2920216", language='de')
print(location.address)  # Full formatted address


Fitness Park, 163, Avenue Joffre, Le Cygne d'Enghien, Épinay-sur-Seine, Saint-Denis, Seine-Saint-Denis, Île-de-France, Metropolitanes Frankreich, 93800, Frankreich


In [22]:
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="geoapi")
location = geolocator.reverse("47.344106, 4.9997986", language='de')
print(location.address)  # Full formatted address


Basic-Fit, Route de Troyes, Daix, Dijon, Côte-d'Or, Burgund und Freigrafschaft, Metropolitanes Frankreich, 21121, Frankreich


### File and overwrite all address fields using nominatim and geopy

In [15]:
df.columns

Index(['name', 'sport', 'longitude', 'latitude', 'opening_hours',
       'contact:phone', 'website', 'addr:street', 'addr:city',
       'addr:housenumber', 'addr:postcode', 'addr:country', 'processed'],
      dtype='object')

In [1]:
import os
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time

# === CONFIGURATION ===
INPUT_FILE = "fitness_centers.csv"
OUTPUT_FILE = "DE_fitness_centers.csv"
BATCH_SIZE = 500                 # Number of rows per batch
SLEEP_BETWEEN_BATCHES = 5        # Delay (seconds) between batches
USER_AGENT = "greenfitness_cleaner_batch"

# === LOAD DATA ===
if os.path.exists(OUTPUT_FILE):
    df = pd.read_csv(OUTPUT_FILE, low_memory=False)
else:
    df = pd.read_csv(INPUT_FILE, low_memory=False)

# Add a 'processed' column if not already present
if "processed" not in df.columns:
    df["processed"] = False

# === SET UP GEOCODER ===
geolocator = Nominatim(user_agent=USER_AGENT, timeout=10)
reverse = RateLimiter(lambda coords: geolocator.reverse(coords, language="de"), min_delay_seconds=1)

# === FUNCTION TO FILL ADDRESS DATA ===
def reverse_geocode(row):
    try:
        if pd.notnull(row["latitude"]) and pd.notnull(row["longitude"]):
            location = reverse((row["latitude"], row["longitude"]))
            if location and location.raw and "address" in location.raw:
                address = location.raw["address"]
                row["addr:street"] = address.get("road", "")
                row["addr:city"] = (
                    address.get("city") or
                    address.get("town") or
                    address.get("village") or
                    address.get("hamlet") or ""
                )
                row["addr:housenumber"] = address.get("house_number", "")
                row["addr:postcode"] = address.get("postcode", "")
    except Exception as e:
        print(f"⚠️ Error on row {row.name}: {e}")
    return row

# === PROCESS BATCHES ===
total_rows = len(df)
while True:
    unprocessed = df[df["processed"] == False]

    if unprocessed.empty:
        print("✅ All rows processed.")
        break

    # Take a batch
    batch = unprocessed.head(BATCH_SIZE)
    print(f"🟡 Processing batch {batch.index[0]}–{batch.index[-1]}")

    # Geocode the batch
    batch = batch.apply(reverse_geocode, axis=1)

    # Update original DataFrame with new values
    for col in ["addr:street", "addr:city", "addr:housenumber", "addr:postcode"]:
        df.loc[batch.index, col] = batch[col]
    df.loc[batch.index, "processed"] = True

    # Save progress to output file
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"✅ Batch saved to {OUTPUT_FILE}. Sleeping for {SLEEP_BETWEEN_BATCHES}s...\n")

    time.sleep(SLEEP_BETWEEN_BATCHES)

print("🎉 Finished all batches!")


🟡 Processing batch 13500–13999


 '37127' '33378' '61118' '76863' '76863' '76863' '71088' '13347' '13347'
 '23749' '65817' '13347' '13347' '13347' '13347' '27612' '47829' '19258'
 '21335' '24939' '55767' '14478' '25421' '81543' '80796' '37671' '71034'
 '31195' '49661' '80797' '98528' '73765' '88529' '89423' '22459' '88529'
 '88529' '88529' '88529' '50181' '06114' '95448' '53721' '61440' '04159'
 '10625' '80634' '04315' '04315' '34121' '27639' '12169' '45661' '10623'
 '95509' '95509' '95509' '95509' '47533' '90429' '95163' '95163' '95163'
 '95163' '12165' '10407' '10319' '31535' '63065' '47929' '21521' '59929'
 '74722' '54568' '39112' '53177' '90449' '33161' '83064' '24106' '70563'
 '99817' '15345' '87439' '68794' '55130' '06108' '35630' '60389' '64390'
 '85567' '90491' '36466' '44789' '56566' '97631' '97631' '97631' '97631'
 '97631' '97631' '66128' '27283' '72654' '18195' '63450' '94481' '94481'
 '94481' '75045' '75045' '91522' '54338' '61231' '47051' '80636' '41466'
 '41466' '41466' '41466' '81677' '61352' '96050' '3

✅ Batch saved to DE_fitness_centers.csv. Sleeping for 5s...

🟡 Processing batch 14000–14499
✅ Batch saved to DE_fitness_centers.csv. Sleeping for 5s...

🟡 Processing batch 14500–14999
✅ Batch saved to DE_fitness_centers.csv. Sleeping for 5s...

🟡 Processing batch 15000–15441
✅ Batch saved to DE_fitness_centers.csv. Sleeping for 5s...

✅ All rows processed.
🎉 Finished all batches!
