### 1. Setup & CSV einlesen


In [15]:
# -----------------------------------------------------------
# 1) Import necessary libraries
# -----------------------------------------------------------
import pandas as pd
import numpy as np
import re

# Pretty display options for DataFrames
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 150)

# -----------------------------------------------------------
# 2) Load the dataset (adjust filename if needed)
# -----------------------------------------------------------
df = pd.read_csv("berner_oberland.csv")

# First quick look
df.head()

Unnamed: 0,web_scraper_order,web_scraper_start_url,detail_url,title,location,difficulty_level,duration,distance,physical_demand,ascent,descent,target_group,season,recommended_period
0,1763920249-1,https://www.schweizer-wanderwege.ch/de/wanderv...,https://www.schweizer-wanderwege.ch/de/wanderv...,Alpen-OL auf dem Niederhorn,Berghaus Niederhorn,,45min,2.8km,tief,10m,365m,"Zyklus 1 (Grundschule / 6 - 9 Jahre alt), Zykl...",Sommer,
1,1763920251-2,https://www.schweizer-wanderwege.ch/de/wanderv...,https://www.schweizer-wanderwege.ch/de/wanderv...,Spielen am Bach und Bräteln im Suldtal,Spiez,,,2m,tief,,,"Zyklus 1 (Grundschule / 6 - 9 Jahre alt), Zykl...",Sommer,
2,1763920254-3,https://www.schweizer-wanderwege.ch/de/wanderv...,https://www.schweizer-wanderwege.ch/de/wanderv...,Schatzsuche Trail,Aeschi b. Spiez,,,1m,hoch,,,"Zyklus 1 (Grundschule / 6 - 9 Jahre alt), Zykl...",Sommer,
3,1763920256-4,https://www.schweizer-wanderwege.ch/de/wanderv...,https://www.schweizer-wanderwege.ch/de/wanderv...,Seilpark Interlaken,Interlaken,,,,hoch,,,"Zyklus 1 (Grundschule / 6 - 9 Jahre alt), Zykl...",Sommer,
4,1763920258-5,https://www.schweizer-wanderwege.ch/de/wanderv...,https://www.schweizer-wanderwege.ch/de/wanderv...,Besuch im Tipicamp Berner Oberland,Primarschule Aeschiried,,,,tief,,,"Zyklus 1 (Grundschule / 6 - 9 Jahre alt), Zykl...",Sommer,


### 2. Remove unnecessary scraper columns

In [16]:
# -----------------------------------------------------------
# Remove columns created automatically by the web-scraper
# These do not contain useful analytical information
# -----------------------------------------------------------
columns_to_drop = ["web_scraper_order", "web_scraper_start_url"]

df = df.drop(columns=columns_to_drop, errors="ignore")

# Rename the "detail_url" column to a simpler name
df = df.rename(columns={"detail_url": "url"})

df.head()


Unnamed: 0,url,title,location,difficulty_level,duration,distance,physical_demand,ascent,descent,target_group,season,recommended_period
0,https://www.schweizer-wanderwege.ch/de/wanderv...,Alpen-OL auf dem Niederhorn,Berghaus Niederhorn,,45min,2.8km,tief,10m,365m,"Zyklus 1 (Grundschule / 6 - 9 Jahre alt), Zykl...",Sommer,
1,https://www.schweizer-wanderwege.ch/de/wanderv...,Spielen am Bach und Bräteln im Suldtal,Spiez,,,2m,tief,,,"Zyklus 1 (Grundschule / 6 - 9 Jahre alt), Zykl...",Sommer,
2,https://www.schweizer-wanderwege.ch/de/wanderv...,Schatzsuche Trail,Aeschi b. Spiez,,,1m,hoch,,,"Zyklus 1 (Grundschule / 6 - 9 Jahre alt), Zykl...",Sommer,
3,https://www.schweizer-wanderwege.ch/de/wanderv...,Seilpark Interlaken,Interlaken,,,,hoch,,,"Zyklus 1 (Grundschule / 6 - 9 Jahre alt), Zykl...",Sommer,
4,https://www.schweizer-wanderwege.ch/de/wanderv...,Besuch im Tipicamp Berner Oberland,Primarschule Aeschiried,,,,tief,,,"Zyklus 1 (Grundschule / 6 - 9 Jahre alt), Zykl...",Sommer,


### 3. Remove duplicate routes using the unique URL

In [17]:
# -----------------------------------------------------------
# Remove duplicate hiking routes based on the URL.
# The URL uniquely identifies a hiking detail page.
# -----------------------------------------------------------
before = len(df)
df = df.drop_duplicates(subset=["url"])
after = len(df)

print("Rows before removing duplicates:", before)
print("Rows after:", after)


Rows before removing duplicates: 115
Rows after: 115


### 4. Convert distance to a numeric float (km)

In [21]:
# -----------------------------------------------------------
# Convert values like "11km" into the numeric value 11.0
# -----------------------------------------------------------

def parse_distance_km(x):
    """
    Convert strings such as '11km' or '11 km' to a float.
    If the value cannot be parsed, return NaN.
    """
    if pd.isna(x):
        return np.nan

    m = re.match(r"([\d.]+)\s*(km|m)$", str(x).lower())
    if not m:
        return np.nan

    value, unit = m.groups()
    return float(value) / 1000 if unit == "m" else float(value)

df["distance_km"] = df["distance"].apply(parse_distance_km)

df[["distance", "distance_km"]].head()


Unnamed: 0,distance,distance_km
0,2.8km,2.8
1,2m,0.002
2,1m,0.001
3,,
4,,


### 5. Convert duration into total minutes

In [5]:
# -----------------------------------------------------------
# Convert strings like "3h 15min" into total minutes.
# Example: "3h 15min" → 195 minutes
#          "2h" → 120
#          "45min" → 45
# -----------------------------------------------------------

def duration_to_minutes(x):
    if pd.isna(x):
        return np.nan
    
    text = str(x).lower().replace(" ", "")
    
    # Extract hours and minutes
    hours = 0
    minutes = 0

    h_match = re.search(r"(\d+)h", text)
    m_match = re.search(r"(\d+)min", text)

    if h_match:
        hours = int(h_match.group(1))
    if m_match:
        minutes = int(m_match.group(1))

    total = hours * 60 + minutes
    return total if total > 0 else np.nan

df["duration_min"] = df["duration"].apply(duration_to_minutes)

df[["duration", "duration_min"]].head()


Unnamed: 0,duration,duration_min
0,45min,45.0
1,,
2,,
3,,
4,,


### 6. Convert ascent & descent into integer meters

In [6]:
# -----------------------------------------------------------
# Convert values like "310m" into integer values such as 310
# -----------------------------------------------------------

def height_to_m(x):
    if pd.isna(x):
        return np.nan
    match = re.findall(r"\d+", str(x))
    return int(match[0]) if match else np.nan

df["ascent_m"]  = df["ascent"].apply(height_to_m)
df["descent_m"] = df["descent"].apply(height_to_m)

df[["ascent", "ascent_m", "descent", "descent_m"]].head()


Unnamed: 0,ascent,ascent_m,descent,descent_m
0,10m,10.0,365m,365.0
1,,,,
2,,,,
3,,,,
4,,,,


### 7. Split the location field into area, region, and canton

In [7]:
# -----------------------------------------------------------
# The 'location' field sometimes looks like:
# "Abländschen, Mittelberg — Saanen · BE"
#
# We split this into:
# - area        (left side of —)
# - region      (right side of —)
# - canton      (after the dot ·)
# -----------------------------------------------------------

def extract_canton(loc):
    if pd.isna(loc): return np.nan
    # Entferne Zeilenumbrüche und überflüssige Whitespaces
    loc_clean = ' '.join(str(loc).split())
    parts = loc_clean.split("·")
    return parts[-1].strip() if len(parts) > 1 else np.nan

def extract_region(loc):
    if pd.isna(loc): return np.nan
    loc_clean = ' '.join(str(loc).split())
    left = loc_clean.split("·")[0]
    if "—" in left:
        return left.split("—")[1].strip()
    return left.strip()

def extract_area(loc):
    if pd.isna(loc): return np.nan
    loc_clean = ' '.join(str(loc).split())
    left = loc_clean.split("·")[0]
    if "—" in left:
        return left.split("—")[0].strip()
    return left.strip()

df["canton"] = df["location"].apply(extract_canton)
df["region"] = df["location"].apply(extract_region)

df["area"]   = df["location"].apply(extract_area)

df[["location", "area", "region", "canton"]].head()

Unnamed: 0,location,area,region,canton
0,Berghaus Niederhorn,Berghaus Niederhorn,Berghaus Niederhorn,
1,Spiez,Spiez,Spiez,
2,Aeschi b. Spiez,Aeschi b. Spiez,Aeschi b. Spiez,
3,Interlaken,Interlaken,Interlaken,
4,Primarschule Aeschiried,Primarschule Aeschiried,Primarschule Aeschiried,


### 8. Encode difficulty & physical demand (categorical → numeric)

In [8]:
# -----------------------------------------------------------
# Convert categorical difficulty levels to numeric values
# Example: T1→1, T2→2, ..., T6→6
# WICHTIG: difficulty_level bleibt mit T-Präfix (z.B. "T2")
# -----------------------------------------------------------

difficulty_map = {
    "T1": 1, "T2": 2, "T3": 3,
    "T4": 4, "T5": 5, "T6": 6
}

# Normalisiere nur Whitespace, entferne NICHT das T
df["difficulty_level"] = df["difficulty_level"].astype(str).str.strip().str.replace(r'\s+', '', regex=True)
df["difficulty_num"] = df["difficulty_level"].map(difficulty_map)


df[["difficulty_level", "difficulty_num"]].head()

Unnamed: 0,difficulty_level,difficulty_num
0,,
1,,
2,,
3,,
4,,


In [9]:
# -----------------------------------------------------------
# Convert "leicht/mittel/hoch" to numeric 1/2/3
# -----------------------------------------------------------

physical_demand_map = {"leicht":1, "mittel":2, "hoch":3}

df["physical_demand"] = df["physical_demand"].astype(str).str.lower().str.strip()
df["physical_demand_num"] = df["physical_demand"].map(physical_demand_map)

df[["physical_demand", "physical_demand_num"]].head()


Unnamed: 0,physical_demand,physical_demand_num
0,tief,
1,tief,
2,hoch,3.0
3,hoch,3.0
4,tief,


### 9. Remove rows missing essential numeric data

In [10]:
# -----------------------------------------------------------
# For modelling, we need the core numerical variables:
# distance_km, duration_min, ascent_m, difficulty_num
# We drop rows where these values are missing.
# -----------------------------------------------------------

essential = ["distance_km", "duration_min", "ascent_m", "difficulty_num"]

before = len(df)
df_clean = df.dropna(subset=essential).copy()  # .copy() prevents SettingWithCopyWarning
after = len(df_clean)

print("Rows before dropping missing:", before)
print("Rows after:", after)


Rows before dropping missing: 115
Rows after: 61


### Open-Data-Enrichment with longitude & latitude

In [11]:
# -----------------------------------------------------------
# Geocode Swiss hiking locations using OpenStreetMap (Nominatim)
# Returns latitude and longitude for each unique area
# -----------------------------------------------------------

import requests
import time


# -----------------------------------------------------------
# Geocoding function (OpenStreetMap Nominatim)
# 
def geocode_osm(place):
    """
    Query OpenStreetMap Nominatim API to get latitude and longitude
    for a given place in Switzerland.
    Returns (lat, lon) or (None, None) if nothing is found.
    """
    if pd.isna(place):
        return None, None
    
    url = "https://nominatim.openstreetmap.org/search"
    params = {
        "q": f"{place}, Switzerland",
        "format": "json",
        "addressdetails": 1,
        "limit": 1
    }
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        response = requests.get(url, params=params, headers=headers)
        data = response.json()

        if len(data) == 0:
            return None, None
        
        lat = float(data[0]["lat"])
        lon = float(data[0]["lon"])
        return lat, lon

    except:
        return None, None


# -----------------------------------------------------------
# Apply geocoding (with pause to respect API rules)
# -----------------------------------------------------------

df_clean["latitude"] = None
df_clean["longitude"] = None

for idx, row in df_clean.iterrows():
    place = row["area"]

    lat, lon = geocode_osm(place)
    df_clean.at[idx, "latitude"] = lat
    df_clean.at[idx, "longitude"] = lon
    
    print(f"Geocoded: {place} → {lat}, {lon}")
    
    time.sleep(1)  #  avoid rate limiting by OSM


Geocoded: Schwenden i.D., Grimmialp • BE → None, None
Geocoded: Gschwandtenmaad • BE → None, None
Geocoded: Därstetten → None, None
Geocoded: Saxeten, Schulhaus → None, None
Geocoded: Kleine Scheidegg → None, None


KeyboardInterrupt: 

### 10. Save the cleaned dataset

In [53]:
# -----------------------------------------------------------
# Export the cleaned dataset so it can be used in the next steps
# (EDA, modelling, database storage, etc.)
# -----------------------------------------------------------

# Convert duration_min, ascent_m, and descent_m to integer for cleaner output
df_clean["duration_min"] = df_clean["duration_min"].astype(int)
df_clean["ascent_m"] = df_clean["ascent_m"].astype(int)
df_clean["descent_m"] = df_clean["descent_m"].astype(int)

# Bereinige location: entferne Zeilenumbrüche und überflüssige Whitespaces
df_clean["location_clean"] = df_clean["location"].apply(lambda x: ' '.join(str(x).split()) if pd.notna(x) else x)

# Extrahiere Canton aus der bereinigten Location (nach dem • Symbol)
def extract_canton_clean(loc):
    if pd.isna(loc): return np.nan
    if "•" in str(loc):
        return str(loc).split("•")[-1].strip()
    return np.nan

df_clean["canton_clean"] = df_clean["location_clean"].apply(extract_canton_clean)

# Entferne "• Canton" aus location, behalte nur die Ortsnamen
def clean_location_remove_canton(loc):
    if pd.isna(loc): return loc
    if "•" in str(loc):
        return str(loc).split("•")[0].strip()
    return str(loc).strip()

df_clean["location_clean"] = df_clean["location_clean"].apply(clean_location_remove_canton)

# Wähle nur die gewünschten Spalten aus
columns_to_export = [
    "url",
    "title", 
    "location_clean",
    "canton_clean",
    "difficulty_level",
    "duration_min",
    "distance_km",
    "ascent_m",
    "descent_m",
    "physical_demand",
    "latitude",
    "longitude"
]

df_export = df_clean[columns_to_export]

# Benenne Spalten für sauberes CSV
df_export = df_export.rename(columns={
    "location_clean": "location",
    "canton_clean": "canton"
})

output_file = "berner_oberland_clean_and_enriched.csv"
df_export.to_csv(output_file, index=False)

print("Clean data saved as:", output_file)
print(f"Exported {len(df_export)} rows with {len(columns_to_export)} columns")


Clean data saved as: berner_oberland_clean_and_enriched.csv
Exported 61 rows with 12 columns
