### 1. Setup & load the CSV


In [15]:
# -----------------------------------------------------------
# 1) Import necessary libraries
# -----------------------------------------------------------
import pandas as pd
import numpy as np
import re
import sys
!{sys.executable} -m pip install sqlalchemy pymysql cryptography

# Show every column and use a wider layout when rendering DataFrames
for option_name, option_value in (("display.max_columns", None), ("display.width", 150)):
    pd.set_option(option_name, option_value)

# -----------------------------------------------------------
# 2) Load the dataset (adjust filename if needed)
# -----------------------------------------------------------
# The CSV is read from the current working directory.
df = pd.read_csv("schweizer_wanderwege_alle.csv")

# First quick look at the raw rows
df.head()

You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m


You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m


Unnamed: 0,web_scraper_order,web_scraper_start_url,detail_url,title,location,difficulty_level,duration,distance,physical_demand,ascent,descent,target_group,season,recommended_period
0,1766396370-1,https://www.schweizer-wanderwege.ch/de/wanderv...,https://www.schweizer-wanderwege.ch/de/wanderv...,Husky-Traum,Muotathal,,,4m,hoch,,,"Zyklus 1 (Grundschule / 6 - 9 Jahre alt), Zykl...",ganzj√§hrig,
1,1766396372-2,https://www.schweizer-wanderwege.ch/de/wanderv...,https://www.schweizer-wanderwege.ch/de/wanderv...,Aufregende Ausblicke auf dem Hochalp Trail,"Urn√§sch, Anker\n ...",,6h 45min,11.4km,hoch,720m,720m,,Winter,Dezember - M√§rz
2,1766396375-3,https://www.schweizer-wanderwege.ch/de/wanderv...,https://www.schweizer-wanderwege.ch/de/wanderv...,Den ganzen Tag im Schnee,Atzm√§nnig SSA\n ...,,1h 40min,4.2km,mittel,140m,510m,f√ºr Familien,Winter,Dezember - M√§rz
3,1766396377-4,https://www.schweizer-wanderwege.ch/de/wanderv...,https://www.schweizer-wanderwege.ch/de/wanderv...,Panorama Trail in Obwalden,Langis\n ...,,4h 30min,12.3km,hoch,565m,565m,,Winter,Dezember - M√§rz
4,1766396379-5,https://www.schweizer-wanderwege.ch/de/wanderv...,https://www.schweizer-wanderwege.ch/de/wanderv...,Schneeschuhwanderung auf den Chasseron,"Les Rasses, village\n ...",,4h 20min,10km,hoch,450m,450m,,Winter,Dezember - M√§rz


### 2. Remove unnecessary scraper columns

In [16]:
# -----------------------------------------------------------
# Remove columns created automatically by the web-scraper
# These do not contain useful analytical information
# -----------------------------------------------------------
# Bookkeeping columns added by the web-scraper
columns_to_drop = ["web_scraper_order", "web_scraper_start_url"]

# Drop them (ignoring any that are already absent) and shorten
# "detail_url" to "url" in one chained expression.
df = (
    df.drop(columns=columns_to_drop, errors="ignore")
      .rename(columns={"detail_url": "url"})
)

df.head()


Unnamed: 0,url,title,location,difficulty_level,duration,distance,physical_demand,ascent,descent,target_group,season,recommended_period
0,https://www.schweizer-wanderwege.ch/de/wanderv...,Husky-Traum,Muotathal,,,4m,hoch,,,"Zyklus 1 (Grundschule / 6 - 9 Jahre alt), Zykl...",ganzj√§hrig,
1,https://www.schweizer-wanderwege.ch/de/wanderv...,Aufregende Ausblicke auf dem Hochalp Trail,"Urn√§sch, Anker\n ...",,6h 45min,11.4km,hoch,720m,720m,,Winter,Dezember - M√§rz
2,https://www.schweizer-wanderwege.ch/de/wanderv...,Den ganzen Tag im Schnee,Atzm√§nnig SSA\n ...,,1h 40min,4.2km,mittel,140m,510m,f√ºr Familien,Winter,Dezember - M√§rz
3,https://www.schweizer-wanderwege.ch/de/wanderv...,Panorama Trail in Obwalden,Langis\n ...,,4h 30min,12.3km,hoch,565m,565m,,Winter,Dezember - M√§rz
4,https://www.schweizer-wanderwege.ch/de/wanderv...,Schneeschuhwanderung auf den Chasseron,"Les Rasses, village\n ...",,4h 20min,10km,hoch,450m,450m,,Winter,Dezember - M√§rz


### 3. Remove duplicate routes using the unique URL

In [17]:
# -----------------------------------------------------------
# Remove duplicate hiking routes based on the URL.
# The URL uniquely identifies a hiking detail page.
# -----------------------------------------------------------
# Row count before de-duplication
before = df.shape[0]

# Keep the first occurrence of every URL
df = df.drop_duplicates(subset=["url"], keep="first")

# Row count after de-duplication
after = df.shape[0]

print("Rows before removing duplicates:", before)
print("Rows after:", after)


Rows before removing duplicates: 375
Rows after: 375


### 4. Convert distance to a numeric float (km)

In [18]:
# -----------------------------------------------------------
# Convert values like "11km" into the numeric value 11.0
# -----------------------------------------------------------

def parse_distance_km(x):
    """
    Convert a distance string to kilometres as a float.

    Accepted forms are a number followed by 'km' or 'm', with optional
    whitespace around the value and before the unit, e.g. '11km',
    '11 km', ' 11,4 km ' or '400m'. A decimal comma is treated like a
    decimal point. Values given in metres are divided by 1000.

    Returns NaN for missing values and for anything that does not match
    this pattern (including malformed numbers such as '1.2.3km').
    """
    if pd.isna(x):
        return np.nan

    # Trim, lower-case, and normalise a decimal comma to a point
    text = str(x).strip().lower().replace(",", ".")

    # Exactly one well-formed number; a loose character class like [\d.]+
    # would let '1.2.3' through and make float() raise.
    m = re.fullmatch(r"(\d+(?:\.\d*)?|\.\d+)\s*(km|m)", text)
    if not m:
        return np.nan

    value, unit = m.groups()
    return float(value) / 1000 if unit == "m" else float(value)

# Element-wise conversion of the raw distance strings into kilometres
df["distance_km"] = df["distance"].map(parse_distance_km)

# Compare raw and parsed values side by side
df.loc[:, ["distance", "distance_km"]].head()


Unnamed: 0,distance,distance_km
0,4m,0.004
1,11.4km,11.4
2,4.2km,4.2
3,12.3km,12.3
4,10km,10.0


### 5. Convert duration into total minutes

In [19]:
# -----------------------------------------------------------
# Convert strings like "3h 15min" into total minutes.
# Example: "3h 15min" → 195 minutes
#          "2h" → 120
#          "45min" → 45
# -----------------------------------------------------------

def duration_to_minutes(x):
    """
    Turn a duration string such as '3h 15min', '2h' or '45min' into
    total minutes.

    Returns NaN for missing input and whenever the parsed total is zero
    (i.e. neither an hour nor a minute component was found, or both are 0).
    """
    if pd.isna(x):
        return np.nan

    # Lower-case and remove spaces so '3h 15min' becomes '3h15min'
    compact = "".join(str(x).lower().split(" "))

    def _amount(pattern):
        # Integer captured by the pattern, or 0 when it does not occur
        found = re.search(pattern, compact)
        return int(found.group(1)) if found else 0

    total = 60 * _amount(r"(\d+)h") + _amount(r"(\d+)min")
    if total > 0:
        return total
    return np.nan

# Element-wise conversion of the raw duration strings into minutes
df["duration_min"] = df["duration"].map(duration_to_minutes)

# Compare raw and parsed values side by side
df.loc[:, ["duration", "duration_min"]].head()


Unnamed: 0,duration,duration_min
0,,
1,6h 45min,405.0
2,1h 40min,100.0
3,4h 30min,270.0
4,4h 20min,260.0


### 6. Convert ascent & descent into integer meters

In [20]:
# -----------------------------------------------------------
# Convert values like "310m" into integer values such as 310
# -----------------------------------------------------------

def height_to_m(x):
    """
    Extract the height in metres from a string such as '310m'.

    The first run of digits in the text is returned as an int. An
    apostrophe (ASCII ' or U+2019) standing between two digits is treated
    as a thousands separator and removed first, so "1'200m" yields 1200
    rather than 1.

    Returns NaN for missing input or when the text contains no digits.
    """
    if pd.isna(x):
        return np.nan

    # Drop apostrophe thousands separators that sit between two digits
    text = re.sub(r"(?<=\d)['\u2019](?=\d)", "", str(x))

    match = re.search(r"\d+", text)
    return int(match.group()) if match else np.nan

# Parse both elevation columns into "<name>_m" columns
for source_col in ("ascent", "descent"):
    df[f"{source_col}_m"] = df[source_col].apply(height_to_m)

# Compare raw and parsed values side by side
df.loc[:, ["ascent", "ascent_m", "descent", "descent_m"]].head()


Unnamed: 0,ascent,ascent_m,descent,descent_m
0,,,,
1,720m,720.0,720m,720.0
2,140m,140.0,510m,510.0
3,565m,565.0,565m,565.0
4,450m,450.0,450m,450.0


### 7. Split the location field into area, region, and canton

In [21]:
# -----------------------------------------------------------
# The 'location' field sometimes looks like:
# "Abl√§ndschen, Mittelberg ‚Äî Saanen ‚Ä¢ BE"
#
# We split this into:
# - area        (left side of ‚Äî)
# - region      (right side of ‚Äî)
# - canton      (after the bullet ‚Ä¢)
# -----------------------------------------------------------

def extract_canton(loc):
    """
    Return the text after the last bullet separator in a location string.

    Line breaks and repeated whitespace are collapsed first. Returns NaN
    for missing input or when the string contains no bullet separator.
    """
    if pd.isna(loc):
        return np.nan

    # Collapse line breaks and runs of whitespace into single spaces
    normalized = " ".join(str(loc).split())

    _, bullet, tail = normalized.rpartition("‚Ä¢")
    if not bullet:
        return np.nan
    return tail.strip()

def extract_region(loc):
    """
    Return the region part of a location string.

    Only the text before the first bullet separator is considered. If it
    contains a dash separator, the piece directly after the first dash is
    returned; otherwise the whole left-hand text is returned. Returns NaN
    for missing input.
    """
    if pd.isna(loc):
        return np.nan

    # Collapse whitespace, then keep only what precedes the canton bullet
    before_bullet = " ".join(str(loc).split()).split("‚Ä¢")[0]

    pieces = before_bullet.split("‚Äî")
    picked = pieces[1] if len(pieces) > 1 else before_bullet
    return picked.strip()

def extract_area(loc):
    """
    Return the area part of a location string.

    Only the text before the first bullet separator is considered; of
    that, everything up to the first dash separator (or all of it when
    there is no dash) is returned, stripped. Returns NaN for missing input.
    """
    if pd.isna(loc):
        return np.nan

    # Collapse whitespace, then keep only what precedes the canton bullet
    before_bullet = " ".join(str(loc).split()).split("‚Ä¢")[0]

    # partition() yields the whole string when the dash is absent
    return before_bullet.partition("‚Äî")[0].strip()

# Target column -> parser; insertion order matches the original column order
location_parsers = {
    "canton": extract_canton,
    "region": extract_region,
    "area": extract_area,
}

for column_name, parser in location_parsers.items():
    df[column_name] = df["location"].apply(parser)

# Compare the raw location with its three derived parts
df.loc[:, ["location", "area", "region", "canton"]].head()

Unnamed: 0,location,area,region,canton
0,Muotathal,Muotathal,Muotathal,
1,"Urn√§sch, Anker\n ...","Urn√§sch, Anker","Urn√§sch, Anker",AR
2,Atzm√§nnig SSA\n ...,Atzm√§nnig SSA,"Atzm√§nnig, Schutt",SG
3,Langis\n ...,Langis,Langis,OW
4,"Les Rasses, village\n ...","Les Rasses, village","Les Rasses, village",VD


### 8. Encode difficulty & physical demand (categorical → numeric)

In [22]:
# -----------------------------------------------------------
# Convert categorical difficulty levels to numeric values
# Example: T1→1, T2→2, ..., T6→6
# IMPORTANT: difficulty_level keeps its T prefix (e.g. "T2")
# -----------------------------------------------------------

# Hiking-scale label -> numeric level
difficulty_map = {
    "T1": 1, "T2": 2, "T3": 3,
    "T4": 4, "T5": 5, "T6": 6
}

# Normalise whitespace and letter case only; the T prefix is kept.
# astype(str) would turn missing values into the literal string "nan",
# so the original missing entries are restored with where().
raw_difficulty = df["difficulty_level"]
df["difficulty_level"] = (
    raw_difficulty.astype(str)
    .str.replace(r'\s+', '', regex=True)
    .str.upper()
    .where(raw_difficulty.notna())
)

# Labels that are not in the map (and missing values) become NaN
df["difficulty_num"] = df["difficulty_level"].map(difficulty_map)


df[["difficulty_level", "difficulty_num"]].head()

Unnamed: 0,difficulty_level,difficulty_num
0,,
1,,
2,,
3,,
4,,


In [23]:
# -----------------------------------------------------------
# Convert "leicht/mittel/hoch" to numeric 1/2/3
# -----------------------------------------------------------

# German demand label -> numeric level
physical_demand_map = {"leicht":1, "mittel":2, "hoch":3}

# Normalise case and surrounding whitespace. astype(str) would turn
# missing values into the literal string "nan" (which would later be
# exported as text), so the original missing entries are restored.
raw_demand = df["physical_demand"]
df["physical_demand"] = (
    raw_demand.astype(str)
    .str.lower()
    .str.strip()
    .where(raw_demand.notna())
)

# Labels that are not in the map (and missing values) become NaN
df["physical_demand_num"] = df["physical_demand"].map(physical_demand_map)

df[["physical_demand", "physical_demand_num"]].head()


Unnamed: 0,physical_demand,physical_demand_num
0,hoch,3.0
1,hoch,3.0
2,mittel,2.0
3,hoch,3.0
4,hoch,3.0


### 9. Remove rows missing essential numeric data

In [24]:
# -----------------------------------------------------------
# For modelling, we need the core numerical variables:
# distance_km, duration_min, ascent_m, difficulty_num
# We drop rows where these values are missing.
# -----------------------------------------------------------

# Columns that must be present for a row to be kept
essential = ["distance_km", "duration_min", "ascent_m", "difficulty_num"]

# Drop rows with a missing value in any essential column; the explicit
# copy makes df_clean an independent frame (avoids SettingWithCopyWarning).
df_clean = df.dropna(subset=essential, how="any").copy()

before, after = df.shape[0], df_clean.shape[0]

print("Rows before dropping missing:", before)
print("Rows after:", after)


Rows before dropping missing: 375
Rows after: 101


### Open-Data-Enrichment with longitude & latitude

In [None]:
# -----------------------------------------------------------
# Geocode Swiss hiking locations using OpenStreetMap (Nominatim)
# Returns latitude and longitude for each unique area
# -----------------------------------------------------------

import requests
import time


# -----------------------------------------------------------
# Geocoding function (OpenStreetMap Nominatim)
# -----------------------------------------------------------
def geocode_osm(place, timeout=10):
    """
    Query the OpenStreetMap Nominatim search API for a place in Switzerland.

    Parameters
    ----------
    place : str
        Place name; ", Switzerland" is appended to form the query.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 10).

    Returns
    -------
    (lat, lon) as floats, or (None, None) when the place is missing or
    blank, nothing is found, or the request / response parsing fails.
    Failures are printed instead of being silently swallowed.
    """
    # Missing or blank names would otherwise be sent as ", Switzerland"
    if pd.isna(place) or not str(place).strip():
        return None, None

    url = "https://nominatim.openstreetmap.org/search"
    params = {
        "q": f"{place}, Switzerland",
        "format": "json",
        "addressdetails": 1,
        "limit": 1
    }
    # Nominatim's usage policy asks for a User-Agent that identifies the
    # application; generic browser strings may be rejected.
    headers = {"User-Agent": "schweizer-wanderwege-notebook/1.0 (data cleaning project)"}

    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        # Treat HTTP errors (e.g. 403/429) as failures rather than parsing
        # an error page as JSON
        response.raise_for_status()
        data = response.json()

        if not data:
            return None, None

        return float(data[0]["lat"]), float(data[0]["lon"])

    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as exc:
        # Best effort: report the problem and carry on with the next place
        print(f"Geocoding failed for {place!r}: {exc}")
        return None, None


# -----------------------------------------------------------
# Apply geocoding (with pause to respect API rules)
# -----------------------------------------------------------

# Geocode every distinct area only once: repeated areas would otherwise
# cost an extra request and an extra one-second pause each.
unique_areas = df_clean["area"].dropna().unique()

lat_by_area = {}
lon_by_area = {}

for place in unique_areas:
    lat, lon = geocode_osm(place)
    lat_by_area[place] = lat
    lon_by_area[place] = lon

    print(f"Geocoded: {place} ‚Üí {lat}, {lon}")

    time.sleep(1)  #  avoid rate limiting by OSM

# Map the results back onto the rows; areas without a result (None)
# become NaN so both columns are numeric rather than object dtype.
df_clean["latitude"] = df_clean["area"].map(lat_by_area).astype(float)
df_clean["longitude"] = df_clean["area"].map(lon_by_area).astype(float)


üîç Found 99 unique areas to geocode (instead of 101 rows)
[1/99] Geocoded: Veltheim AG, B√§ren ‚Üí None, None
[2/99] Geocoded: Bibern SH, Dorf ‚Üí None, None
[2/99] Geocoded: Bibern SH, Dorf ‚Üí None, None
[3/99] Geocoded: Zeihen, Stauftel ‚Üí None, None
[3/99] Geocoded: Zeihen, Stauftel ‚Üí None, None
[4/99] Geocoded: Messen, Dorfplatz ‚Üí None, None
[4/99] Geocoded: Messen, Dorfplatz ‚Üí None, None
[5/99] Geocoded: Lamone-Cadempino ‚Üí None, None
[5/99] Geocoded: Lamone-Cadempino ‚Üí None, None
[6/99] Geocoded: Gruy√®res ‚Üí None, None
[6/99] Geocoded: Gruy√®res ‚Üí None, None
[7/99] Geocoded: Mi√©court ‚Üí None, None
[7/99] Geocoded: Mi√©court ‚Üí None, None
[8/99] Geocoded: Huttwil ‚Üí None, None
[8/99] Geocoded: Huttwil ‚Üí None, None
[9/99] Geocoded: Malans GR ‚Üí None, None
[9/99] Geocoded: Malans GR ‚Üí None, None
[10/99] Geocoded: Gresso ‚Üí None, None
[10/99] Geocoded: Gresso ‚Üí None, None
[11/99] Geocoded: Weinfelden ‚Üí None, None
[11/99] Geocoded: Weinfelden ‚Üí None, N

### 10. Save the cleaned dataset

In [None]:
# -----------------------------------------------------------
# Export the cleaned dataset so it can be used in the next steps
# (EDA, modelling, database storage, etc.)
# -----------------------------------------------------------

# Convert duration_min and ascent_m to plain integers for cleaner output.
# Both are in `essential`, so df_clean has no missing values in them.
df_clean["duration_min"] = df_clean["duration_min"].astype(int)
df_clean["ascent_m"] = df_clean["ascent_m"].astype(int)

# descent_m is NOT in `essential` and may still contain NaN, which a plain
# int cast cannot represent (it raises). The nullable Int64 dtype keeps
# missing values as <NA> instead.
df_clean["descent_m"] = df_clean["descent_m"].astype("Int64")

# Clean location: collapse line breaks and redundant whitespace
df_clean["location_clean"] = df_clean["location"].apply(lambda x: ' '.join(str(x).split()) if pd.notna(x) else x)

# Extract the canton from the cleaned location (the text after the bullet)
def extract_canton_clean(loc):
    """
    Return the stripped text after the last bullet separator in `loc`.

    Returns NaN for missing input or when there is no bullet separator.
    """
    if pd.isna(loc):
        return np.nan

    _, bullet, tail = str(loc).rpartition("‚Ä¢")
    return tail.strip() if bullet else np.nan

# Derive the canton column from the whitespace-normalised location strings
df_clean["canton_clean"] = df_clean["location_clean"].apply(extract_canton_clean)

# Remove the bullet and canton from the location, keeping only the place names
def clean_location_remove_canton(loc):
    """
    Return the stripped text before the first bullet separator in `loc`.

    Missing input is returned unchanged; a string without a bullet
    separator is returned stripped.
    """
    if pd.isna(loc):
        return loc

    # partition() yields the whole string when the bullet is absent
    return str(loc).partition("‚Ä¢")[0].strip()

# Strip the canton suffix so location_clean holds only the place names
df_clean["location_clean"] = df_clean["location_clean"].apply(clean_location_remove_canton)

# Columns that go into the export, in output order
columns_to_export = [
    "url",
    "title",
    "location_clean",
    "canton_clean",
    "difficulty_level",
    "duration_min",
    "distance_km",
    "ascent_m",
    "descent_m",
    "physical_demand",
    "latitude",
    "longitude",
]

# Select those columns and give the cleaned fields their final names
df_export = df_clean[columns_to_export].rename(
    columns={
        "location_clean": "location",
        "canton_clean": "canton",
    }
)

print(f"‚úÖ Prepared {len(df_export)} rows with {len(columns_to_export)} columns for MySQL export")


‚úÖ Prepared 101 rows with 12 columns for MySQL export


### 11. Store data in MySQL database

In [None]:
# -----------------------------------------------------------
# MySQL connection setup
# Make sure Docker container is running: docker compose up -d
# -----------------------------------------------------------
import os
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Database connection settings.
# Each value can be overridden through an environment variable of the same
# name so real credentials never have to be written into the notebook; the
# fallbacks are the values previously hard-coded here.
DB_USER = os.environ.get("DB_USER", "root")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "password")
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_PORT = os.environ.get("DB_PORT", "3306")
DB_NAME = os.environ.get("DB_NAME", "wanderwege_db")

# Create database engine.
# User and password are URL-escaped so characters such as '@', ':' or '/'
# in a password do not break the connection URL.
connection_string = (
    f"mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)
engine = create_engine(connection_string)

# Test connection: open and immediately close one connection
try:
    with engine.connect():
        print("‚úÖ Successfully connected to MySQL database!")
        print(f"üìä Database: {DB_NAME}")
except Exception as e:
    print(f"‚ùå Connection failed: {e}")
    

‚úÖ Successfully connected to MySQL database!
üìä Database: wanderwege_db


In [None]:
# -----------------------------------------------------------
# Write cleaned and enriched data to MySQL
# -----------------------------------------------------------

try:
    # Options for DataFrame.to_sql, collected in one place
    to_sql_options = dict(
        name='wanderwege',      # table name in MySQL
        con=engine,             # SQLAlchemy engine
        if_exists='replace',    # drop an existing table and recreate it
        index=False,            # do not write the DataFrame index
        chunksize=1000,         # write in batches of 1000 rows
        method='multi',         # multi-row INSERT statements
    )
    df_export.to_sql(**to_sql_options)

    print(f"‚úÖ Successfully stored {len(df_export)} wanderwege in MySQL!")

    # Read the row count back to confirm the write
    row_count_query = "SELECT COUNT(*) as count FROM wanderwege"
    result = pd.read_sql(row_count_query, con=engine)
    print(f"\n‚úì Verification: {result['count'][0]} rows in database")

except Exception as e:
    # Any failure in the write or the verification is reported, not raised
    print(f"‚ùå Error writing to database: {e}")
    

‚úÖ Successfully stored 101 wanderwege in MySQL!

‚úì Verification: 101 rows in database
