In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# --- Load ---
# Read the raw CSV into a DataFrame (path is relative to the working directory).
db = pd.read_csv('datasets/bike_munich/rad_tage.csv')

# --- Data Cleaning ---
# Keep only rows where the target ('gesamt') and every weather column are present.
required_cols = ['gesamt', 'min-temp', 'max-temp', 'niederschlag', 'sonnenstunden']
db = db.dropna(subset=required_cols)

# Parse 'datum' strings written as YYYY.MM.DD into datetimes.
db['datum'] = pd.to_datetime(db['datum'], format='%Y.%m.%d')

# --- Feature Engineering ---
# Calendar features derived from the date: day of week (Monday is 0) and month.
db = db.assign(
    wochentag=db['datum'].dt.dayofweek,
    monat=db['datum'].dt.month,
)

# One-hot encode the 'zaehlstelle' (station) column. With drop_first=True the
# first category gets no indicator column of its own.
db = pd.get_dummies(db, columns=['zaehlstelle'], drop_first=True)

# --- Define Features and Target ---
# Weather and calendar features, followed by every one-hot station indicator.
base_features = ['min-temp', 'max-temp', 'niederschlag', 'sonnenstunden', 'wochentag', 'monat']
station_features = [name for name in db.columns if name.startswith('zaehlstelle_')]
feature_cols = base_features + station_features

X = db[feature_cols]
y = db['gesamt']

# --- Train-Test Split ---
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Train RandomForestRegressor ---
# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# --- Predictions ---
# Predict the target for the held-out rows.
y_pred = model.predict(X_test)

# --- Evaluate the Model ---
# Compare held-out targets with the predictions.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# --- Predict for Tomorrow ---
# Build a single-row feature frame for one day and one station, then predict.

# One-hot station indicator columns the model was trained on.
station_cols = [col for col in X.columns if col.startswith('zaehlstelle_')]

# Station to predict for, given as its one-hot column name.
# Because the stations were encoded with drop_first=True, the first station has
# no column of its own: it is represented by ALL indicators being 0. Set
# target_station = None to predict for that baseline station.
target_station = 'zaehlstelle_Hirsch'

# Fail early with a clear message instead of silently appending an unknown
# column, which would only surface later as a feature-mismatch error in predict().
if target_station is not None and target_station not in station_cols:
    raise ValueError(
        f"Unknown station column {target_station!r}; expected one of {station_cols}"
    )

# Example feature values for tomorrow
tomorrow = pd.DataFrame([{
    'min-temp': 8.5,
    'max-temp': 17.2,
    'niederschlag': 0.1,
    'sonnenstunden': 6.0,
    'wochentag': 2,  # Wednesday
    'monat': 4,
    # 1 for the selected station, 0 for every other station indicator
    **{col: int(col == target_station) for col in station_cols},
}])

# Guarantee the exact column set and order used for training.
tomorrow = tomorrow[X.columns]

# Make prediction
prediction = model.predict(tomorrow)
print(f"Tomorrow's Predicted Bike Traffic: {prediction[0]}")


Mean Squared Error: 502019.0050725123
R-squared: 0.8508173929701432
Tomorrow's Predicted Bike Traffic: 1809.49
