In [None]:
import numpy as np
import pandas as pd

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read sample_submission_2025.csv; its "percentage_docks_available" column is
# used later in the notebook as y_test.
SAMPLE_SUBMISSION_CSV = '/kaggle/input/2025-bike-availability-prediction-summer-edition/sample_submission_2025.csv'
sample_target_df = pd.read_csv(SAMPLE_SUBMISSION_CSV)
sample_target_df.head()

In [None]:
# Read the submission metadata and rename the hyphenated context columns
# ("ctx-4" ... "ctx-1") to the underscore names used by the historical data.
sample_features_df = (
    pd.read_csv('/kaggle/input/2025-bike-availability-prediction-summer-edition/metadata_sample_submission_2025.csv')
    .rename(
        columns={
            "ctx-4": "ctx_4",
            "ctx-3": "ctx_3",
            "ctx-2": "ctx_2",
            "ctx-1": "ctx_1",
        }
    )
)
print(sample_features_df.shape)
sample_features_df.head()

In [None]:
HISTORICAL_METRICS_CSV = '/kaggle/input/historical-metrics-2024-2024-csv/historical_metrics_FINAL_2023-2024.csv'
historical_df = pd.read_csv(HISTORICAL_METRICS_CSV)
print(historical_df.shape)

# Remove outliers: keep only rows whose four context values and the target
# all lie inside the closed interval [0, 1].
in_unit_interval = "ctx_4 >= 0 and ctx_4 <= 1 and ctx_3 >= 0 and ctx_3 <= 1 and ctx_2 >= 0 and ctx_2 <= 1 and ctx_1 >= 0 and ctx_1 <= 1 and percentage_docks_available >= 0 and percentage_docks_available <= 1"
historical_df = historical_df.query(in_unit_interval)
print("historical_df", historical_df.shape)

historical_df.head()

# Debugging aid: uncomment to iterate on a small sample.
# historical_df = historical_df.head(100)

In [None]:
# Per-station attributes; enrich_input() joins this frame on "station_id".
STATIONS_PARQUET = '/kaggle/input/stations-df/stations_df.parquet'
stations_df = pd.read_parquet(STATIONS_PARQUET)
print("stations_df", stations_df.shape)
stations_df.head()

In [None]:
def get_holidays_df() -> pd.DataFrame:
    """Build a lookup table of the hard-coded holiday dates (2020 through 2025).

    Returns:
        A frame with one row per holiday date: integer ``year``, ``month`` and
        ``day`` columns parsed from the ISO date strings below, plus an
        ``is_holiday`` column that is ``True`` on every row.
    """
    holidays = [
        # 2020
        '2020-01-01', '2020-01-06', '2020-04-10', '2020-04-13',
        '2020-05-01', '2020-06-01', '2020-06-24', '2020-08-15',
        '2020-09-11', '2020-09-24', '2020-10-12', '2020-11-01',
        '2020-12-06', '2020-12-08', '2020-12-25', '2020-12-26',
        # 2021
        '2021-01-01', '2021-01-06', '2021-04-02', '2021-04-05',
        '2021-05-01', '2021-05-24', '2021-06-24', '2021-08-15',
        '2021-09-11', '2021-09-24', '2021-10-12', '2021-11-01',
        '2021-12-06', '2021-12-08', '2021-12-25', '2021-12-26',
        # 2022
        '2022-01-01', '2022-01-06', '2022-04-15', '2022-04-18',
        '2022-05-01', '2022-06-06', '2022-06-24', '2022-08-15',
        '2022-09-11', '2022-09-24', '2022-10-12', '2022-11-01',
        '2022-12-06', '2022-12-08', '2022-12-25', '2022-12-26',
        # 2023
        '2023-01-01', '2023-01-06', '2023-04-07', '2023-04-10',
        '2023-05-01', '2023-06-05', '2023-06-24', '2023-08-15',
        '2023-09-11', '2023-09-24', '2023-10-12', '2023-11-01',
        '2023-12-06', '2023-12-08', '2023-12-25', '2023-12-26',
        # 2024
        '2024-01-01', '2024-01-06', '2024-03-29', '2024-04-01',
        '2024-05-01', '2024-05-20', '2024-06-24', '2024-08-15',
        '2024-09-11', '2024-09-24', '2024-10-12', '2024-11-01',
        '2024-12-06', '2024-12-08', '2024-12-25', '2024-12-26',
        # 2025
        '2025-01-01', '2025-01-06', '2025-04-18', '2025-04-21',
        '2025-05-01', '2025-06-09', '2025-06-24', '2025-08-15',
        '2025-09-11', '2025-09-24', '2025-10-12', '2025-11-01',
        '2025-12-06', '2025-12-08', '2025-12-25', '2025-12-26',
    ]
    # Each "YYYY-MM-DD" string becomes one (year, month, day) integer triple.
    date_parts = [
        tuple(int(part) for part in iso_date.split('-'))
        for iso_date in holidays
    ]
    holidays_df = pd.DataFrame(date_parts, columns=["year", "month", "day"])
    holidays_df["is_holiday"] = True
    return holidays_df

In [None]:
import calendar
from datetime import datetime, timezone
from zoneinfo import ZoneInfo

def enrich_input(input_df: pd.DataFrame) -> pd.DataFrame:
    """Add calendar, holiday and station features to a frame of hourly rows.

    The ``year``/``month``/``day``/``hour`` columns are read as a UTC instant
    and replaced by the corresponding Europe/Madrid local-time values. When
    the frame has no ``year`` column, 2025 is used for every row.

    Args:
        input_df: Frame with ``month``, ``day``, ``hour`` and ``station_id``
            columns and, optionally, ``year``. The frame is not modified.

    Returns:
        A new frame in which ``year``, ``month``, ``day`` and ``hour`` hold
        local-time values, with these columns added: ``timetuple`` (local
        ``time.struct_time``), ``day_of_week`` (Monday = 0),
        ``day_of_year_norm`` (day of year divided by the number of days in
        that year), ``is_holiday``, ``is_day_off`` (1 on Saturdays, Sundays
        and holidays, else 0), plus the columns of the module-level
        ``stations_df`` joined on ``station_id``.

    Raises:
        RuntimeError: if ``input_df`` already has a ``timetuple`` column,
            i.e. it has already been through this function.
    """
    # Guard first, independently of whether "year" is present.
    if "timetuple" in input_df.columns:
        raise RuntimeError("Reset the input df as it's already been enriched.")

    # Work on a copy so the caller's frame is never left half-enriched and no
    # assignment is made into a slice produced by an earlier .query().
    enriched_df = input_df.copy()
    if "year" not in enriched_df.columns:
        enriched_df["year"] = 2025

    # Assemble the UTC instants in one vectorised call and shift them to
    # Madrid local time (replaces a per-row datetime construction).
    utc_parts = enriched_df[["year", "month", "day", "hour"]].astype("int64")
    local_ts = (
        pd.to_datetime(utc_parts)
        .dt.tz_localize("UTC")
        .dt.tz_convert("Europe/Madrid")
    )

    enriched_df = enriched_df.drop(columns=["year", "month", "day", "hour"])
    # "timetuple" is kept for backward compatibility: it is part of the
    # returned schema and doubles as the "already enriched" marker above.
    enriched_df["timetuple"] = local_ts.map(lambda ts: ts.timetuple())
    enriched_df["day_of_week"] = local_ts.dt.dayofweek.astype("int64")
    days_in_year = 365 + local_ts.dt.is_leap_year.astype("int64")
    enriched_df["day_of_year_norm"] = local_ts.dt.dayofyear / days_in_year
    enriched_df["year"] = local_ts.dt.year.astype("int64")
    enriched_df["month"] = local_ts.dt.month.astype("int64")
    enriched_df["day"] = local_ts.dt.day.astype("int64")
    enriched_df["hour"] = local_ts.dt.hour.astype("int64")

    # Add is_day_off feature for holidays and weekends.
    # validate= ensures the holiday table cannot duplicate input rows.
    enriched_df = pd.merge(
        enriched_df,
        get_holidays_df(),
        how="left",
        on=["year", "month", "day"],
        validate="many_to_one",
    )
    # Unmatched rows come back as NaN; eq(True) yields a proper bool column
    # without relying on fillna's deprecated object downcasting.
    enriched_df["is_holiday"] = enriched_df["is_holiday"].eq(True)
    enriched_df["is_day_off"] = (
        enriched_df["day_of_week"].isin([5, 6]) | enriched_df["is_holiday"]
    ).astype(int)

    # Add stations features.
    # NOTE(review): a duplicated station_id in stations_df would duplicate
    # rows here — confirm the station table has one row per station.
    enriched_df = pd.merge(enriched_df, stations_df, how="left", on=["station_id"])

    return enriched_df

# Replace historical_df with its enriched version. Re-running this cell on the
# already-enriched frame raises RuntimeError (enrich_input rejects frames that
# already carry a "timetuple" column), so reload historical_df first.
historical_df = enrich_input(historical_df)

historical_df.head()

In [None]:
# Spot-check the enriched rows for one timestamp (2024-11-01, hour 1).
# '2024-11-01' is in the holiday list, so is_holiday should be set on these rows.
historical_df.query("year == 2024 and month == 11 and day == 1 and hour == 1").head()

In [None]:
# Apply the same enrichment to the submission features; enrich_input fills in
# year 2025 when the frame has no "year" column.
sample_features_df = enrich_input(sample_features_df)
print("Enriched sample features:", sample_features_df.shape)
sample_features_df.head()

In [None]:
# sample_features_df[["station_id"]].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Model inputs: two station attributes, the day-off flag and the four context values.
feature_columns = ['angle_center', 'dist_center_norm_minimax', 'is_day_off', 'ctx_4', 'ctx_3', 'ctx_2', 'ctx_1']

X = historical_df[feature_columns]
y = historical_df["percentage_docks_available"]

# Split in train and test sets.
# test_size=0.8 means only 20% of the historical rows are used for fitting.
# The 80% hold-out returned here is discarded: X_test / y_test are overwritten
# just below with the competition sample files.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

X_test = sample_features_df[feature_columns]
y_test = sample_target_df["percentage_docks_available"]

# The metrics computed later pair X_test and y_test row by row, so their
# lengths must agree; fail here rather than after the model has been fitted.
assert len(X_test) == len(y_test), (
    f"X_test has {len(X_test)} rows but y_test has {len(y_test)}"
)

print(f"Training set: {X_train.shape[0]} samples")

# Fixed: this line previously reported y_test's size under the training label.
print(f"Training target set: {y_train.shape[0]} samples")

print(f"Test set: {X_test.shape[0]} samples")

print(f"Test target set: {y_test.shape[0]} samples")

print(f"% training: {X_train.shape[0]/len(historical_df)*100:.1f}%")

# Note: X_test comes from sample_features_df, so this is the number of
# submission rows relative to the number of historical rows.
print(f"% test: {X_test.shape[0]/len(historical_df)*100:.1f}%")

In [None]:
# Preview the first rows of the training feature matrix.
X_train.head()

In [None]:
# Preview the first training targets (percentage_docks_available from historical_df).
y_train.head()

In [None]:
# Preview the first test targets; y_test is taken from sample_target_df, not from historical_df.
y_test.head()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Build and fit the gradient-boosting regressor in one expression
# (fit() returns the fitted estimator), then predict for the test features.
gbr_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

y_pred = gbr_model.predict(X_test)

In [None]:
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Regression metrics of the predictions against y_test.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

metric_report = (
    ("Mean Squared Error: ", mse),
    ("Root Mean Squared Error: ", rmse),
    ("Mean Absolute Error: ", mae),
    ("R2 score: ", r2),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

In [None]:
# Turn the prediction array into a two-column frame: the positional "index"
# produced by reset_index() and the predicted "percentage_docks_available".
results_df = (
    pd.DataFrame(y_pred)
    .reset_index()
    .rename(columns={0: "percentage_docks_available"})
)
print(f"{results_df.min()=} {results_df.max()=}")
results_df.head()

In [None]:
# Count how many predictions fall outside the [0, 1] range.
below_zero_df = results_df.query("percentage_docks_available < 0")
above_one_df = results_df.query("percentage_docks_available > 1")

print("Total predictions:", len(results_df))
print("Total < 0:", len(below_zero_df))
print("Total > 1:", len(above_one_df))

In [None]:
# The target is a fraction of docks (the training data is filtered to [0, 1]),
# so predictions outside that range are clipped before being written.
# results_df itself is left untouched.
submission_df = results_df.assign(
    percentage_docks_available=results_df["percentage_docks_available"].clip(lower=0, upper=1)
)
submission_df.to_csv("submission.csv", index=False)