In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

In [3]:
raw_parquet = pd.read_parquet('../data/raw/reunion_segments.parquet')
df_parquet = pd.DataFrame(raw_parquet)

In [8]:
df_parquet['altitude_profile'].dtype

dtype('O')

In [11]:
#create a fonction that cut a segment by 100m portions and return a list of elevation gain for each portion
# then the portion should fill a model to predict the time for the whole segment
def cut_segment(altitude_profile, distance_profile, portion_length=100):
    portions = []
    
    # Convertir en numpy arrays si nécessaire
    alt_array = np.array(altitude_profile)
    dist_array = np.array(distance_profile)
    
    total_distance = dist_array[-1]
    num_portions = int(total_distance // portion_length) + 1
    
    for i in range(num_portions):
        start_distance = i * portion_length
        end_distance = min((i + 1) * portion_length, total_distance)
        
        mask = (dist_array >= start_distance) & (dist_array < end_distance)
        indices = np.where(mask)[0]
        
        if len(indices) > 0:
            elevation_gain = alt_array[indices[-1]] - alt_array[indices[0]]
            portions.append({
                'start_distance': start_distance,
                'end_distance': end_distance,
                'elevation_gain': elevation_gain
            })
    
    return portions

In [12]:
cut_segment(df_parquet['altitude_profile'].iloc[0], df_parquet['distance_profile'].iloc[0])

[{'start_distance': 0,
  'end_distance': 100,
  'elevation_gain': np.float64(-3.6000000000000014)},
 {'start_distance': 100,
  'end_distance': 200,
  'elevation_gain': np.float64(7.200000000000003)},
 {'start_distance': 200,
  'end_distance': 300,
  'elevation_gain': np.float64(3.6000000000000085)},
 {'start_distance': 300,
  'end_distance': 400,
  'elevation_gain': np.float64(5.400000000000006)},
 {'start_distance': 400,
  'end_distance': 500,
  'elevation_gain': np.float64(4.0)},
 {'start_distance': 500,
  'end_distance': 600,
  'elevation_gain': np.float64(4.599999999999994)},
 {'start_distance': 600,
  'end_distance': 700,
  'elevation_gain': np.float64(4.199999999999989)},
 {'start_distance': 700,
  'end_distance': 800,
  'elevation_gain': np.float64(5.0)},
 {'start_distance': 800,
  'end_distance': 900,
  'elevation_gain': np.float64(4.400000000000006)},
 {'start_distance': 900,
  'end_distance': 1000,
  'elevation_gain': np.float64(4.200000000000003)},
 {'start_distance': 1000,


### Modèle

In [14]:
# %% Function to extract features from a profile
def extract_features(profile):
    """Extract simple features from 100m elevation profile"""
    gains = [chunk['elevation_gain'] for chunk in profile]
    
    return {
        'total_distance': profile[-1]['end_distance'],
        'total_elevation_gain': sum(g for g in gains if g > 0),
        'total_elevation_loss': sum(abs(g) for g in gains if g < 0),
        'avg_grade': np.mean(gains) / 100 * 100,  # % moyen
        'max_gain': max(gains),
        'max_loss': min(gains),
    }

In [15]:
def build_segments_df(df_parquet):
    """Build segments_df with profile column from raw parquet data"""
    profiles = []
    
    for altitude, distance in zip(df_parquet['altitude_profile'], df_parquet['distance_profile']):
        profile = cut_segment(altitude, distance)
        profiles.append(profile)
    
    segments_df = df_parquet.copy()
    segments_df['profile'] = profiles
    
    return segments_df

In [17]:
# %% Prepare training data
def prepare_data(segments_df):
    """Prepare X and y from segments dataframe"""
    features_list = []
    times = []
    
    for _, row in segments_df.iterrows():
        profile = row['profile']
        time = row['average_top_10_time']
        
        if profile and time and not np.isnan(time):
            features = extract_features(profile)
            features_list.append(features)
            times.append(time)
    
    X = pd.DataFrame(features_list)
    y = np.array(times)
    
    return X, y

In [18]:
# %% Train model
def train_model(X, y):
    """Train a simple gradient boosting model"""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)
    model.fit(X_train, y_train)
    
    # Evaluate
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"MAE: {mae:.0f}s ({mae/60:.1f} min)")
    print(f"R²: {r2:.3f}")
    
    return model

In [19]:
# %% Predict time for new profile
def predict_time(model, profile):
    """Predict time for a new profile"""
    features = extract_features(profile)
    X = pd.DataFrame([features])
    return model.predict(X)[0]

In [None]:

# %% ============ USAGE ============

# 1. Load your data (adapt to your actual data structure)
segments_df = build_segments_df(df_parquet)

# 2. Prepare data
X, y = prepare_data(segments_df)

# 3. Train
model = train_model(X, y)

# 4. Save model
joblib.dump(model, '../src/models/time_predictor.joblib')

# 5. Predict on new profile
mon_profil = [
    {'start_distance': 0, 'end_distance': 100, 'elevation_gain': -3.6},
    {'start_distance': 100, 'end_distance': 200, 'elevation_gain': 7.2},
    {'start_distance': 200, 'end_distance': 300, 'elevation_gain': 3.6},
    {'start_distance': 300, 'end_distance': 400, 'elevation_gain': 5.4},
    {'start_distance': 400, 'end_distance': 500, 'elevation_gain': 4.0},
    # ... add more chunks
]


MAE: 151s (2.5 min)
R²: 0.923


In [22]:
predicted_time = predict_time(model, mon_profil)
print(f"Temps estimé: {predicted_time:.0f}s ({predicted_time/60:.1f} min)")

Temps estimé: 191s (3.2 min)
