In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
import joblib
import plotly.graph_objects as go
from sklearn.pipeline import Pipeline 

In [2]:
# Load the raw segment export, rename its `id` column to `segment_id`
# (the key used for the merge further down), and keep only 'Ride' rows.
raw_parquet = pd.read_parquet('../data/raw/reunion_segments.parquet')
df_parquet = raw_parquet.rename(columns={"id": "segment_id"})
df_parquet_ride = df_parquet.loc[df_parquet['activity_type'].eq('Ride')]

In [4]:
# CSV export of the segments.
# NOTE(review): `raw_csv` is not referenced by the other cells visible here —
# confirm it is still needed before keeping this load.
raw_csv = pd.read_csv(filepath_or_buffer='../data/raw/reunion_segments.csv')

In [13]:
# Load the hand-labeled segments and keep the rows whose `technicality`
# label equals 1.
raw_manually_labeled = pd.read_csv(
    '../data/processed/segments_manually_labeled.csv'
)
df_manually_labeled = pd.DataFrame(raw_manually_labeled)
df_manually_labeled_t1 = df_manually_labeled.loc[
    df_manually_labeled['technicality'].eq(1)
]

In [5]:
# Join the ride segments with their manual labels on the shared key.
# `how='inner'` is the pandas default, spelled out here: only segments
# present in both frames are kept.
df = pd.merge(
    df_parquet_ride,
    df_manually_labeled_t1,
    how='inner',
    on='segment_id',
)

### Modèle basique

In [6]:
# Feature matrix: every column of the merged frame except the ones listed
# below (the target itself plus columns excluded from the model inputs).
# NOTE(review): `average_top_10_time` / `tenth_best_time` are presumably
# dropped to avoid leaking the target — confirm.
X = df.drop(
    columns=[
        'best_time',
        'name',
        'average_top_10_time',
        'tenth_best_time',
        'activity_type',
        'segment_id',
        'technicality',
        'altitude_profile',
        'distance_profile',
        'coordinates',
    ]
)

# Regression target.
y = df.loc[:, 'best_time']

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Baseline model: standardize the features, then fit an ordinary linear
# regression on the training split.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', LinearRegression()),
    ]
)
# Left as the last expression so the notebook displays the fitted estimator.
pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('scaler', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [30]:
# Evaluate the fitted pipeline on the held-out split.
# The metric functions are imported once in the notebook's import cell, so
# they are not re-imported here (keeps all dependencies in one place).
y_pred = pipeline.predict(X_test)

print(f'Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.2f} seconds')
# mean_absolute_percentage_error returns a fraction, hence the * 100.
print(f'Mean Absolute Percentage Error: {mean_absolute_percentage_error(y_test, y_pred)*100:.2f} %')
print(f'R^2 Score: {r2_score(y_test, y_pred):.2f}')

Mean Absolute Error: 38.08 seconds
Mean Absolute Percentage Error: 14.79 %
R^2 Score: 0.62


In [31]:
# Actual vs. predicted times on the held-out split. The dashed red line is
# the y = x reference, drawn across the range of the actual values.
fig = go.Figure(
    data=[
        go.Scatter(
            x=y_test,
            y=y_pred,
            mode='markers',
            name='Pred vs Act',
            marker={'color': 'blue', 'size': 10, 'opacity': 0.7, 'symbol': 'cross'},
        ),
        go.Scatter(
            x=[y_test.min(), y_test.max()],
            y=[y_test.min(), y_test.max()],
            mode='lines',
            name='Ideal',
            line={'color': 'red', 'dash': 'dash'},
        ),
    ]
)
fig.update_layout(
    title='Actual vs Predicted Ride Times',
    xaxis_title='Actual Best Time (seconds)',
    yaxis_title='Predicted Best Time (seconds)',
)
fig.show()


### Modèle complexe