In [73]:
import pandas as pd
import joblib
from taxipred.utils.constants import TAXI_CSV_PATH

df = pd.read_csv(TAXI_CSV_PATH)
df.head()

Unnamed: 0,Trip_Distance_km,Time_of_Day,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,3.56,0.8,0.32,53.82,36.2624
1,36.87,Evening,2.7,1.21,0.15,37.27,52.9032
2,30.33,Evening,3.48,0.51,0.15,116.81,36.4698
3,8.64,Evening,2.93,0.63,0.32,22.64,15.618
4,8.64,Afternoon,2.55,1.71,0.48,89.33,60.2028


In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864 entries, 0 to 863
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       864 non-null    float64
 1   Time_of_Day            864 non-null    object 
 2   Base_Fare              864 non-null    float64
 3   Per_Km_Rate            864 non-null    float64
 4   Per_Minute_Rate        864 non-null    float64
 5   Trip_Duration_Minutes  864 non-null    float64
 6   Trip_Price             864 non-null    float64
dtypes: float64(6), object(1)
memory usage: 47.4+ KB


In [75]:
df = pd.get_dummies(
    df,
    columns=['Time_of_Day'],
    dtype=int
)

df_dummies = df

In [76]:
# Dela upp features och target/label
X, y = df_dummies.drop("Trip_Price", axis="columns"), df_dummies["Trip_Price"]
X.head()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Time_of_Day_Afternoon,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night
0,19.35,3.56,0.8,0.32,53.82,0,0,1,0
1,36.87,2.7,1.21,0.15,37.27,0,1,0,0
2,30.33,3.48,0.51,0.15,116.81,0,1,0,0
3,8.64,2.93,0.63,0.32,22.64,0,1,0,0
4,8.64,2.55,1.71,0.48,89.33,1,0,0,0


In [77]:
y.head()

0    36.2624
1    52.9032
2    36.4698
3    15.6180
4    60.2028
Name: Trip_Price, dtype: float64

In [78]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)

X_train.shape, X_test.shape

((578, 9), (286, 9))

In [79]:
# Ska inte skala med RandomForest ( inte skala näver från björken )

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred[:10]

array([ 24.180529  , 246.45745006,  42.970597  ,  72.593387  ,
        23.671378  ,  49.177462  ,  27.453129  ,  28.583658  ,
        40.887542  ,  50.003612  ])

In [80]:
y_test[:10].values

array([ 18.7636    , 224.91466286,  35.3812    ,  81.8518    ,
        20.7765    ,  48.4142    ,  20.9951    ,  23.9563    ,
        46.316     ,  54.9343    ])

In [81]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae:.2f}')

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse:.2f}')

rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')

r2 = r2_score(y_test, y_pred)
print(f'R-squared (R²): {r2:.2f}')

Mean Absolute Error (MAE): 4.73
Mean Squared Error (MSE): 43.20
Root Mean Squared Error (RMSE): 6.57
R-squared (R²): 0.96


In [82]:
# Använd random forest feature importance
model.feature_importances_

array([8.24574476e-01, 5.98778438e-03, 8.23877012e-02, 2.85366693e-02,
       5.58962204e-02, 7.01886048e-04, 8.29925184e-04, 6.95750860e-04,
       3.89586986e-04])

In [83]:
df.columns

Index(['Trip_Distance_km', 'Base_Fare', 'Per_Km_Rate', 'Per_Minute_Rate',
       'Trip_Duration_Minutes', 'Trip_Price', 'Time_of_Day_Afternoon',
       'Time_of_Day_Evening', 'Time_of_Day_Morning', 'Time_of_Day_Night'],
      dtype='object')

In [84]:
feature_importance = pd.DataFrame([X.columns, model.feature_importances_])
feature_importance

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Time_of_Day_Afternoon,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night
1,0.824574,0.005988,0.082388,0.028537,0.055896,0.000702,0.00083,0.000696,0.00039


In [85]:
# Transponera, rader bli kolumner o vise versa
feature_importance = feature_importance.T
feature_importance.columns = ["Feature", "Importance"]
feature_importance

Unnamed: 0,Feature,Importance
0,Trip_Distance_km,0.824574
1,Base_Fare,0.005988
2,Per_Km_Rate,0.082388
3,Per_Minute_Rate,0.028537
4,Trip_Duration_Minutes,0.055896
5,Time_of_Day_Afternoon,0.000702
6,Time_of_Day_Evening,0.00083
7,Time_of_Day_Morning,0.000696
8,Time_of_Day_Night,0.00039


## Trip_distance är det som har ÖVERLÄGSET störst påverkan

## TRÄNA NU PÅ ALL TESTDATA

In [88]:
# Kombinera X_train och X_test till en komplett dataset
X_full = df_dummies.drop("Trip_Price", axis="columns")
y_full = df_dummies["Trip_Price"]

# instans
final_model = RandomForestRegressor()

# Träna på all data
final_model.fit(X_full, y_full)

# Exportera
joblib.dump(final_model, 'random_forest_model.joblib')

['random_forest_model.joblib']