In [1]:
import pandas as pd
import joblib
from taxipred.utils.constants import TAXI_CSV_PATH

df = pd.read_csv(TAXI_CSV_PATH)
df.head()

Unnamed: 0,Trip_Distance_km,Time_of_Day,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,3.56,0.8,0.32,53.82,36.2624
1,36.87,Evening,2.7,1.21,0.15,37.27,52.9032
2,30.33,Evening,3.48,0.51,0.15,116.81,36.4698
3,8.64,Afternoon,2.55,1.71,0.48,89.33,60.2028
4,30.45,Morning,2.77,1.78,0.34,110.33,94.4832


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       768 non-null    float64
 1   Time_of_Day            768 non-null    object 
 2   Base_Fare              768 non-null    float64
 3   Per_Km_Rate            768 non-null    float64
 4   Per_Minute_Rate        768 non-null    float64
 5   Trip_Duration_Minutes  768 non-null    float64
 6   Trip_Price             768 non-null    float64
dtypes: float64(6), object(1)
memory usage: 42.1+ KB


In [3]:
df = pd.get_dummies(
    df,
    columns=['Time_of_Day'],
    dtype=int
)

df_dummies = df

In [4]:
# Dela upp features och target/label
X, y = df_dummies.drop("Trip_Price", axis="columns"), df_dummies["Trip_Price"]
X.head()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Time_of_Day_Afternoon,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night
0,19.35,3.56,0.8,0.32,53.82,0,0,1,0
1,36.87,2.7,1.21,0.15,37.27,0,1,0,0
2,30.33,3.48,0.51,0.15,116.81,0,1,0,0
3,8.64,2.55,1.71,0.48,89.33,1,0,0,0
4,30.45,2.77,1.78,0.34,110.33,0,0,1,0


In [5]:
y.head()

0    36.2624
1    52.9032
2    36.4698
3    60.2028
4    94.4832
Name: Trip_Price, dtype: float64

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)

X_train.shape, X_test.shape

((514, 9), (254, 9))

In [7]:
# Ska inte skala med RandomForest ( inte skala näver från björken )

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred[:10]

array([58.349735, 61.23604 , 70.115799, 34.573525, 43.352508, 32.379012,
       44.762321, 44.971612, 57.608739, 29.446813])

In [8]:
y_test[:10].values

array([57.0755, 59.1773, 64.8642, 31.6941, 38.9965, 29.0584, 50.696 ,
       43.0697, 59.886 , 28.4048])

In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae:.2f}')

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse:.2f}')

rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')

r2 = r2_score(y_test, y_pred)
print(f'R-squared (R²): {r2:.2f}')

Mean Absolute Error (MAE): 4.60
Mean Squared Error (MSE): 47.18
Root Mean Squared Error (RMSE): 6.87
R-squared (R²): 0.95


In [10]:
# Använd random forest feature importance
model.feature_importances_

array([0.5928591 , 0.0084649 , 0.2121704 , 0.07701342, 0.10428036,
       0.00145993, 0.00138623, 0.00138359, 0.00098207])

In [11]:
df.columns

Index(['Trip_Distance_km', 'Base_Fare', 'Per_Km_Rate', 'Per_Minute_Rate',
       'Trip_Duration_Minutes', 'Trip_Price', 'Time_of_Day_Afternoon',
       'Time_of_Day_Evening', 'Time_of_Day_Morning', 'Time_of_Day_Night'],
      dtype='object')

In [12]:
feature_importance = pd.DataFrame([X.columns, model.feature_importances_])
feature_importance

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Time_of_Day_Afternoon,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night
1,0.592859,0.008465,0.21217,0.077013,0.10428,0.00146,0.001386,0.001384,0.000982


In [13]:
# Transponera, rader bli kolumner o vise versa
feature_importance = feature_importance.T
feature_importance.columns = ["Feature", "Importance"]
feature_importance

Unnamed: 0,Feature,Importance
0,Trip_Distance_km,0.592859
1,Base_Fare,0.008465
2,Per_Km_Rate,0.21217
3,Per_Minute_Rate,0.077013
4,Trip_Duration_Minutes,0.10428
5,Time_of_Day_Afternoon,0.00146
6,Time_of_Day_Evening,0.001386
7,Time_of_Day_Morning,0.001384
8,Time_of_Day_Night,0.000982


## Trip_distance är det som har ÖVERLÄGSET störst påverkan

## TRÄNA NU PÅ ALL TESTDATA

In [14]:
# Kombinera X_train och X_test till en komplett dataset
X_full = df_dummies.drop("Trip_Price", axis="columns")
y_full = df_dummies["Trip_Price"]

# instans
final_model = RandomForestRegressor()

# Träna på all data
final_model.fit(X_full, y_full)

# Exportera
joblib.dump(final_model, 'random_forest_model_no_outliers.joblib')

['random_forest_model_no_outliers.joblib']