In [77]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.preprocessing import LabelEncoder


In [78]:
# Fetch data
data = pd.read_csv(r"C:\Users\user\Projects\vehicle-tracking\vehicle-tracking\Vehicle-tracking-system-django\transport_scheduling\transport\management\commands\journey_data.csv")

In [79]:
df = pd.DataFrame(data)

In [80]:
df.head()

Unnamed: 0,distance_km,start_lat,start_lon,end_lat,end_lon,weather,temperature,traffic_level,day_of_week,hour_of_day,journey_duration_min
0,33.003689,0.275185,29.056996,-0.298742,29.189881,Rainy,18.815241,Medium,Weekday,8,116
1,40.152417,0.573357,29.887398,0.055954,29.750613,Sunny,29.312507,High,Weekend,0,13
2,28.254934,-1.824687,29.160735,-0.384112,29.265877,Sunny,26.712154,High,Weekend,11,88
3,31.037244,-0.469407,29.31014,-1.123676,29.805186,Sunny,23.411148,Medium,Weekend,19,45
4,20.834707,-0.649282,29.636201,0.835812,29.723839,Sunny,15.093913,Medium,Weekday,17,119


In [65]:
df.shape


(500, 11)

In [66]:
df.columns.tolist()

['distance_km',
 'start_lat',
 'start_lon',
 'end_lat',
 'end_lon',
 'weather',
 'temperature',
 'traffic_level',
 'day_of_week',
 'hour_of_day',
 'journey_duration_min']

In [67]:
df.isnull().sum()


distance_km             0
start_lat               0
start_lon               0
end_lat                 0
end_lon                 0
weather                 0
temperature             0
traffic_level           0
day_of_week             0
hour_of_day             0
journey_duration_min    0
dtype: int64

In [68]:
# Define base speed assumptions (km/min)
speed_map = {
    "Low": np.random.uniform(0.7, 1.2),  # Fast speed (fewer delays)
    "Medium": np.random.uniform(0.4, 0.7),
    "High": np.random.uniform(0.2, 0.4),  # Slow speed (high congestion)
}

In [69]:
# Convert categorical traffic levels to speed
df["traffic_speed"] = df["traffic_level"].map(speed_map)


In [70]:
# Adjust duration calculation using distance and speed
df["journey_duration_min"] = df["distance_km"] / df["traffic_speed"]


In [81]:
# Add weather impact
weather_impact = {"Sunny": 1.0, "Cloudy": 1.2, "Rainy": 1.5}  # Rain slows travel
df["journey_duration_min"] *= df["weather"].map(weather_impact)

In [82]:
df = pd.get_dummies(df, columns=["weather", "traffic_level", "day_of_week"], drop_first=True)
df.drop(columns=["start_lat", "start_lon", "end_lat", "end_lon"], inplace=True)
df.columns.tolist()

['distance_km',
 'temperature',
 'hour_of_day',
 'journey_duration_min',
 'weather_Rainy',
 'weather_Sunny',
 'traffic_level_Low',
 'traffic_level_Medium',
 'day_of_week_Weekend']

In [74]:
# Features and target variable
X = df.drop(columns=["journey_duration_min"])
y = df["journey_duration_min"]

In [75]:
#Splitting the data into features and target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Model training
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [29]:
# Model evaluation
predictions = model.predict(X_test)
print("Mean Absolute Error:", mean_absolute_error(y_test, predictions))
print("Mean Squared Error:", mean_squared_error(y_test, predictions))
print("r2_score:",r2_score(y_test,predictions))

Mean Absolute Error: 3.704836194580155
Mean Squared Error: 31.198873485153317
r2_score: 0.9915503850351735


In [56]:
# Save the model and label encoders
joblib.dump(model, "journey_duration_model.pkl")

['journey_duration_model.pkl']