In [None]:
pip install faker osmnx



In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime
import osmnx as ox
import networkx as nx
from geopy.distance import geodesic
from datetime import timedelta

In [None]:
# Seed every random source this notebook draws from (Faker, the stdlib
# `random` module and NumPy's global generator) so that re-running the
# notebook draws the same pseudo-random values.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Initializing faker
fake = Faker()

In [None]:
# Dataset size: how many trips to generate, and the size of the pool of
# user ids those trips are randomly assigned to.
number_of_trips = 5_000
number_of_users = 500

## trip_hdr

In [None]:
# Generate one unique trip_id per trip
trip_ids = [fake.uuid4() for _ in range(number_of_trips)]

# Draw trip dates between Jan 1st of last year and today.
# The upper bound is midnight tomorrow. It is built with timedelta because
# datetime(year, month, day + 1) raises ValueError on the last day of a month.
today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
start_date = datetime(today.year - 1, 1, 1)
end_date = today + timedelta(days=1)
dates = [fake.date_between(start_date, end_date) for _ in range(number_of_trips)]

# Build a pool of user_ids, then assign a random one from the pool to every trip
users_ids_max = [fake.uuid4() for _ in range(number_of_users)]
user_ids = [random.choice(users_ids_max) for _ in range(number_of_trips)]

# Create a DataFrame with one row per trip
trip_data = pd.DataFrame({
    'trip_id': trip_ids,
    'trip_date': dates,
    'user_id': user_ids
})

print(trip_data.shape)
trip_data.head()

(5000, 3)


Unnamed: 0,trip_id,trip_date,user_id
0,520b6580-45ba-4977-9772-3f1cf2eceaa5,2023-10-12,587cba4a-7cce-4c98-a6f1-5ffdf35a3097
1,c0bc3044-c7fc-4737-a3f1-0dd537614855,2023-11-13,c870f4e2-4fb6-4519-91ef-ffdb27700e63
2,9c4fbe08-a7a7-4d64-ab41-c72e7f0cdd76,2024-08-09,98712c87-bdac-40f3-bf26-e24acc1892c3
3,797b65c5-7da2-468d-a134-fa43708d6835,2024-03-31,ff36dea6-40a9-4d27-b54c-6329eb69f2fb
4,c4a336e8-8637-4478-afcf-e3cdff016af4,2023-10-25,be5fce0b-90cb-49be-9e74-55f232fd22bb


## users

In [None]:
# Build one user record for every distinct user_id that appears in the trips
unique_user_ids = list(set(user_ids))
num_users = len(unique_user_ids)

# Fake display name for each user
user_names = [fake.name() for _ in range(num_users)]

# Random demographic attributes, one value per user.
# NOTE(review): genders, language, member and age are generated here but are
# not added to users_dataset below - confirm whether they should be columns.
genders = [random.choice(['Male', 'Female']) for _ in range(num_users)]
language = [random.choice(['English', 'French', 'Dutch']) for _ in range(num_users)]
member = [random.choice([True, False]) for _ in range(num_users)]
age = [random.randint(18, 50) for _ in range(num_users)]

# Load the vehicle catalogue and give every user a randomly chosen vehicle_id
# (the same vehicle can be picked for several users).
vehicle_dataset = pd.read_csv('/content/vehicle_dataset.csv')
cars_uid = vehicle_dataset['vehicle_id'].tolist()
cars_uid_list = [random.choice(cars_uid) for _ in range(num_users)]

# Assemble the users table
users_columns = {
    'user_id': unique_user_ids,
    'user_name': user_names,
    'vehicle_id': cars_uid_list,
}
users_dataset = pd.DataFrame(users_columns)

print(users_dataset.shape)
users_dataset.head()

(500, 3)


Unnamed: 0,user_id,user_name,vehicle_id
0,3000458a-c28f-443f-8bf7-84427a4daeb5,Jacob Holland,82babb1d-b327-4a6d-9fef-9a2f43ed278c
1,e3e8e320-f049-45a7-bbaf-afef773b2165,Joshua Brown,46813ffe-8a4c-4431-bcd7-ac0745fe28a4
2,8d927c6c-6899-4708-a4b0-df0f4116222d,Janet Cummings,6450f876-985b-4602-9c27-385480963b23
3,fdca73f5-3f2b-4bbd-a205-0b441e36cf87,Brian Khan,f288b4a7-b827-46ec-8a4b-25b0a4fef44c
4,128652d7-3baa-4518-9b62-57918c803a47,Kimberly Ingram,f288b4a7-b827-46ec-8a4b-25b0a4fef44c


## trip_line

In [None]:
# Function to generate trip data
def generate_trip_data(trip_id, start_time, duration_minutes, G):
    """Generate the point-by-point trace of one trip along a random path in G.

    Two nodes are drawn at random and joined by the shortest path weighted by
    the edge attribute 'length'. If the pair is not connected, a message is
    printed and a new pair is drawn until a path is found. Every node on the
    path becomes one row; timestamps are spaced evenly so the first point is
    at start_time and the last at start_time + duration_minutes.

    Args:
        trip_id: Identifier copied into every row of the result.
        start_time: Timestamp of the first point; must support adding a
            timedelta.
        duration_minutes: Total duration of the trip, in minutes.
        G: Graph whose nodes carry 'y' and 'x' attributes, which are written
            to the latitude and longitude columns respectively.

    Returns:
        pandas.DataFrame with columns trip_id, latitude, longitude, timestamp
        and one row per node of the chosen path.
    """
    # Materialise the node collection once instead of on every retry.
    candidate_nodes = np.asarray(list(G.nodes))

    while True:
        # Randomly select start and end nodes
        start_node = np.random.choice(candidate_nodes)
        end_node = np.random.choice(candidate_nodes)

        try:
            path = nx.shortest_path(G, start_node, end_node, weight='length')
        except nx.NetworkXNoPath:
            # No route between this pair: report it and draw a new pair.
            print(f"No path between {start_node} and {end_node}. Retrying...")
            continue
        break

    path_coords = [(G.nodes[node]['y'], G.nodes[node]['x']) for node in path]
    num_points = len(path_coords)

    # There are num_points - 1 intervals between num_points samples; a path of
    # a single node (start == end) has no interval, so its step is zero.
    step_minutes = duration_minutes / (num_points - 1) if num_points > 1 else 0
    timestamps = [start_time + timedelta(minutes=i * step_minutes) for i in range(num_points)]

    trip_df = pd.DataFrame({
        'trip_id': [trip_id] * num_points,
        'latitude': [coord[0] for coord in path_coords],
        'longitude': [coord[1] for coord in path_coords],
        'timestamp': timestamps
    })

    return trip_df

# Function to generate the full trip dataset
def generate_full_trip_dataset(trip_data, G):
    """Generate a trace for every row of trip_data and stack them in one frame.

    For each row, 'trip_date' is converted with pd.to_datetime and used as the
    start time, a duration is drawn with np.random.randint(10, 120) (upper
    bound excluded), and generate_trip_data builds the trace on graph G.

    Args:
        trip_data: DataFrame with 'trip_id' and 'trip_date' columns.
        G: Graph passed through unchanged to generate_trip_data.

    Returns:
        The per-trip frames concatenated in row order with a fresh 0..n-1 index.
    """
    def _trace_for(row):
        # Start time first, then the random duration, then the trace itself.
        start = pd.to_datetime(row['trip_date'])
        minutes = np.random.randint(10, 120)
        return generate_trip_data(row['trip_id'], start, minutes, G)

    traces = [_trace_for(row) for _, row in trip_data.iterrows()]
    return pd.concat(traces, ignore_index=True)

# Function to calculate distances between consecutive points
def add_trip_distances(df):
    """Add a 'distance_meters' column holding the distance to the previous point.

    A row gets the geodesic distance, in meters, from the row directly above
    it when both rows share the same trip_id. The first row of the frame and
    the first row of every trip get 0.0.

    Rows are paired by position, so the result does not depend on the frame's
    index labels. The column is added to df in place and df is returned.

    Args:
        df: DataFrame with 'trip_id', 'latitude' and 'longitude' columns, with
            the rows of each trip stored contiguously and in travel order.

    Returns:
        The same DataFrame object, with 'distance_meters' added or overwritten.
    """
    # Pull the columns out once: per-row .iloc access is very slow on large frames.
    trip_ids = df['trip_id'].to_numpy()
    latitudes = df['latitude'].to_numpy()
    longitudes = df['longitude'].to_numpy()

    distances = np.zeros(len(df), dtype=float)
    for pos in range(1, len(df)):
        # Only measure between consecutive points of the same trip.
        if trip_ids[pos] == trip_ids[pos - 1]:
            previous_point = (latitudes[pos - 1], longitudes[pos - 1])
            current_point = (latitudes[pos], longitudes[pos])
            distances[pos] = geodesic(previous_point, current_point).meters

    # Assigning an array is positional, unlike df.at[i, ...] which is label-based.
    df['distance_meters'] = distances
    return df

In [None]:
# Download the drivable street network for the area the trips are generated in
place_name = "Antwerp, Belgium"
G = ox.graph_from_place(query=place_name, network_type="drive")

In [None]:
# Generate the full trip dataset on the street graph G built in the previous
# cell. The graph is reused rather than downloaded again, which would only
# repeat the network request.
trip_line_dataset = generate_full_trip_dataset(trip_data, G)

# Ensure a clean 0..n-1 index before computing point-to-point distances
trip_line_dataset = trip_line_dataset.reset_index(drop=True)

trip_line_dataset = add_trip_distances(trip_line_dataset)

# np.random.randint excludes the upper bound, so signal_strength is in 1..4.
# NOTE(review): confirm whether a 1..5 scale was intended.
trip_line_dataset['signal_strength'] = np.random.randint(1, 5, size=trip_line_dataset.shape[0])

# Constant location columns: every trip is generated inside Antwerp
trip_line_dataset['city'] = 'Antwerp'
trip_line_dataset['country'] = 'Belgium'
trip_line_dataset['region'] = 'Flanders'

No path between 26126865 and 26194556. Retrying...
No path between 306482615 and 26153153. Retrying...
No path between 26152473 and 306484485. Retrying...
No path between 27467406 and 26021208. Retrying...
No path between 295672590 and 2044067169. Retrying...
No path between 312406521 and 26308464. Retrying...
No path between 26854477 and 306484489. Retrying...
No path between 252816230 and 26109346. Retrying...
No path between 21280952 and 26401931. Retrying...
No path between 320663391 and 258342572. Retrying...
No path between 206130654 and 7106153996. Retrying...
No path between 26316741 and 1419414465. Retrying...
No path between 26555097 and 9224805938. Retrying...
No path between 293908050 and 2789306313. Retrying...
No path between 6612939304 and 26384645. Retrying...
No path between 60474882 and 180456812. Retrying...
No path between 82647678 and 2205465848. Retrying...
No path between 36758234 and 21281161. Retrying...
No path between 1731904658 and 6612939285. Retrying...
No

## exports

In [None]:
# Write the trip header table to CSV, then trigger a download of that file.
# NOTE(review): `files` is not imported in the cells shown here; presumably it
# is google.colab.files - confirm it is in scope on a fresh kernel.
filename = 'trip_data.csv'
trip_data.to_csv(path_or_buf=filename, index=False)
files.download(filename)
trip_data.head()

In [None]:
# Write the point-level trip table to CSV, then trigger a download of that file.
# NOTE(review): `files` is not imported in the cells shown here; presumably it
# is google.colab.files - confirm it is in scope on a fresh kernel.
filename = 'trip_line_data.csv'
trip_line_dataset.to_csv(path_or_buf=filename, index=False)
files.download(filename)
trip_line_dataset.head()

In [None]:
# Write the users table to CSV, then trigger a download of that file.
# NOTE(review): `files` is not imported in the cells shown here; presumably it
# is google.colab.files - confirm it is in scope on a fresh kernel.
filename = 'users_data.csv'
users_dataset.to_csv(path_or_buf=filename, index=False)
files.download(filename)
users_dataset.head()

In [None]:
# Denormalise everything into one wide table with left joins:
# trips -> users (on user_id) -> vehicles (on vehicle_id) -> trip points (on trip_id).
# The last join yields one row per trip point.
trip_and_users = trip_data.merge(users_dataset, how='left', on='user_id')
users_and_vehicles = trip_and_users.merge(vehicle_dataset, how='left', on='vehicle_id')
trip_full_data = users_and_vehicles.merge(trip_line_dataset, how='left', on='trip_id')
trip_full_data.head()

Unnamed: 0,trip_id,trip_date,user_id,user_name,vehicle_id,vehicle_manufacturer,vehicle_name,vehicle_size,CO2/km,consumption/km,fuel_type,latitude,longitude,timestamp,distance_meters,signal_strength,city,country,region
0,520b6580-45ba-4977-9772-3f1cf2eceaa5,2023-10-12,587cba4a-7cce-4c98-a6f1-5ffdf35a3097,Alexis Carson,9f9bf984-a467-44d4-9127-702ec760ea5e,Kia,Rio,Small,115,5.1,Petrol,51.185003,4.339487,2023-10-12 00:00:00.000000,0.0,3,Antwerp,Belgium,Flanders
1,520b6580-45ba-4977-9772-3f1cf2eceaa5,2023-10-12,587cba4a-7cce-4c98-a6f1-5ffdf35a3097,Alexis Carson,9f9bf984-a467-44d4-9127-702ec760ea5e,Kia,Rio,Small,115,5.1,Petrol,51.184533,4.338863,2023-10-12 00:00:36.410256,68.13922,1,Antwerp,Belgium,Flanders
2,520b6580-45ba-4977-9772-3f1cf2eceaa5,2023-10-12,587cba4a-7cce-4c98-a6f1-5ffdf35a3097,Alexis Carson,9f9bf984-a467-44d4-9127-702ec760ea5e,Kia,Rio,Small,115,5.1,Petrol,51.184906,4.33815,2023-10-12 00:01:12.820513,64.821917,3,Antwerp,Belgium,Flanders
3,520b6580-45ba-4977-9772-3f1cf2eceaa5,2023-10-12,587cba4a-7cce-4c98-a6f1-5ffdf35a3097,Alexis Carson,9f9bf984-a467-44d4-9127-702ec760ea5e,Kia,Rio,Small,115,5.1,Petrol,51.185785,4.338261,2023-10-12 00:01:49.230769,98.187658,2,Antwerp,Belgium,Flanders
4,520b6580-45ba-4977-9772-3f1cf2eceaa5,2023-10-12,587cba4a-7cce-4c98-a6f1-5ffdf35a3097,Alexis Carson,9f9bf984-a467-44d4-9127-702ec760ea5e,Kia,Rio,Small,115,5.1,Petrol,51.185998,4.338455,2023-10-12 00:02:25.641026,27.226805,4,Antwerp,Belgium,Flanders


In [None]:
# Write the fully joined table to CSV, then trigger a download of that file.
# NOTE(review): `files` is not imported in the cells shown here; presumably it
# is google.colab.files - confirm it is in scope on a fresh kernel.
filename = 'trip_full_data.csv'
trip_full_data.to_csv(path_or_buf=filename, index=False)
files.download(filename)
trip_full_data.head()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,trip_id,trip_date,user_id,user_name,vehicle_id,vehicle_manufacturer,vehicle_name,vehicle_size,CO2/km,consumption/km,fuel_type,latitude,longitude,timestamp,distance_meters,signal_strength,city,country,region
0,520b6580-45ba-4977-9772-3f1cf2eceaa5,2023-10-12,587cba4a-7cce-4c98-a6f1-5ffdf35a3097,Alexis Carson,9f9bf984-a467-44d4-9127-702ec760ea5e,Kia,Rio,Small,115,5.1,Petrol,51.185003,4.339487,2023-10-12 00:00:00.000000,0.0,3,Antwerp,Belgium,Flanders
1,520b6580-45ba-4977-9772-3f1cf2eceaa5,2023-10-12,587cba4a-7cce-4c98-a6f1-5ffdf35a3097,Alexis Carson,9f9bf984-a467-44d4-9127-702ec760ea5e,Kia,Rio,Small,115,5.1,Petrol,51.184533,4.338863,2023-10-12 00:00:36.410256,68.13922,1,Antwerp,Belgium,Flanders
2,520b6580-45ba-4977-9772-3f1cf2eceaa5,2023-10-12,587cba4a-7cce-4c98-a6f1-5ffdf35a3097,Alexis Carson,9f9bf984-a467-44d4-9127-702ec760ea5e,Kia,Rio,Small,115,5.1,Petrol,51.184906,4.33815,2023-10-12 00:01:12.820513,64.821917,3,Antwerp,Belgium,Flanders
3,520b6580-45ba-4977-9772-3f1cf2eceaa5,2023-10-12,587cba4a-7cce-4c98-a6f1-5ffdf35a3097,Alexis Carson,9f9bf984-a467-44d4-9127-702ec760ea5e,Kia,Rio,Small,115,5.1,Petrol,51.185785,4.338261,2023-10-12 00:01:49.230769,98.187658,2,Antwerp,Belgium,Flanders
4,520b6580-45ba-4977-9772-3f1cf2eceaa5,2023-10-12,587cba4a-7cce-4c98-a6f1-5ffdf35a3097,Alexis Carson,9f9bf984-a467-44d4-9127-702ec760ea5e,Kia,Rio,Small,115,5.1,Petrol,51.185998,4.338455,2023-10-12 00:02:25.641026,27.226805,4,Antwerp,Belgium,Flanders
