In [1]:
import pandas as pd
import numpy as np
import os
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from datetime import datetime, timedelta, time
import random, string
from faker import Faker
fake = Faker()

In [2]:
base_dir = os.getcwd()

In [3]:
#  Read csv files
users = pd.read_csv(f"{base_dir}\\data\\users.csv")
vehicles = pd.read_csv(f"{base_dir}\\data\\vehicles.csv")
transactions = pd.read_csv(f"{base_dir}\\data\\rental_transactions.csv")

In [4]:
users['update_at'] = users['creation_date']

### import csv to mysql (users, vehicles, transactions)

In [5]:
def generate_insert_into(df, table_name="A"):
    columns = df.columns.tolist()
    columns_str = ", ".join(columns)
    placeholders = ", ".join(["%s"] * len(columns))
    insert_stmt = f"INSERT INTO {table_name} ({columns_str}) VALUES ({placeholders});"
    return insert_stmt

In [309]:
# query create table for users, vehicles, and transaction
query_create_table_users = """
CREATE TABLE users (
    user_id VARCHAR(50),
    first_name VARCHAR(255),
    last_name VARCHAR(255),
    email VARCHAR(255),
    phone_number VARCHAR(255),
    driver_license_number VARCHAR(50),
    driver_license_expiry DATE,
    creation_date DATE,
    is_active TINYINT(1),
    update_at DATE
);
"""

query_create_table_vehicles = """
CREATE TABLE vehicles (
    active TINYINT(1),
    vehicle_license_number VARCHAR(255),
    registration_name TEXT,
    license_type VARCHAR(255),
    expiration_date VARCHAR(50),
    permit_license_number VARCHAR(255),
    certification_date DATE,
    vehicle_year YEAR,
    base_telephone_number VARCHAR(255),
    base_address TEXT,
    vehicle_id VARCHAR(50),
    last_update_timestamp VARCHAR(50),
    brand VARCHAR(255),
    vehicle_type VARCHAR(255)
);
"""

query_create_table_transactions = """
CREATE TABLE transactions (
    rental_id VARCHAR(50),
    user_id VARCHAR(50),
    vehicle_id VARCHAR(50),
    rental_start_time DATETIME,
    rental_end_time DATETIME,
    pickup_location VARCHAR(255),
    dropoff_location VARCHAR(255),
    total_amount FLOAT
);
"""


In [310]:
# query insert table
query_insert_table_users = generate_insert_into(users, 'users')
query_insert_table_vehicle = generate_insert_into(vehicles, 'vehicles')
query_insert_table_transaction = generate_insert_into(transactions, 'transactions')

In [311]:
# Load .env
load_dotenv()

# Read MySQL connection info
hostname = os.getenv("MYSQL_HOST")
port = int(os.getenv("MYSQL_PORT"))
username = os.getenv("MYSQL_USER")
password = os.getenv("MYSQL_PASSWORD")

try:
    conn = mysql.connector.connect(
        host=hostname,
        user=username,
        password=password,
        port=port
    )
    
    if conn.is_connected():
        print("[INFO] connect to mysql success")
        cursor = conn.cursor()
        cursor.execute("CREATE DATABASE IF NOT EXISTS rental_vehicle;")
        cursor.execute("USE rental_vehicle;")
        cursor.execute("SELECT DATABASE();")
        record = cursor.fetchone()
        print(f"[INFO] you're connected to {record[0]} database")

        cursor.execute("DROP TABLE IF EXISTS users;")
        print("[INFO] drop table users if exist success")
        cursor.execute("DROP TABLE IF EXISTS vehicles;")
        print("[INFO] drop table vehicles if exist success")
        cursor.execute("DROP TABLE IF EXISTS transactions;")
        print("[INFO] drop table transactions if exist success")

        cursor.execute(query_create_table_users)
        print("[INFO] Create table user success")
        cursor.execute(query_create_table_vehicles)
        print("[INFO] Create table vehicles success")
        cursor.execute(query_create_table_transactions)
        print("[INFO] Create table transactions success")

        batch_size = 5000
        tables = {
            'users': users,
            'vehicles': vehicles,
            'transactions': transactions
        }
        insert_query = {
            'users': query_insert_table_users,
            'vehicles': query_insert_table_vehicle,
            'transactions': query_insert_table_transaction
        }
        for tbl in tables.keys():
            num_batches = len(tables[tbl]) // batch_size + 1
            for i in range(num_batches):
                start_idx = i * batch_size
                end_idx = ((i+1) * batch_size)
                batch_data = tables[tbl].iloc[start_idx:end_idx]
                batch_data_record = [tuple(row) for row in batch_data.to_numpy()]

                cursor.executemany(insert_query[tbl], batch_data_record)
                conn.commit()
                print(f"[INFO] insert batch {i+1}/{num_batches} success")

            cursor.execute(f"SELECT COUNT(*) FROM {tbl};")
            count_data = cursor.fetchone()
            print(f"[INFO] all {count_data[0]} {tbl} data inserted successfully")

except Error as e:
    print("Error while connecting to MySQL", e)

finally:
    if conn.is_connected():
        cursor.close()
        conn.close()
        print("MySQL connection is closed")


[INFO] connect to mysql success
[INFO] you're connected to rental_vehicle database
[INFO] drop table users if exist success
[INFO] drop table vehicles if exist success
[INFO] drop table transactions if exist success
[INFO] Create table user success
[INFO] Create table vehicles success
[INFO] Create table transactions success
[INFO] insert batch 1/7 success
[INFO] insert batch 2/7 success
[INFO] insert batch 3/7 success
[INFO] insert batch 4/7 success
[INFO] insert batch 5/7 success
[INFO] insert batch 6/7 success
[INFO] insert batch 7/7 success
[INFO] all 30000 users data inserted successfully
[INFO] insert batch 1/22 success
[INFO] insert batch 2/22 success
[INFO] insert batch 3/22 success
[INFO] insert batch 4/22 success
[INFO] insert batch 5/22 success
[INFO] insert batch 6/22 success
[INFO] insert batch 7/22 success
[INFO] insert batch 8/22 success
[INFO] insert batch 9/22 success
[INFO] insert batch 10/22 success
[INFO] insert batch 11/22 success
[INFO] insert batch 12/22 success


In [312]:
len(users), len(vehicles), len(transactions)

(30000, 109584, 20080)

### cleaning location files

In [6]:
locs = pd.read_csv(f"{base_dir}\\data\\locations.csv")

In [23]:
locs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   location_id    300 non-null    int64  
 1   location_name  300 non-null    object 
 2   address        300 non-null    object 
 3   city           300 non-null    object 
 4   state          300 non-null    object 
 5   zip_code       300 non-null    int64  
 6   latitude       300 non-null    float64
 7   longitude      300 non-null    float64
dtypes: float64(2), int64(2), object(4)
memory usage: 18.9+ KB


In [24]:
locs.head(3)

Unnamed: 0,location_id,location_name,address,city,state,zip_code,latitude,longitude
0,2702,"Jackson, Velazquez and Gonzales",3140 Heath Radial Apt. 604,Modesto,CA,94540,86.25802,-169.2448
1,4380,Bean LLC,51144 Patrick Isle Suite 397,Fontana,CA,92188,-74.455893,-42.279882
2,7709,Gilbert-Simmons,4738 Lewis Locks,Roseville,CA,91032,-65.430931,-64.763489


In [25]:
locs.isnull().sum()

location_id      0
location_name    0
address          0
city             0
state            0
zip_code         0
latitude         0
longitude        0
dtype: int64

In [26]:
locs.duplicated(subset=['location_id']).sum()

np.int64(0)

In [7]:
locs_new = locs.copy()
locs_new['location_name'] = locs['location_name'].apply(lambda x: x.lower().strip())
locs_new['address'] = locs['address'].apply(lambda x: x.lower().strip())
locs_new['city'] = locs['city'].apply(lambda x: x.lower().strip())
locs_new['state'] = locs['state'].apply(lambda x: x.lower().strip())

In [28]:
locs_new['state'].value_counts()

state
ca    300
Name: count, dtype: int64

In [29]:
locs_new.duplicated().sum()

np.int64(0)

location.csv is already clean. 

In [17]:
locs_new.to_csv(os.path.join(os.path.dirname(base_dir), "airflow\\include\\dbt_rent_vhc\\seeds\\", "raw_location_enrichment.csv"), index=False)

### generate new dummy data and update few data from existing data

In [11]:
# users -> creation_date
users_update = users.copy()

In [12]:
users[(users['creation_date'] == '2024-01-01') & (users['is_active'] == 1)]

Unnamed: 0,user_id,first_name,last_name,email,phone_number,driver_license_number,driver_license_expiry,creation_date,is_active,update_at
270,9ee562dc26,Amy,Weaver,amy.weaver@gmail.com,031-961-6726,ER081473,2033-10-01,2024-01-01,1,2024-01-01
433,a01f0bf908,Stacy,Marshall,stacy.marshall@gmail.com,+1-799-043-7200x0615,FY799991,2031-06-11,2024-01-01,1,2024-01-01
982,5485af3edb,Tina,Aguirre,tina.aguirre@yahoo.com,+1-619-767-4566x012,JK833411,2027-04-02,2024-01-01,1,2024-01-01
1075,fd2f1e130e,Kathleen,Hernandez,kathleen.hernandez@gmail.com,+1-974-178-1306x56176,PP657728,2031-06-14,2024-01-01,1,2024-01-01
1351,4ad1b5d96d,Jamie,Barajas,jamie.barajas@hotmail.com,342-109-4049,BX386488,2024-10-30,2024-01-01,1,2024-01-01
...,...,...,...,...,...,...,...,...,...,...
29396,a379e5e6ba,Kyle,Medina,kyle.medina@gmail.com,134.023.1167x8906,FV757499,2029-02-27,2024-01-01,1,2024-01-01
29496,485444df1c,Brian,White,brian.white@hotmail.com,999-975-2832x0489,XR862218,2027-11-11,2024-01-01,1,2024-01-01
29532,7c3c5e3fa2,Steven,Parks,steven.parks@gmail.com,(032)985-3458,SG498196,2027-12-01,2024-01-01,1,2024-01-01
29624,c04e1a9af0,Gabriel,Roy,gabriel.roy@yahoo.com,322-812-4764x56998,BZ468258,2028-04-29,2024-01-01,1,2024-01-01


In [13]:
# update few users data to check snapshots dbt later
users_update['update_at'] = np.where((users_update['creation_date'] == '2024-01-01') & (users_update['is_active'] == 1 ), 
                                                    datetime.today().date().strftime('%Y-%m-%d'), 
                                                    users_update['update_at'])
users_update['is_active'] = np.where(users_update['creation_date'] == '2024-01-01', 0, 1)
target_users = users_update[users_update['update_at'] == datetime.today().date().strftime('%Y-%m-%d')][['user_id', 'is_active', 'update_at']]

In [14]:
target_users

Unnamed: 0,user_id,is_active,update_at
270,9ee562dc26,0,2025-07-06
433,a01f0bf908,0,2025-07-06
982,5485af3edb,0,2025-07-06
1075,fd2f1e130e,0,2025-07-06
1351,4ad1b5d96d,0,2025-07-06
...,...,...,...
29396,a379e5e6ba,0,2025-07-06
29496,485444df1c,0,2025-07-06
29532,7c3c5e3fa2,0,2025-07-06
29624,c04e1a9af0,0,2025-07-06


In [15]:
users_update[users_update['update_at'] == datetime.today().date().strftime('%Y-%m-%d')]

Unnamed: 0,user_id,first_name,last_name,email,phone_number,driver_license_number,driver_license_expiry,creation_date,is_active,update_at
270,9ee562dc26,Amy,Weaver,amy.weaver@gmail.com,031-961-6726,ER081473,2033-10-01,2024-01-01,0,2025-07-06
433,a01f0bf908,Stacy,Marshall,stacy.marshall@gmail.com,+1-799-043-7200x0615,FY799991,2031-06-11,2024-01-01,0,2025-07-06
982,5485af3edb,Tina,Aguirre,tina.aguirre@yahoo.com,+1-619-767-4566x012,JK833411,2027-04-02,2024-01-01,0,2025-07-06
1075,fd2f1e130e,Kathleen,Hernandez,kathleen.hernandez@gmail.com,+1-974-178-1306x56176,PP657728,2031-06-14,2024-01-01,0,2025-07-06
1351,4ad1b5d96d,Jamie,Barajas,jamie.barajas@hotmail.com,342-109-4049,BX386488,2024-10-30,2024-01-01,0,2025-07-06
...,...,...,...,...,...,...,...,...,...,...
29396,a379e5e6ba,Kyle,Medina,kyle.medina@gmail.com,134.023.1167x8906,FV757499,2029-02-27,2024-01-01,0,2025-07-06
29496,485444df1c,Brian,White,brian.white@hotmail.com,999-975-2832x0489,XR862218,2027-11-11,2024-01-01,0,2025-07-06
29532,7c3c5e3fa2,Steven,Parks,steven.parks@gmail.com,(032)985-3458,SG498196,2027-12-01,2024-01-01,0,2025-07-06
29624,c04e1a9af0,Gabriel,Roy,gabriel.roy@yahoo.com,322-812-4764x56998,BZ468258,2028-04-29,2024-01-01,0,2025-07-06


In [16]:
# generate dummy users (created_date == '2025-07-05' or 'today')

In [17]:
def generate_random_alphanumeric():
    """
    Generates a random alphanumeric string of a specified length.

    Args:
        length (int): The desired length of the alphanumeric string.

    Returns:
        str: A random alphanumeric string.
    """
    letters = random.choices(string.ascii_uppercase, k=3)
    digits = random.choices(string.digits, k=7)
    combined = letters + digits
    random.shuffle(combined)  # agar urutannya acak
    return ''.join(combined).lower()

def generate_random_license_number(length):
    """
    Generates a random license_number.

    Args:
        length (int): The desired length of the alphanumeric string.

    Returns:
        str: A random license_number string.
    """
    char_digit = string.digits
    char_letter = string.ascii_letters
    license_num = ''
    for idx in range(length):
        if idx < 2:
            license_num.append(random.choice(char_letter))

    random_string = ''.join( )
    return random_string

exists_id = users_update['user_id'].values
exists_license = users_update['driver_license_number'].values
data = []

for i in range(500):
    new_id = generate_random_alphanumeric()
    new_license = ''.join(random.choices(string.ascii_uppercase, k=2)) + ''.join(random.choices(string.digits, k=6))
    while new_id in exists_id:
        new_id = generate_random_alphanumeric(10)
    while new_license in exists_license:
        new_license = ''.join(random.choices(string.ascii_uppercase, k=2)) + ''.join(random.choices(string.digits, k=6))
    
    data.append({
        'user_id':new_id,
        'first_name':fake.first_name(),
        'last_name':fake.last_name(),
        'email':fake.free_email(),
        'phone_number':fake.phone_number(),
        'driver_license_number':new_license,
        'driver_license_expiry':fake.date_between(start_date=(datetime.today() + timedelta(days=30)), end_date="+5y"),
        'creation_date':datetime.today().date(),
        'is_active':1,
        'update_at':datetime.today().date()
    })

users_new = pd.DataFrame(data)

In [18]:
users_new

Unnamed: 0,user_id,first_name,last_name,email,phone_number,driver_license_number,driver_license_expiry,creation_date,is_active,update_at
0,81a6c0599l,Christopher,Contreras,martinezchelsea@hotmail.com,711.417.4462,ZU291624,2027-02-10,2025-07-06,1,2025-07-06
1,m581601jt9,Leslie,Phillips,melaniebean@hotmail.com,(363)916-1989x803,CJ939203,2026-03-11,2025-07-06,1,2025-07-06
2,00hz69417z,James,Chaney,daviddavis@hotmail.com,+1-857-794-2782x62029,MN754413,2029-01-21,2025-07-06,1,2025-07-06
3,dl0694128n,Darryl,Dalton,dawn88@gmail.com,383-711-0169,US081734,2027-01-25,2025-07-06,1,2025-07-06
4,3186lu3b67,Jason,Kaiser,singheric@gmail.com,938.653.8201,BS463694,2026-05-09,2025-07-06,1,2025-07-06
...,...,...,...,...,...,...,...,...,...,...
495,k644y7d765,Veronica,Williams,williamjohnson@yahoo.com,001-736-563-9385x775,NS504568,2027-09-26,2025-07-06,1,2025-07-06
496,k855451en3,Robert,Allen,collinskeith@hotmail.com,001-256-595-1321x17971,WZ214211,2028-10-22,2025-07-06,1,2025-07-06
497,0bmo050524,Margaret,Williams,johnsonemily@hotmail.com,(816)973-7730x4643,VG532926,2026-01-12,2025-07-06,1,2025-07-06
498,pt7573730t,Tami,Lambert,hscott@hotmail.com,8346526470,EZ027182,2028-05-09,2025-07-06,1,2025-07-06


In [19]:
# vehicle -> last_update_timestamp
vehicles_update = vehicles.copy()

In [20]:
# update few vehicles data to check snapshots dbt later
vehicles_update['active'] = np.where(vehicles_update['vehicle_year'] < 2005, 0, 1)
vehicles_update['last_update_timestamp'] = np.where(vehicles_update['vehicle_year'] < 2005, 
                                                    datetime.today().date().strftime('%d-%m-%Y %H:%M:%S'), 
                                                    vehicles_update['last_update_timestamp'])

In [21]:
vehicles_update

Unnamed: 0,active,vehicle_license_number,registration_name,license_type,expiration_date,permit_license_number,certification_date,vehicle_year,base_telephone_number,base_address,vehicle_id,last_update_timestamp,brand,vehicle_type
0,1,5818886,"CITY,LIVERY,LEASING,QUEENS,INC",FOR HIRE VEHICLE,27-09-2025,6EPABCVK,2018-01-09,2018,(646)780-0129,1515 THIRD STREET SAN FRANCISCO CA 94158,67789f742d,04-06-2024 13:25:00,Ferrari,high_end
1,1,5520432,"FERNANDEZ,JOSE,A",FOR HIRE VEHICLE,08-01-2026,IC0VQ8EC,2015-01-21,2015,(646)780-0129,1515 THIRD STREET SAN FRANCISCO CA 94158,70e8c42e4f,04-06-2024 13:25:00,BMW,premium
2,1,5790608,"RIGO,LIMO-AUTO,CORP",FOR HIRE VEHICLE,19-06-2025,AGTGT62I,2020-03-31,2020,(646)780-0129,1515 THIRD STREET SAN FRANCISCO CA 94158,aa2522d199,04-06-2024 13:25:00,Toyota,basic
3,1,6045671,"NARZIEV,LAZIZJON",FOR HIRE VEHICLE,22-11-2025,OO9QLG6E,2022-11-09,2022,(646)780-0129,1515 THIRD STREET SAN FRANCISCO CA 94158,0984531ace,04-06-2024 13:25:00,Chevrolet,basic
4,1,6022074,"YAQOOB,SAAD",FOR HIRE VEHICLE,05-04-2025,3U109JZC,2018-11-29,2018,(646)780-0129,1515 THIRD STREET SAN FRANCISCO CA 94158,1ee2538be7,04-06-2024 13:25:00,Tesla,high_end
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109579,1,6065873,"CHILI, ALI",FOR HIRE VEHICLE,22-02-2026,MP2PD4HA,2020-03-29,2020,(646)780-0129,1515 THIRD STREET SAN FRANCISCO CA 94158,74da19206a,04-06-2024 13:25:00,Tesla,high_end
109580,1,5855007,"TAHIR,MUJAHID,H",FOR HIRE VEHICLE,14-03-2026,IQCCBY2F,2022-01-27,2022,(646)780-0129,1515 THIRD STREET SAN FRANCISCO CA 94158,31f70351ee,04-06-2024 13:25:00,Bentley,high_end
109581,1,5831173,"GUJJAR,LIMO,CORP",FOR HIRE VEHICLE,22-11-2025,C1LCTG9E,2020-03-02,2020,(646)780-0129,1515 THIRD STREET SAN FRANCISCO CA 94158,0cdebc2455,04-06-2024 13:25:00,Audi,premium
109582,1,5887235,"SOW,MAMADOU,A",FOR HIRE VEHICLE,27-07-2024,AA2B6RR9,2018-09-13,2018,(646)780-0129,1515 THIRD STREET SAN FRANCISCO CA 94158,2dd64b79ce,04-06-2024 13:25:00,Lexus,premium


In [22]:
target_vehicles = vehicles_update[vehicles_update['vehicle_year'] < 2005][['vehicle_id', 'active', 'last_update_timestamp']]
target_vehicles

Unnamed: 0,vehicle_id,active,last_update_timestamp
5993,70296652d6,0,06-07-2025 00:00:00
13159,30b38297ca,0,06-07-2025 00:00:00
28553,96f631816d,0,06-07-2025 00:00:00
39021,f487245499,0,06-07-2025 00:00:00
64113,e36a77f267,0,06-07-2025 00:00:00
66404,26bdfba082,0,06-07-2025 00:00:00
68504,ee7f0b0aa0,0,06-07-2025 00:00:00
71916,3d436b95d0,0,06-07-2025 00:00:00
74692,f4a46a6bc1,0,06-07-2025 00:00:00
81179,91f1cf6877,0,06-07-2025 00:00:00


In [23]:
def generate_random_permit_license():
    letters = random.choices(string.ascii_uppercase, k=6)
    digits = random.choices(string.digits, k=2)
    combined = letters + digits
    random.shuffle(combined)  # agar urutannya acak
    return ''.join(combined)

In [24]:
# generate dummy vehicles (last_update_timestamp == '2025-07-04' or 'today')
exists_id = vehicles_update['vehicle_id'].values
exists_license = vehicles_update['vehicle_license_number'].values
exists_permit = vehicles_update['permit_license_number'].values

data = []

for i in range(50):
    new_id = generate_random_alphanumeric()
    new_license = ''.join(random.choices(string.digits, k=7))
    new_permit = generate_random_permit_license()
    new_cert_date = fake.date_between(start_date="-4y", end_date=datetime.today().date())
    while new_id in exists_id:
        new_id = generate_random_alphanumeric(10)
    while new_license in exists_license:
        new_license = ''.join(random.choices(string.digits, k=7))
    while new_permit in exists_permit:
        new_permit = generate_random_permit_license()
    
    data.append({
        'active':1,
        'vehicle_license_number':new_license,
        'registration_name':random.choice(vehicles_update['registration_name'].value_counts().index),
        'license_type':random.choice(vehicles_update['license_type'].value_counts().index),
        'expiration_date':(fake.date_between(start_date=(datetime.today() + timedelta(days=100)), end_date="+4y")).strftime('%d-%m-%Y'),
        'permit_license_number':new_permit,
        'certification_date':new_cert_date,
        'vehicle_year':new_cert_date.year,
        'base_telephone_number':fake.numerify(f"({random.randint(201, 999)})###-####"),
        'base_address':fake.street_address(),
        'vehicle_id':new_id,
        'last_update_timestamp':datetime.today().strftime('%d-%m-%Y %H:%M:%S'),
        'brand':random.choice(vehicles_update['brand'].value_counts().index),
        'vehicle_type':random.choice(vehicles_update['vehicle_type'].value_counts().index)
    })

vehicles_new = pd.DataFrame(data)

In [25]:
vehicles_new.head()

Unnamed: 0,active,vehicle_license_number,registration_name,license_type,expiration_date,permit_license_number,certification_date,vehicle_year,base_telephone_number,base_address,vehicle_id,last_update_timestamp,brand,vehicle_type
0,1,3592033,"GUERRERO,HECTOR,M",FOR HIRE VEHICLE,31-10-2028,R8TA3BMS,2024-05-15,2024,(705)964-6203,015 Phillips Key Suite 035,79l3v332e3,06-07-2025 07:17:14,Ferrari,high_end
1,1,2398329,"ISLAM,MD,MYNUL",FOR HIRE VEHICLE,17-04-2026,J6QSUX5L,2023-10-27,2023,(296)545-8260,47540 Aaron Greens,12gq4r6960,06-07-2025 07:17:14,Lamborghini,basic
2,1,8874812,"VALLEJOOLEA,L,A",FOR HIRE VEHICLE,26-01-2027,FJC40XSH,2022-11-26,2022,(416)840-9496,8836 Kim Mountain Suite 847,m3h528476b,06-07-2025 07:17:14,BMW,basic
3,1,5845323,"KIM,YOUNGHO",FOR HIRE VEHICLE,30-03-2029,UFXZO9M9,2022-12-16,2022,(582)885-2126,11058 Nicole Port,6815w699fo,06-07-2025 07:17:14,Lamborghini,premium
4,1,7429244,"PATUARY,MOHAMMAD,J",FOR HIRE VEHICLE,04-02-2026,CFAC14YD,2022-03-03,2022,(746)948-3503,66899 Ramirez Plains Suite 251,81t48h719h,06-07-2025 07:17:15,Ferrari,basic


In [26]:
def generate_random_rental_time():
    today = datetime.today().date()
    hour = random.randint(0, 21)       # 0–21
    minute = random.randint(0, 59)     # 0–59
    second = random.randint(0, 59)     # 0–59
    bounded_time = time(hour=hour, minute=minute, second=second)
    random_datetime_today = datetime.combine(today, bounded_time)
    start_date = random_datetime_today

    duration_minutes = random.randint(10, 120)
    end_date = start_date + timedelta(minutes=duration_minutes)
    start_date = start_date.strftime('%Y-%m-%d %H:%M:%S')
    end_date = end_date.strftime('%Y-%m-%d %H:%M:%S')
    return start_date, end_date

In [27]:
# transaction -> rental_end_time
exists_id = transactions['rental_id'].values
list_user_id = users_update['user_id'].values.tolist()  + (users_new['user_id']).values.tolist()
list_vehicle_id = vehicles_update['vehicle_id'].values.tolist()  + (vehicles_new['vehicle_id']).values.tolist()
list_location = locs_new['location_id'].values.tolist()

data = []

for i in range(1000):
    new_id = generate_random_alphanumeric()
    while new_id in exists_id:
        new_id = generate_random_alphanumeric(10)
    user_id = random.choice(list_user_id)
    vehicle_id = random.choice(list_vehicle_id)
    rent_time = generate_random_rental_time()
    pickup = random.choice(list_location)
    dropoff = random.choice(list_location)
    while dropoff == pickup:
        dropoff = random.choice(list_location)

    data.append({
        'rental_id':new_id,
        'user_id':user_id,
        'vehicle_id':vehicle_id,
        'rental_start_time': rent_time[0],
        'rental_end_time': rent_time[1],
        'pickup_location':pickup,
        'dropoff_location':dropoff,
        'total_amount':float(random.randint(0, 3600))
    }
    )

transactions_new = pd.DataFrame(data)

In [28]:
transactions_new

Unnamed: 0,rental_id,user_id,vehicle_id,rental_start_time,rental_end_time,pickup_location,dropoff_location,total_amount
0,l405i082k6,ce66e3a885,9aa7014596,2025-07-06 12:03:56,2025-07-06 12:35:56,7147,3555,1756.0
1,2l877pg883,b56055ca19,1bfc079bd4,2025-07-06 21:05:31,2025-07-06 21:39:31,6787,1885,3250.0
2,c5p51374y0,5e89eb1eb9,0adc7600a0,2025-07-06 18:29:12,2025-07-06 19:25:12,7156,4627,214.0
3,441s7zl609,5cb6e27dda,cb07e7de2f,2025-07-06 10:46:15,2025-07-06 11:05:15,6690,2958,2147.0
4,j5963174jb,b2d5fb734e,bcd03ffd12,2025-07-06 12:28:13,2025-07-06 14:27:13,9297,9633,3587.0
...,...,...,...,...,...,...,...,...
995,197y0f6d89,c8712806e4,b626c15389,2025-07-06 21:48:02,2025-07-06 22:30:02,9809,8147,2533.0
996,41yt8p1449,9693e31189,e28e0a2a10,2025-07-06 16:26:37,2025-07-06 16:36:37,9600,3067,3175.0
997,t31y10e255,2dcaa5bf7b,a82090f04d,2025-07-06 16:20:54,2025-07-06 18:12:54,2136,7385,270.0
998,1j434ik027,122fe1d164,b2b4b17c0b,2025-07-06 08:29:55,2025-07-06 10:12:55,7178,2081,591.0


### insert and update to MySQL

In [29]:
# query insert table
query_insert_table_n_users = generate_insert_into(users_new, 'users')
query_insert_table_n_vehicle = generate_insert_into(vehicles_new, 'vehicles')
query_insert_table_n_transaction = generate_insert_into(transactions_new, 'transactions')

In [30]:
# Load .env
load_dotenv()

# Read MySQL connection info
hostname = os.getenv("MYSQL_HOST")
port = int(os.getenv("MYSQL_PORT"))
username = os.getenv("MYSQL_USER")
password = os.getenv("MYSQL_PASSWORD")

try:
    conn = mysql.connector.connect(
        host=hostname,
        user=username,
        password=password,
        port=port
    )
    
    if conn.is_connected():
        print("[INFO] connect to mysql success")
        cursor = conn.cursor()
        cursor.execute("CREATE DATABASE IF NOT EXISTS rental_vehicle;")
        cursor.execute("USE rental_vehicle;")
        cursor.execute("SELECT DATABASE();")
        record = cursor.fetchone()
        print(f"[INFO] you're connected to {record[0]} database")

        batch_size = 500
        tables = {
            'users': users_new,
            'vehicles': vehicles_new,
            'transactions': transactions_new
        }
        insert_query = {
            'users': query_insert_table_n_users,
            'vehicles': query_insert_table_n_vehicle,
            'transactions': query_insert_table_n_transaction
        }
        
        for tbl in tables.keys():
            cursor.execute(f"SELECT COUNT(*) FROM {tbl};")
            count_data = cursor.fetchone()
            num_batches = len(tables[tbl]) // batch_size + 1
            for i in range(num_batches):
                start_idx = i * batch_size
                end_idx = ((i+1) * batch_size)
                batch_data = tables[tbl].iloc[start_idx:end_idx]
                batch_data_record = [tuple(row) for row in batch_data.to_numpy()]

                cursor.executemany(insert_query[tbl], batch_data_record)
                conn.commit()
                print(f"[INFO] insert batch {i+1}/{num_batches} success")

            cursor.execute(f"SELECT COUNT(*) FROM {tbl};")
            count_data_new = cursor.fetchone()
            print(f"[INFO] all {count_data_new[0]-count_data[0]} {tbl} data inserted successfully")

        update_tables = {
            'users': {
                'df': locals().get('target_users'),
                'query': "UPDATE users SET is_active = %s, update_at = %s WHERE user_id = %s",
                'columns': ['is_active', 'update_at', 'user_id']
            },
            'vehicles': {
                'df': locals().get('target_vehicles'),
                'query': "UPDATE vehicles SET active = %s, last_update_timestamp = %s WHERE vehicle_id = %s",
                'columns': ['active', 'last_update_timestamp', 'vehicle_id']
            },
        }

        for tbl_name, update_info in update_tables.items():
            df = update_info['df']
            if df is not None and not df.empty:
                print(f"[INFO] starting update for {tbl_name}")
                query = update_info['query']
                columns = update_info['columns']

                num_batches = len(df) // batch_size + 1
                for i in range(num_batches):
                    start_idx = i * batch_size
                    end_idx = (i+1) * batch_size
                    batch_data = df.iloc[start_idx:end_idx]

                    records = [tuple(row[col] for col in columns) for _, row in batch_data.iterrows()]

                    cursor.executemany(query, records)
                    conn.commit()
                    print(f"[INFO] update batch {i+1}/{num_batches} for {tbl_name} success")

except Error as e:
    print("Error while connecting to MySQL", e)

finally:
    if conn.is_connected():
        cursor.close()
        conn.close()
        print("MySQL connection is closed")


[INFO] connect to mysql success
[INFO] you're connected to rental_vehicle database
[INFO] insert batch 1/2 success
[INFO] insert batch 2/2 success
[INFO] all 500 users data inserted successfully
[INFO] insert batch 1/1 success
[INFO] all 50 vehicles data inserted successfully
[INFO] insert batch 1/3 success
[INFO] insert batch 2/3 success
[INFO] insert batch 3/3 success
[INFO] all 1000 transactions data inserted successfully
[INFO] starting update for users
[INFO] update batch 1/1 for users success
[INFO] starting update for vehicles
[INFO] update batch 1/1 for vehicles success
MySQL connection is closed


In [31]:
# Load .env
load_dotenv()

# Read MySQL connection info
hostname = os.getenv("MYSQL_HOST")
port = int(os.getenv("MYSQL_PORT"))
username = os.getenv("MYSQL_USER")
password = os.getenv("MYSQL_PASSWORD")

try:
    conn = mysql.connector.connect(
        host=hostname,
        user=username,
        password=password,
        port=port
    )
    
    if conn.is_connected():
        print("[INFO] connect to mysql success")
        cursor = conn.cursor()
        cursor.execute("CREATE DATABASE IF NOT EXISTS rental_vehicle;")
        cursor.execute("USE rental_vehicle;")
        cursor.execute("SELECT DATABASE();")
        record = cursor.fetchone()
        print(f"[INFO] you're connected to {record[0]} database")

        tbls = ['users', 'vehicles', 'transactions']
        for tbl in tbls:
            cursor.execute(f"SELECT COUNT(*) FROM {tbl};")
            count = cursor.fetchone()[0]
            print(f"{tbl} count: {count}")

except Error as e:
    print("Error while connecting to MySQL", e)

finally:
    if conn.is_connected():
        cursor.close()
        conn.close()
        print("MySQL connection is closed")


[INFO] connect to mysql success
[INFO] you're connected to rental_vehicle database
users count: 30500
vehicles count: 109634
transactions count: 21080
MySQL connection is closed


In [32]:
len(users)+500, len(vehicles)+50, len(transactions)+1000

(30500, 109634, 21080)

In [34]:
# Load .env
load_dotenv()

# Read MySQL connection info
hostname = os.getenv("MYSQL_HOST")
port = int(os.getenv("MYSQL_PORT"))
username = os.getenv("MYSQL_USER")
password = os.getenv("MYSQL_PASSWORD")

try:
    conn = mysql.connector.connect(
        host=hostname,
        user=username,
        password=password,
        port=port
    )
    
    if conn.is_connected():
        print("[INFO] connect to mysql success")
        cursor = conn.cursor()
        cursor.execute("CREATE DATABASE IF NOT EXISTS rental_vehicle;")
        cursor.execute("USE rental_vehicle;")
        cursor.execute("SELECT DATABASE();")
        record = cursor.fetchone()
        print(f"[INFO] you're connected to {record[0]} database")

        tbls = {
            "users": "SELECT COUNT(*) FROM users WHERE update_at >= '2025-07-06' AND update_at < '2025-07-07';",
            "vehicles": "SELECT COUNT(*) FROM vehicles WHERE STR_TO_DATE(last_update_timestamp, '%d-%m-%Y') >= '2025-07-06' AND STR_TO_DATE(last_update_timestamp, '%d-%m-%Y') < '2025-07-07';",
            "transactions": "SELECT COUNT(*) FROM transactions WHERE DATE(rental_end_time) >= '2025-07-06' AND DATE(rental_end_time) < '2025-07-07';"
        }
        for tbl in tbls.keys():
            cursor.execute(tbls[tbl])
            count = cursor.fetchone()
            print(f"{tbl} next process count: {count}")

except Error as e:
    print("Error while connecting to MySQL", e)

finally:
    if conn.is_connected():
        cursor.close()
        conn.close()
        print("MySQL connection is closed")


[INFO] connect to mysql success
[INFO] you're connected to rental_vehicle database
users next process count: (660,)
vehicles next process count: (65,)
transactions next process count: (1000,)
MySQL connection is closed
