In [4]:
import pandas as pd
from faker import Faker

# Seed Faker's shared RNG before creating the instance so that a fresh
# Restart & Run All regenerates the same synthetic rows. Note that
# application_date is drawn relative to the current date, so that column can
# still shift between runs on different days.
SEED = 42
Faker.seed(SEED)
fake = Faker()

# Number of synthetic loan applications to generate
num_rows = 500

# Bounds for loan_amount. The previous random_number(digits=6) call may return
# any integer with *up to* six digits, including 0, which is not a meaningful
# loan. The floor below is an assumption -- adjust to the desired range.
MIN_LOAN_AMOUNT = 1_000
MAX_LOAN_AMOUNT = 999_999

# Generate the data: one list of `num_rows` values per column
data = {
    "loan_id": range(1, num_rows + 1),
    "applicant_name": [fake.name() for _ in range(num_rows)],
    "applicant_age": [
        fake.random_int(min=20, max=60) for _ in range(num_rows)
    ],
    "loan_amount": [
        fake.random_int(min=MIN_LOAN_AMOUNT, max=MAX_LOAN_AMOUNT)
        for _ in range(num_rows)
    ],
    "loan_duration": [
        fake.random_int(min=12, max=60) for _ in range(num_rows)
    ],  # 12 to 60 months
    "interest_rate": [
        fake.pyfloat(
            left_digits=2,
            right_digits=2,
            positive=True,
            min_value=2.0,
            max_value=15.0,
        )
        for _ in range(num_rows)
    ],  # 2% to 15%
    "application_date": [fake.date_this_decade() for _ in range(num_rows)],
    "loan_type": [
        fake.random_element(elements=("Home", "Auto", "Personal"))
        for _ in range(num_rows)
    ],
}

# Convert to DataFrame
df_loans = pd.DataFrame(data)

# Display a sample of the data, then persist the full table next to the notebook
print(df_loans.head())
df_loans.to_csv("loans_dataset.csv", index=False)

   loan_id     applicant_name  applicant_age  loan_amount  loan_duration  \
0        1     Cynthia Gamble             39       368128             52   
1        2      Clinton Munoz             54       277424             55   
2        3   Felicia Matthews             54       411966             15   
3        4     Holly Browning             59       205487             36   
4        5  Felicia Carpenter             50       109983             29   

   interest_rate application_date loan_type  
0           7.90       2023-05-12      Auto  
1           4.60       2021-06-18      Home  
2          14.15       2022-08-26  Personal  
3           7.14       2020-03-16      Home  
4           2.00       2023-05-03  Personal  


In [3]:
from faker import Faker
from faker.providers import DynamicProvider

# Vocabulary the custom provider draws its values from
profession_choices = ["dr.", "doctor", "nurse", "surgeon", "clerk"]

fake = Faker()

# Register a dynamic provider on the faker instance; `provider_name` is the
# method name that becomes callable on `fake` afterwards.
medical_professions_provider = DynamicProvider(
    elements=profession_choices,
    provider_name="medical_profession",
)
fake.add_provider(medical_professions_provider)

# Last expression of the cell: shows one value produced by the new provider
fake.medical_profession()

'clerk'

In [4]:
from faker import Faker
import random
import pandas as pd
from datetime import datetime, timedelta

# Seed both random sources used below (stdlib `random` for the ids, Faker for
# the timestamps) so the generated rows are repeatable on a fresh kernel.
# start_ts is drawn from the current month, so dates still vary by run month.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)
fake = Faker()

# Number of records to generate
N = 1000

# Timestamp layout shared by start_ts and end_ts
TS_FORMAT = "%Y-%m-%dT%H:%M:%S"

# Destination of the generated fixture (relative to the notebook's directory;
# the target directory must already exist)
OUTPUT_CSV = "../sql-unit-test-generator/sqlmesh_example/test_data_orders.csv"

# Generate data
data = {
    "id": list(range(1, N + 1)),
    "customer_id": [
        random.randint(1, 100) for _ in range(N)
    ],  # Assuming 100 different customers
    "waiter_id": [
        random.randint(1, 10) for _ in range(N)
    ],  # Assuming 10 different waiters
    "start_ts": [],
    "end_ts": [],
    "di": [],
}

# Generating random timestamps for start_ts and end_ts, and converting them to
# di (date of order in '%Y%m%d' format, stored as an int)
for _ in range(N):
    start = fake.date_time_this_month()
    # Each order lasts between 1 minute and 24 hours (1440 minutes)
    end = start + timedelta(minutes=fake.random_int(min=1, max=1440))
    data["start_ts"].append(start.strftime(TS_FORMAT))
    data["end_ts"].append(end.strftime(TS_FORMAT))
    # di is derived from the start timestamp only, even if the order ends on
    # the following day
    data["di"].append(int(start.strftime("%Y%m%d")))

# Convert to DataFrame
df = pd.DataFrame(data)

print(df.head())
df.to_csv(
    OUTPUT_CSV,
    index=False,
)

   id  customer_id  waiter_id             start_ts               end_ts  \
0   1           99          3  2023-10-19T10:42:15  2023-10-20T06:12:15   
1   2           14          5  2023-10-11T23:56:38  2023-10-12T00:49:38   
2   3           96          8  2023-10-17T21:04:39  2023-10-18T05:02:39   
3   4           61          5  2023-10-06T06:17:55  2023-10-07T04:08:55   
4   5           21          6  2023-10-17T07:56:00  2023-10-17T16:01:00   

         di  
0  20231019  
1  20231011  
2  20231017  
3  20231006  
4  20231017  
