In [108]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [109]:
# Seed both random number generators used in this notebook (the stdlib
# `random` module and NumPy's global generator) so that re-running the
# notebook from a fresh kernel regenerates the same data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [110]:
def random_dates(start_date, end_date, n=25000):
    """Return a list of ``n`` random datetimes starting from ``start_date``.

    Each value is ``start_date`` plus a random whole-day offset in
    ``[0, (end_date - start_date).days]`` (both ends inclusive) plus a random
    time of day (0-23 hours, 0-59 minutes, 0-59 seconds).

    Because the time of day is added on top of the day offset, a result may
    be later than ``end_date`` by up to 23:59:59, i.e. the last day of the
    window is fully included.

    Values are drawn from the stdlib ``random`` module's global generator,
    so call ``random.seed(...)`` beforehand for reproducible output.
    """
    span_days = (end_date - start_date).days
    sampled = []
    for _ in range(n):
        # Draw order (days, hours, minutes, seconds) is kept fixed so that a
        # given seed always yields the same sequence of datetimes.
        day_offset = random.randint(0, span_days)
        hour = random.randint(0, 23)
        minute = random.randint(0, 59)
        second = random.randint(0, 59)
        offset = timedelta(days=day_offset, hours=hour, minutes=minute, seconds=second)
        sampled.append(start_date + offset)
    return sampled

In [111]:
def generate_customer_ids(n_customers=50000):
    """Return ``n_customers`` random customer IDs as a NumPy array of strings.

    IDs are sampled *with replacement* from the integers 10000..99998
    (``np.arange(10000, 99999)`` excludes 99999), so the result can contain
    repeated IDs -- it is not a set of unique customers. The integers are
    converted to strings because the ID is a categorical label, not a
    quantity.

    Uses NumPy's global generator; call ``np.random.seed(...)`` beforehand
    for reproducible output.
    """
    id_pool = np.arange(10000, 99999)
    sampled_ids = np.random.choice(id_pool, size=n_customers, replace=True)
    return sampled_ids.astype(str)

In [112]:
def generate_prod_id(n_products=5000):
    """Return a list of ``n_products`` unique random product IDs.

    Each ID is a 6-character string drawn from digits and uppercase letters.
    Candidates that repeat an ID already produced are discarded and redrawn,
    so the returned list never contains duplicates. When no collision occurs
    the result is identical to drawing ``n_products`` IDs in a row, so seeded
    output is unchanged in that case.

    Uses the stdlib ``random`` module's global generator; call
    ``random.seed(...)`` beforehand for reproducible output.

    Raises:
        ValueError: if ``n_products`` exceeds the number of distinct IDs
            that exist (36 ** 6), which could never be satisfied.
    """
    alphabet = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    id_length = 6

    max_ids = len(alphabet) ** id_length
    if n_products > max_ids:
        raise ValueError(
            f"cannot generate {n_products} unique product IDs; "
            f"only {max_ids} distinct IDs exist"
        )

    seen = set()       # fast membership test
    product_ids = []   # preserves generation order
    while len(product_ids) < n_products:
        candidate = ''.join(random.choices(alphabet, k=id_length))
        if candidate in seen:
            # Collision: skip it so the catalogue really has n_products items.
            continue
        seen.add(candidate)
        product_ids.append(candidate)
    return product_ids

In [113]:
def generate_dataframe(rows=500000, n_products=5000,
                       start_date=datetime(2023, 1, 1),
                       end_date=datetime(2023, 11, 11),
                       min_price=11.0, max_price=524.6):
    """Build a synthetic sales DataFrame with one row per invoice line.

    Columns:
        invoice  -- consecutive invoice numbers starting at 100000, as strings
        product  -- product ID sampled (with replacement) from a catalogue of
                    ``n_products`` IDs produced by ``generate_prod_id``
        quantity -- integer in 1..10
        date     -- random datetime from ``random_dates(start_date, end_date)``
        customer -- customer ID string (IDs repeat across rows)
        price    -- unit price; every product has one fixed price, drawn
                    uniformly from [min_price, max_price) and rounded to 2
                    decimals

    Args:
        rows: number of rows to generate.
        n_products: size of the product catalogue to sample from.
        start_date: first day of the invoice date window.
        end_date: last day of the invoice date window.
        min_price: lower bound of the per-product price.
        max_price: upper bound of the per-product price.

    Raises:
        ValueError: if ``end_date`` precedes ``start_date`` or ``max_price``
            is below ``min_price``.

    The order of random draws is kept fixed so that, with the default
    arguments and the same seeds, the generated data does not change.
    """
    if end_date < start_date:
        raise ValueError("end_date must not be earlier than start_date")
    if max_price < min_price:
        raise ValueError("max_price must not be lower than min_price")

    # invoice number, product, and quantity
    invoice_numbers = np.arange(100000, 100000 + rows).astype(str)
    products = np.random.choice(generate_prod_id(n_products=n_products), size=rows, replace=True)
    quantities = np.random.randint(1, 11, size=rows)

    # customers: a pool of `rows` IDs (already containing repeats)
    customer_ids = generate_customer_ids(n_customers=rows)

    # random invoice timestamps inside the requested window
    invoice_dates = random_dates(start_date, end_date, n=rows)

    # one price per distinct product that actually appears in the data
    unique_products = np.unique(products)
    prices = np.random.uniform(min_price, max_price, size=len(unique_products)).round(2)
    product_prices = dict(zip(unique_products, prices))

    df = pd.DataFrame({
        'invoice': invoice_numbers,
        'product': products,
        'quantity': quantities,
        'date': invoice_dates,
        # NOTE(review): the pool is re-sampled here although it already has
        # `rows` entries; kept as-is so seeded output stays identical.
        'customer': np.random.choice(customer_ids, size=rows, replace=True),
        # look up each row's price from its product ID
        'price': pd.Series(products).map(product_prices)
    })

    return df

In [114]:
# Generate the synthetic dataset using the function's default arguments.
df = generate_dataframe()

# Preview the first rows.
df.head()

Unnamed: 0,invoice,product,quantity,date,customer,price
0,100000,GGOPG8,3,2023-06-13 18:58:00,85982,57.1
1,100001,J1YVL8,9,2023-03-10 21:24:08,97219,415.19
2,100002,B6UZ55,8,2023-11-08 14:24:05,20537,218.91
3,100003,EJY7B9,7,2023-03-03 12:24:08,41181,271.24
4,100004,9YBKSV,2,2023-10-04 12:27:00,77067,206.67


In [115]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,quantity,price
count,500000.0,500000.0
mean,5.494704,267.440456
std,2.871914,148.662243
min,1.0,11.34
25%,3.0,139.02
50%,5.0,268.29
75%,8.0,396.51
max,10.0,524.44


In [116]:
# Summary (count, unique, top, freq) for the object-dtype (string) columns.
df.describe(include='object')

Unnamed: 0,invoice,product,customer
count,500000,500000,500000
unique,500000,5000,87274
top,100000,4DTC02,54177
freq,1,134,28


In [117]:
def count_unique_values(df):
    """Return the number of distinct values in each column of ``df``.

    The result is whatever ``df.nunique()`` returns: a Series indexed by
    column name.
    """
    return df.nunique()

In [118]:
# Count the distinct values in every column of the generated dataset and display them.
unique_counts = count_unique_values(df)
unique_counts

invoice     500000
product       5000
quantity        10
date        495426
customer     87274
price         4785
dtype: int64

In [122]:
# Write the dataset to a CSV file (path relative to the working directory), without the index column.
df.to_csv('synthetic_data.csv', index=False)