In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so every run reproduces the same synthetic data.
np.random.seed(42)

# Medium dataset: row counts for the customer and transaction tables.
num_customers, num_transactions = 800, 8000

# 1. Customers: one row per ID, numbered 1..num_customers.
customer_ids = np.arange(1, num_customers + 1)

# Keyword arguments are evaluated left to right, so the random draws happen in
# column order (age, gender, city, account_type, tenure_years). Changing that
# order would change the generated values for the fixed seed.
customers = pd.DataFrame({"customer_id": customer_ids}).assign(
    age=np.random.randint(18, 65, size=num_customers),  # upper bound is exclusive: 18..64
    gender=np.random.choice(["Male", "Female"], size=num_customers),
    city=np.random.choice(
        ["Bangalore", "Mumbai", "Delhi", "Chennai", "Hyderabad", "Kolkata", "Pune"],
        size=num_customers,
    ),
    account_type=np.random.choice(["Savings", "Current", "Salary"], size=num_customers),
    tenure_years=np.random.randint(0, 15, size=num_customers),  # 0..14, drawn independently of age
)

# 2. Products: one row per customer describing which products they hold.
# Columns are drawn in the order credit_card, loan_amount, insurance; keep that
# order so the seeded random stream yields the same values.
products = pd.DataFrame(
    dict(
        customer_id=customer_ids,
        # 55% of customers hold a credit card.
        credit_card=np.random.choice(["Y", "N"], size=num_customers, p=[0.55, 0.45]),
        # Loan amount is one of five fixed tiers; half of the customers have no loan (0).
        loan_amount=np.random.choice(
            [0, 50000, 100000, 150000, 200000],
            size=num_customers,
            p=[0.50, 0.20, 0.15, 0.10, 0.05],
        ),
        # 30% of customers hold insurance.
        insurance=np.random.choice(["Y", "N"], size=num_customers, p=[0.30, 0.70]),
    )
)

# 3. Transactions: IDs run 1..num_transactions; every other column is random.
txn_ids = np.arange(1, num_transactions + 1)

# Draw order (customer_id, txn_date, amount, category) matches the column order
# and must stay fixed for the seeded output to be reproducible.
transactions = pd.DataFrame({"txn_id": txn_ids}).assign(
    # Customers are sampled with replacement, so some may have no transactions.
    customer_id=np.random.choice(customer_ids, size=num_transactions),
    # Pick a date uniformly from the range 2024-01-01 .. 2024-03-31.
    txn_date=pd.to_datetime(
        np.random.choice(
            pd.date_range(start="2024-01-01", end="2024-03-31"),
            size=num_transactions,
        )
    ),
    amount=np.random.randint(200, 25000, size=num_transactions),  # upper bound exclusive: 200..24999
    category=np.random.choice(
        ["POS", "Online", "ATM"],
        size=num_transactions,
        p=[0.50, 0.40, 0.10],
    ),
)

# Save each table as a CSV in the working directory. index=False keeps the
# pandas row index out of the files, so they contain only the data columns.
customers.to_csv("customers_medium.csv", index=False)
products.to_csv("products_medium.csv", index=False)
transactions.to_csv("transactions_medium.csv", index=False)

# Preview the first rows of every table. display() (provided by the IPython
# kernel in a notebook) renders each frame as its own table; a bare tuple of
# frames would instead be shown as a single plain-text tuple repr.
display(customers.head(), products.head(), transactions.head())


(   customer_id  age  gender       city account_type  tenure_years
 0            1   56    Male    Kolkata      Savings             6
 1            2   46  Female  Hyderabad      Current             8
 2            3   32  Female  Bangalore       Salary             9
 3            4   60    Male     Mumbai       Salary             4
 4            5   25  Female    Chennai      Current             8,
    customer_id credit_card  loan_amount insurance
 0            1           Y        50000         N
 1            2           N        50000         N
 2            3           Y            0         Y
 3            4           N            0         N
 4            5           N       100000         N,
    txn_id  customer_id   txn_date  amount category
 0       1          611 2024-01-27    5158   Online
 1       2          574 2024-01-28    4025      POS
 2       3          286 2024-03-22   17800      POS
 3       4          534 2024-03-20   15000      POS
 4       5          459 2024-0

In [2]:
# Colab-only convenience: push the generated CSVs to the browser as downloads.
# The import is guarded so the notebook still runs top to bottom on a kernel
# without google.colab (for example local Jupyter), where the files are simply
# left in the working directory.
try:
    from google.colab import files
except ImportError:
    files = None  # not running inside Google Colab

if files is None:
    print("google.colab is not available; the CSV files remain in the working directory.")
else:
    # One download per file written by the generation cell.
    for csv_name in ("customers_medium.csv", "products_medium.csv", "transactions_medium.csv"):
        files.download(csv_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>