In [1]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
    

In [5]:

!pip install pandas pyarrow numpy


Collecting pyarrow
  Downloading pyarrow-22.0.0-cp311-cp311-win_amd64.whl.metadata (3.3 kB)
Downloading pyarrow-22.0.0-cp311-cp311-win_amd64.whl (28.1 MB)
   ---------------------------------------- 0.0/28.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/28.1 MB ? eta -:--:--
   ---- ----------------------------------- 2.9/28.1 MB 15.2 MB/s eta 0:00:02
   ------- -------------------------------- 5.5/28.1 MB 14.0 MB/s eta 0:00:02
   ------------- -------------------------- 9.4/28.1 MB 15.4 MB/s eta 0:00:02
   ---------------- ----------------------- 11.8/28.1 MB 14.8 MB/s eta 0:00:02
   ---------------------- ----------------- 15.7/28.1 MB 15.5 MB/s eta 0:00:01
   ------------------------ --------------- 17.6/28.1 MB 14.4 MB/s eta 0:00:01
   --------------------------- ------------ 19.7/28.1 MB 13.5 MB/s eta 0:00:01
   ------------------------------- -------- 22.0/28.1 MB 13.3 MB/s eta 0:00:01
   ----------------------------------- ---- 24.6/28.1 MB 13.2 MB/s eta 0:00:

In [10]:
# Output folder for every generated Parquet file; a no-op when it already exists.
os.makedirs("dataset_creados", exist_ok=True)

In [4]:
# Seed NumPy's global random state; the np.random.* draws in the generation
# cell are only repeatable when this runs before them.
np.random.seed(seed=42)

In [2]:

# Reference window for every generated date: the 24 months ending today.
# normalize() truncates to midnight so the date columns derived from this
# window (rate_date, invoice_date, due_date, payment_date) hold calendar dates
# instead of carrying the time of day at which the cell happened to run.
hoy = pd.Timestamp.today().normalize()
inicio = hoy - pd.DateOffset(months=24)

# Currency catalogue: two rows, with MXN flagged as the base currency.
currency_rows = [
    {"id": 1, "code": "MXN", "is_base_currency": True},
    {"id": 2, "code": "USD", "is_base_currency": False},
]
currency = pd.DataFrame(currency_rows)
currency.to_parquet("dataset_creados/Currency.parquet")

# Partner master data: 30 customers followed by 10 suppliers (ids 1..40),
# each with a randomly drawn country code.
customer_names = [f"Cliente_{i}" for i in range(1, 31)]
supplier_names = [f"Proveedor_{i}" for i in range(1, 11)]
partner_types = ["customer"] * len(customer_names) + ["supplier"] * len(supplier_names)

partner = pd.DataFrame({
    "id": range(1, 41),
    "partner_type": partner_types,
    "display_name": customer_names + supplier_names,
    "country_code": np.random.choice(["MX", "US", "PE"], 40),
})
partner.to_parquet("dataset_creados/Partner.parquet")

# Product catalogue: 10 products with random list price and unit cost.
# Prices are drawn before costs, matching the order of the random draws
# in the table definition.
product_ids = range(1, 11)
default_prices = np.random.randint(200, 2000, 10)
unit_costs = np.random.randint(100, 1200, 10)

product = pd.DataFrame({
    "id": product_ids,
    "product_name": [f"Producto_{i}" for i in product_ids],
    "default_price": default_prices,
    "cost_per_unit": unit_costs,
})
product.to_parquet("dataset_creados/Product.parquet")

# Daily exchange rates for currency_id 2 over the whole window:
# one normally distributed rate (mean 17.5, std 0.8) per day, rounded to 2 decimals.
fechas_fx = pd.date_range(start=inicio, end=hoy, freq="D")
n_fx_days = len(fechas_fx)
fx_rates = np.random.normal(17.5, 0.8, n_fx_days).round(2)

exchange = pd.DataFrame({
    "id": range(1, n_fx_days + 1),
    "currency_id": [2] * n_fx_days,
    "rate_date": fechas_fx,
    "rate_to_base": fx_rates,
})
exchange.to_parquet("dataset_creados/ExchangeRate.parquet")

# Invoice headers: 300 invoices dated at random inside the window, each with a
# random partner, currency and direction ("out" / "in").
n_invoices = 300
invoice_calendar = pd.date_range(inicio, hoy)
invoice_dates = pd.to_datetime(np.random.choice(invoice_calendar, n_invoices))

invoice = pd.DataFrame({
    "id": range(1, n_invoices + 1),
    "partner_id": np.random.choice(partner["id"], n_invoices),
    "currency_id": np.random.choice([1, 2], n_invoices),
    "invoice_type": np.random.choice(["out", "in"], n_invoices),
    "invoice_date": invoice_dates,
})

# Due date = invoice date plus a random term of 15..59 days (upper bound exclusive).
payment_terms = pd.to_timedelta(np.random.randint(15, 60, n_invoices), unit="D")
invoice["due_date"] = invoice["invoice_date"] + payment_terms

# Invoice lines: every invoice gets 1..3 lines, each with a random product,
# that product's list price and a random quantity of 1..9.
# The random draws happen in the order: line count, product, quantity.
price_by_product = product.set_index("id")["default_price"]

lines = []
for inv_id in invoice["id"]:
    n_lines = np.random.randint(1, 4)
    for _ in range(n_lines):
        p = np.random.choice(product["id"])
        price = price_by_product.at[p]
        quantity = np.random.randint(1, 10)
        lines.append(
            {"invoice_id": inv_id, "product_id": p, "quantity": quantity, "price": price}
        )

invoice_line = pd.DataFrame(lines)
# Surrogate key assigned after the fact, 1..N in insertion order.
invoice_line["id"] = range(1, len(invoice_line) + 1)
invoice_line.to_parquet("dataset_creados/InvoiceLine.parquet")

# Invoice total = sum of quantity * price over the invoice's lines.
line_totals = invoice_line["quantity"] * invoice_line["price"]
totales = (
    line_totals
    .groupby(invoice_line["invoice_id"])
    .sum()
    .rename("total_line")
    .reset_index()
)

# Attach the total to the header as "total_amount" and drop the join key
# that the merge brought along.
invoice = (
    invoice
    .merge(totales, left_on="id", right_on="invoice_id", how="left")
    .rename(columns={"total_line": "total_amount"})
    .drop(columns=["invoice_id"])
)

# Payments: 200 payments dated at random inside the window, with a random
# currency, an amount of 200..4999 and a random payment method.
n_payments = 200
payment_calendar = pd.date_range(inicio, hoy)
payment_dates = pd.to_datetime(np.random.choice(payment_calendar, n_payments))

payment = pd.DataFrame({
    "id": range(1, n_payments + 1),
    "currency_id": np.random.choice([1, 2], n_payments),
    "payment_date": payment_dates,
    "amount": np.random.randint(200, 5000, n_payments),
    "method": np.random.choice(["transfer", "cash", "card"], n_payments),
})
payment.to_parquet("dataset_creados/Payment.parquet")

# Payment applications: each payment is applied to one randomly chosen invoice.
# The allocated amount is drawn from [50, min(payment amount, invoice total)),
# so a single application never covers the full payment or the full invoice.
total_by_invoice = invoice.set_index("id")["total_amount"]

apps = []
for payment_id, payment_amount in zip(payment["id"], payment["amount"]):
    inv_id = np.random.choice(invoice["id"])
    upper_bound = min(payment_amount, total_by_invoice.at[inv_id])
    apps.append({
        "payment_id": payment_id,
        "invoice_id": inv_id,
        "allocated_amount": np.random.randint(50, upper_bound),
    })

payment_app = pd.DataFrame(apps)
# Surrogate key assigned after the fact, 1..N in insertion order.
payment_app["id"] = range(1, len(payment_app) + 1)
payment_app.to_parquet("dataset_creados/PaymentApplication.parquet")

# Total allocated per invoice across all payment applications.
pagos = (payment_app.groupby("invoice_id")["allocated_amount"]
         .sum()
         .reset_index()
         .rename(columns={"allocated_amount": "amount_paid"}))

# Left-join so invoices without any payment are kept. The join key coming from
# `pagos` is dropped afterwards (as is done after the totals merge); otherwise
# a redundant, partly-NaN "invoice_id" column would be written to Invoice.parquet.
invoice = (
    invoice
    .merge(pagos, left_on="id", right_on="invoice_id", how="left")
    .drop(columns=["invoice_id"])
)
# Invoices with no payment application have nothing paid.
invoice["amount_paid"] = invoice["amount_paid"].fillna(0)
invoice["remaining_balance"] = invoice["total_amount"] - invoice["amount_paid"]

# Classify each invoice by how much of it has been settled.
# "paid" uses <= 0 rather than == 0: several payments can land on the same
# invoice and over-allocate it, and a negative balance must not read as "partial".
is_settled = invoice["remaining_balance"] <= 0
is_untouched = invoice["amount_paid"] == 0
invoice["payment_state"] = np.select(
    [is_settled, is_untouched],
    ["paid", "unpaid"],
    default="partial",
)

# Seed two data-quality errors in the first rows:
#   * invoice 0 gets a due date 5 days before its invoice date;
#   * payment application 0 is allocated 100 more than the invoice-0 total.
invoice.loc[0, "due_date"] = invoice.loc[0, "invoice_date"] - pd.Timedelta(days=5)
payment_app.loc[0, "allocated_amount"] = invoice.loc[0, "total_amount"] + 100
# PaymentApplication.parquet was already written before this point, so write it
# again; otherwise the over-allocation above would exist only in memory.
payment_app.to_parquet("dataset_creados/PaymentApplication.parquet")

# Journal: 100 entries, each made of a debit on account 1010 and a credit of
# the same random amount (500..4999) on account 4000.
entries = pd.DataFrame({"entry_id": range(1, 101)})

lines = []
for entry_id in entries["entry_id"]:
    amount = np.random.randint(500, 5000)
    debit_leg = {"entry_id": entry_id, "account_code": "1010", "debit": amount, "credit": 0}
    credit_leg = {"entry_id": entry_id, "account_code": "4000", "debit": 0, "credit": amount}
    lines.extend([debit_leg, credit_leg])

# Extra one-sided debit on entry 1 with no matching credit, which leaves that
# entry unbalanced (presumably a deliberately seeded error; confirm with the author).
lines.append({"entry_id": 1, "account_code": "9999", "debit": 5000, "credit": 0})

journal = pd.DataFrame(lines)
journal.to_parquet("dataset_creados/JournalLine.parquet")

# Invoice headers are written last, after totals, payment columns and the
# injected due-date error have been applied.
invoice.to_parquet("dataset_creados/Invoice.parquet")





INSERCIÓN DE ERRORES

In [None]:
# Seed known errors into the in-memory frames (nothing is written to disk here):
#   * invoice 0: due date 5 days before the invoice date
#   * payment application 0: implausibly large allocated amount
#   * journal line 0: credit set to debit - 100
backdated_due = invoice.at[0, "invoice_date"] - pd.Timedelta(days=5)
invoice.at[0, "due_date"] = backdated_due
payment_app.at[0, "allocated_amount"] = 999999

journal.at[0, "credit"] = journal.at[0, "debit"] - 100

# error correction
print()


In [6]:
# Plain-text preview of the first five payment applications.
print(payment_app.head(5))

   payment_id  invoice_id  allocated_amount  id
0           1           2            999999   1
1           2         230               905   2
2           3         190              2319   3
3           4         163              3261   4
4           5         132              3095   5
