In [0]:
# monthly_html
import os
import hashlib
from datetime import datetime, date

import pandas as pd
from pyspark.sql import functions as F, types as T

# ---- widgets ----
# Notebook parameters as (name, default) pairs, registered in this order.
# month_filter is optional, e.g. "2024-02-01" (YYYY-MM-01); when left empty
# no month filter is applied below.
for _widget_name, _widget_default in (
    ("output_base_path", "abfss://output@pvcstor.dfs.core.windows.net/lti_upload/contracts"),
    ("doc_type", "CONTRACT_STATEMENT_MONTHLY_HTML"),
    ("month_filter", ""),
):
    dbutils.widgets.text(_widget_name, _widget_default)

# Trailing slashes are dropped so paths can be joined with "/" later on.
OUTPUT_BASE = dbutils.widgets.get("output_base_path").rstrip("/")
DOC_TYPE = dbutils.widgets.get("doc_type")
MONTH_FILTER = dbutils.widgets.get("month_filter").strip()

for _label, _shown in (
    ("OUTPUT_BASE:", OUTPUT_BASE),
    ("DOC_TYPE:", DOC_TYPE),
    ("MONTH_FILTER:", MONTH_FILTER or "(none)"),
):
    print(_label, _shown)

# ---- source: contract-months to generate ----
# One summary row per contract-month; the metric columns feed the HTML header.
summary_columns = [
    "customer_id", "contract_id", "month", "currency",
    "total_drawn", "total_repaid", "net_movement", "closing_balance",
    "facilities", "mismatched_rows",
]
summary = spark.table("tp_finance.gold.fact_contract_monthly_summary").select(*summary_columns)

if MONTH_FILTER:
    # Keep only the requested month.
    wanted_month = F.to_date(F.lit(MONTH_FILTER))
    summary = summary.where(F.col("month") == wanted_month)

# Every target row is pulled to the driver; the generation loop iterates over them.
ordered_summary = summary.orderBy("customer_id", "contract_id", "month")
targets = ordered_summary.collect()
print(f"Targets to generate: {len(targets)}")

# Source of the per-facility statement lines.
lines_tbl = "tp_finance.gold.v_contract_statement_lines"

def month_end(m: date) -> date:
    """Return the last calendar day of the month that contains *m*."""
    # Find the first day of the following month (December rolls over into
    # January of the next year), then step back exactly one day.
    if m.month == 12:
        next_year, next_month = m.year + 1, 1
    else:
        next_year, next_month = m.year, m.month + 1
    first_of_next = date(next_year, next_month, 1)
    return date.fromordinal(first_of_next.toordinal() - 1)

def sha256_text(s: str) -> str:
    """Return the hex SHA-256 digest of *s* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()

def html_escape(x: str) -> str:
    """Escape the five HTML-special characters ``& < > " '`` in *x*."""
    # A single translate pass maps each source character exactly once, so an
    # ampersand produced by one replacement is never escaped a second time.
    replacements = str.maketrans({
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&#39;",
    })
    return x.translate(replacements)

def render_html(customer_id: str, contract_id: str, m: date, currency: str,
                header_metrics: dict, lines_pdf: pd.DataFrame) -> str:
    """Render one contract-month statement as a standalone HTML document.

    Args:
        customer_id: Customer identifier shown in the header block.
        contract_id: Contract identifier shown in the title and header block.
        m: Statement period start; the period end is ``month_end(m)``.
        currency: Currency label; ``None`` renders as an empty string.
        header_metrics: Summary values looked up by the keys ``total_drawn``,
            ``total_repaid``, ``net_movement``, ``closing_balance``,
            ``facilities`` and ``mismatched_rows``. Values are inserted as
            given (not HTML-escaped); a truthy ``mismatched_rows`` adds the
            ``warn`` CSS class to its pill.
        lines_pdf: One row per table line. The columns read are
            ``facility_id``, ``opening_balance``, ``drawn_this_month``,
            ``repaid_this_month``, ``closing_balance``, ``balance_source``,
            ``is_mismatch`` and ``balance_diff``; a missing column renders
            as an empty cell.

    Returns:
        The complete HTML document as a string. The footer embeds the
        current UTC time, so two calls never return identical text.
    """
    def fmt_num(v):
        # Null amounts render as an empty cell rather than "nan".
        if pd.isna(v):
            return ""
        try:
            return f"{float(v):,.2f}"
        except (TypeError, ValueError, OverflowError):
            # Not convertible to float: show the raw value instead of failing.
            return str(v)

    def fmt_text(v) -> str:
        # Null cells (None / NaN / NA) would otherwise print as "None" or
        # "nan"; str() also keeps html_escape from failing on non-strings.
        return "" if pd.isna(v) else html_escape(str(v))

    def fmt_flag(v) -> str:
        # bool(float("nan")) is True and bool(pd.NA) raises, so a null flag
        # must be ruled out before the truthiness test.
        return "Y" if (not pd.isna(v) and bool(v)) else ""

    # Prepare table rows
    rows_html = []
    for _, r in lines_pdf.iterrows():
        rows_html.append(f"""
          <tr>
            <td>{fmt_text(r.get("facility_id"))}</td>
            <td class="num">{fmt_num(r.get("opening_balance"))}</td>
            <td class="num">{fmt_num(r.get("drawn_this_month"))}</td>
            <td class="num">{fmt_num(r.get("repaid_this_month"))}</td>
            <td class="num">{fmt_num(r.get("closing_balance"))}</td>
            <td>{fmt_text(r.get("balance_source"))}</td>
            <td>{fmt_flag(r.get("is_mismatch"))}</td>
            <td class="num">{fmt_num(r.get("balance_diff"))}</td>
          </tr>
        """)

    period_start = m
    period_end = month_end(m)

    # Escape the identifiers once; both are used in more than one place.
    customer_html = fmt_text(customer_id)
    contract_html = fmt_text(contract_id)

    # Simple clean styling (Power BI / finance friendly)
    html = f"""<!doctype html>
<html>
<head>
  <meta charset="utf-8" />
  <title>Contract Statement - {contract_html} - {m}</title>
  <style>
    body {{ font-family: Arial, sans-serif; font-size: 12px; color: #111; }}
    .container {{ max-width: 1000px; margin: 24px auto; }}
    h1 {{ font-size: 20px; margin: 0 0 8px 0; }}
    .meta {{ margin: 8px 0 16px 0; }}
    .meta div {{ margin: 2px 0; }}
    table {{ border-collapse: collapse; width: 100%; margin-top: 12px; }}
    th, td {{ border: 1px solid #ddd; padding: 6px 8px; }}
    th {{ background: #f3f3f3; text-align: left; }}
    td.num {{ text-align: right; font-variant-numeric: tabular-nums; }}
    .summary {{ display: grid; grid-template-columns: 1fr 1fr; gap: 6px 24px; margin-top: 10px; }}
    .pill {{ display: inline-block; padding: 2px 8px; border-radius: 999px; background: #f3f3f3; }}
    .warn {{ background: #ffe8e8; }}
    .footer {{ margin-top: 18px; color: #666; font-size: 11px; }}
  </style>
</head>
<body>
  <div class="container">
    <h1>Contract Statement (Monthly)</h1>

    <div class="meta">
      <div><b>Customer:</b> {customer_html}</div>
      <div><b>Contract:</b> {contract_html}</div>
      <div><b>Statement period:</b> {period_start} to {period_end}</div>
      <div><b>Currency:</b> {html_escape(currency or "")}</div>
    </div>

    <div class="summary">
      <div><b>Total drawn:</b> <span class="pill">{header_metrics.get("total_drawn","")}</span></div>
      <div><b>Total repaid:</b> <span class="pill">{header_metrics.get("total_repaid","")}</span></div>
      <div><b>Net movement:</b> <span class="pill">{header_metrics.get("net_movement","")}</span></div>
      <div><b>Closing balance:</b> <span class="pill">{header_metrics.get("closing_balance","")}</span></div>
      <div><b>Facilities:</b> <span class="pill">{header_metrics.get("facilities","")}</span></div>
      <div><b>Mismatched rows:</b> <span class="pill {'warn' if header_metrics.get('mismatched_rows',0) else ''}">{header_metrics.get("mismatched_rows","")}</span></div>
    </div>

    <table>
      <thead>
        <tr>
          <th>Facility</th>
          <th>Opening</th>
          <th>Drawn</th>
          <th>Repaid</th>
          <th>Closing</th>
          <th>Source</th>
          <th>Mismatch</th>
          <th>Diff</th>
        </tr>
      </thead>
      <tbody>
        {''.join(rows_html)}
      </tbody>
    </table>

    <div class="footer">
      Generated: {datetime.utcnow().isoformat()}Z
    </div>
  </div>
</body>
</html>
"""
    return html

# ---- generate ----
manifest_rows = []
now_ts = datetime.utcnow()  # one timestamp shared by every manifest row of this run


def _fmt_amount(value):
    """Format a summary amount with thousands separators, or "" when it is null."""
    return "" if value is None else f"{float(value):,.2f}"


for t in targets:
    customer_id, contract_id = t["customer_id"], t["contract_id"]
    m, currency = t["month"], t["currency"]

    # One file per contract and month: <base>/<contract_id>/statement_YYYY-MM.html
    ym = f"{m.year:04d}-{m.month:02d}"
    out_dir = f"{OUTPUT_BASE}/{contract_id}"
    out_path = f"{out_dir}/statement_{ym}.html"

    try:
        # pull statement lines for this customer / contract / month
        line_filter = (
            (F.col("customer_id") == customer_id)
            & (F.col("contract_id") == contract_id)
            & (F.col("month") == F.lit(m))
        )
        lines_pdf = (
            spark.table(lines_tbl)
                 .where(line_filter)
                 .orderBy("facility_id")
                 .toPandas()
        )
        if lines_pdf.empty:
            raise ValueError("No statement lines found for contract/month")

        # Header values are pre-formatted here; render_html inserts them as given.
        header_metrics = {
            key: _fmt_amount(t[key])
            for key in ("total_drawn", "total_repaid", "net_movement", "closing_balance")
        }
        header_metrics["facilities"] = "" if t["facilities"] is None else int(t["facilities"])
        header_metrics["mismatched_rows"] = 0 if t["mismatched_rows"] is None else int(t["mismatched_rows"])

        html = render_html(customer_id, contract_id, m, currency, header_metrics, lines_pdf)

        dbutils.fs.mkdirs(out_dir)
        dbutils.fs.put(out_path, html, True)  # True = overwrite an existing file

        # (file_sha256, status, error)
        outcome = (sha256_text(html), "SUCCESS", None)
    except Exception as exc:
        # A failing contract-month is recorded in the manifest instead of
        # aborting the whole run; the error text is capped at 2000 characters.
        outcome = (None, "FAILED", str(exc)[:2000])

    manifest_rows.append(
        (customer_id, contract_id, DOC_TYPE, m, month_end(m), out_path) + outcome + (now_ts,)
    )

# ---- write manifest ----
# Column order must match the tuples appended to manifest_rows above.
schema = T.StructType([
    T.StructField(field_name, field_type)
    for field_name, field_type in (
        ("customer_id", T.StringType()),
        ("contract_id", T.StringType()),
        ("doc_type", T.StringType()),
        ("period_start", T.DateType()),
        ("period_end", T.DateType()),
        ("output_path", T.StringType()),
        ("file_sha256", T.StringType()),
        ("status", T.StringType()),
        ("error", T.StringType()),
        ("generated_ts", T.TimestampType()),
    )
])

manifest_df = spark.createDataFrame(manifest_rows, schema=schema)
(
    manifest_df.write
    .mode("append")
    .saveAsTable("tp_finance.audit.generated_documents")
)

display(manifest_df)
print("Done.")

In [0]:
# NEEDS a proper cluster to run - archived for now
import os
import hashlib
from datetime import datetime, date

import pandas as pd
from pyspark.sql import functions as F, types as T

# ---- widgets ----
# Notebook parameters as (name, default) pairs, registered in this order.
# month_filter is optional, e.g. "2024-02-01" (YYYY-MM-01); when empty no
# month filter is applied below.
# NOTE(review): the HTML cell of this notebook registers widgets with the same
# three names but different defaults - confirm which values are in effect when
# both cells run in one session.
for _widget_name, _widget_default in (
    ("output_base_path", "abfss://output@pvcstor.dfs.core.windows.net/lti_upload/contracts"),
    ("doc_type", "CONTRACT_STATEMENT_MONTHLY"),
    ("month_filter", "2024-02-01"),
):
    dbutils.widgets.text(_widget_name, _widget_default)

# Trailing slashes are dropped so paths can be joined with "/" later on.
OUTPUT_BASE = dbutils.widgets.get("output_base_path").rstrip("/")
DOC_TYPE = dbutils.widgets.get("doc_type")
MONTH_FILTER = dbutils.widgets.get("month_filter").strip()

for _label, _shown in (
    ("OUTPUT_BASE:", OUTPUT_BASE),
    ("DOC_TYPE:", DOC_TYPE),
    ("MONTH_FILTER:", MONTH_FILTER or "(none)"),
):
    print(_label, _shown)

# ---- source: contract-months to generate ----
# Only the identifying columns are selected; the PDF has no summary header metrics.
summary = spark.table("tp_finance.gold.fact_contract_monthly_summary").select(
    *["customer_id", "contract_id", "month", "currency"]
)

if MONTH_FILTER:
    # Keep only the requested month.
    wanted_month = F.to_date(F.lit(MONTH_FILTER))
    summary = summary.where(F.col("month") == wanted_month)

# Every target row is pulled to the driver; the generation loop iterates over them.
ordered_summary = summary.orderBy("customer_id", "contract_id", "month")
targets = ordered_summary.collect()
print(f"Targets to generate: {len(targets)}")

# ---- statement lines source ----
lines_tbl = "tp_finance.gold.v_contract_statement_lines"  # uses facility-month lines

# ---- PDF generator (reportlab) ----
def render_pdf_contract_month(pdf_path_local: str, header: dict, lines_pdf: pd.DataFrame) -> None:
    """Write a contract-month statement PDF to *pdf_path_local* using reportlab.

    The document is a title, four header lines and a single table with one
    row per entry of *lines_pdf*.

    Args:
        pdf_path_local: Local filesystem path of the PDF to create.
        header: Must contain ``customer_id``, ``contract_id`` and ``month``;
            ``currency`` is optional and renders empty when missing or None.
        lines_pdf: Statement lines. Must contain the columns ``facility_id``,
            ``opening_balance``, ``drawn_this_month``, ``repaid_this_month``,
            ``closing_balance``, ``balance_source``, ``is_mismatch`` and
            ``balance_diff``.

    Raises:
        KeyError: If a required header key or table column is missing.
    """
    from xml.sax.saxutils import escape

    from reportlab.lib.pagesizes import letter
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
    from reportlab.lib.styles import getSampleStyleSheet
    from reportlab.lib.units import inch

    styles = getSampleStyleSheet()
    doc = SimpleDocTemplate(pdf_path_local, pagesize=letter, rightMargin=36, leftMargin=36, topMargin=36, bottomMargin=36)

    def header_line(label: str, value) -> Paragraph:
        # Paragraph parses its text as XML-like markup, so "&", "<" and ">"
        # in an identifier must be escaped to be printed literally.
        return Paragraph(f"{label}: {escape(str(value))}", styles["Normal"])

    story = []
    story.append(Paragraph("Contract Statement (Monthly)", styles["Title"]))
    story.append(Spacer(1, 0.2 * inch))

    story.append(header_line("Customer", header["customer_id"]))
    story.append(header_line("Contract", header["contract_id"]))
    story.append(header_line("Statement month", header["month"]))
    # A currency key holding None would otherwise print as "None".
    story.append(header_line("Currency", header.get("currency") or ""))
    story.append(Spacer(1, 0.2 * inch))

    # Table
    # Keep it simple: facility, opening, drawn, repaid, closing, flags
    table_cols = [
        "facility_id",
        "opening_balance",
        "drawn_this_month",
        "repaid_this_month",
        "closing_balance",
        "balance_source",
        "is_mismatch",
        "balance_diff",
    ]

    # Work on a copy so the caller's frame is not modified. The local is not
    # named "display", which would shadow the notebook helper of that name.
    table_df = lines_pdf[table_cols].copy()

    # Format numeric values
    for c in ["opening_balance", "drawn_this_month", "repaid_this_month", "closing_balance", "balance_diff"]:
        table_df[c] = table_df[c].apply(lambda x: "" if pd.isna(x) else f"{float(x):,.2f}")

    # bool(float("nan")) is True and bool(pd.NA) raises, so a null flag must
    # be ruled out before the truthiness test.
    table_df["is_mismatch"] = table_df["is_mismatch"].apply(
        lambda x: "Y" if (not pd.isna(x) and bool(x)) else ""
    )

    # First row is the header: the raw column names.
    data = [table_cols] + table_df.values.tolist()

    table = Table(data, repeatRows=1)  # repeat the header row on every page
    table.setStyle(TableStyle([
        ("GRID", (0,0), (-1,-1), 0.25, (0,0,0)),
        ("BACKGROUND", (0,0), (-1,0), (0.9,0.9,0.9)),
        ("FONTNAME", (0,0), (-1,0), "Helvetica-Bold"),
        ("FONTSIZE", (0,0), (-1,-1), 8),
        ("VALIGN", (0,0), (-1,-1), "TOP"),
    ]))

    story.append(table)
    story.append(Spacer(1, 0.2 * inch))

    doc.build(story)

def sha256_file(local_path: str) -> str:
    """Return the hex SHA-256 digest of the file at *local_path*."""
    chunk_size = 1024 * 1024  # read 1 MiB at a time so large files are not loaded whole
    digest = hashlib.sha256()
    with open(local_path, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()

# ---- generate docs ----
manifest_rows = []
now_ts = datetime.utcnow()  # one timestamp shared by every manifest row of this run

# Columns the PDF table is built from; checked before rendering.
required = {
    "facility_id", "opening_balance", "drawn_this_month", "repaid_this_month",
    "closing_balance", "balance_source", "is_mismatch", "balance_diff",
}

for t in targets:
    customer_id, contract_id = t["customer_id"], t["contract_id"]
    month, currency = t["month"], t["currency"]  # month is a date

    # Statement period: from `month` through the last day of that month
    # (first day of the following month minus one day, no extra deps).
    period_start = month
    if month.month == 12:
        next_year, next_mon = month.year + 1, 1
    else:
        next_year, next_mon = month.year, month.month + 1
    next_month = date(next_year, next_mon, 1)
    period_end = date.fromordinal(next_month.toordinal() - 1)

    # Destination in storage and the local scratch file the PDF is built in.
    ym = f"{month.year:04d}-{month.month:02d}"
    out_dir = f"{OUTPUT_BASE}/{contract_id}"
    out_path = f"{out_dir}/statement_{ym}.pdf"
    local_pdf = f"/tmp/{contract_id}_statement_{ym}.pdf"

    try:
        # pull statement lines for this contract+month
        line_filter = (
            (F.col("customer_id") == customer_id)
            & (F.col("contract_id") == contract_id)
            & (F.col("month") == F.lit(month))
        )
        # collected to pandas on the driver, one contract-month at a time
        pdf = (
            spark.table(lines_tbl)
                 .where(line_filter)
                 .orderBy("facility_id")
                 .toPandas()
        )
        if pdf.empty:
            raise ValueError("No statement lines found for contract/month in gold view")

        # ensure required columns exist
        missing = required - set(pdf.columns)
        if missing:
            raise ValueError(f"Missing columns in statement lines: {sorted(missing)}")

        # render the PDF locally, then publish it
        header = {
            "customer_id": customer_id,
            "contract_id": contract_id,
            "month": str(month),
            "currency": currency,
        }
        render_pdf_contract_month(local_pdf, header=header, lines_pdf=pdf)

        dbutils.fs.mkdirs(out_dir)
        dbutils.fs.cp(f"file:{local_pdf}", out_path, True)

        file_hash = sha256_file(local_pdf)

        # (file_sha256, status, error)
        outcome = (file_hash, "SUCCESS", None)
    except Exception as exc:
        # A failing contract-month is recorded in the manifest instead of
        # aborting the whole run; the error text is capped at 2000 characters.
        outcome = (None, "FAILED", str(exc)[:2000])
    finally:
        # Best-effort removal of the scratch file; a cleanup problem must not
        # change the recorded outcome.
        try:
            if os.path.exists(local_pdf):
                os.remove(local_pdf)
        except Exception:
            pass

    manifest_rows.append(
        (customer_id, contract_id, DOC_TYPE, period_start, period_end, out_path) + outcome + (now_ts,)
    )

# ---- write manifest ----
# Column order must match the tuples appended to manifest_rows above.
schema = T.StructType([
    T.StructField(field_name, field_type)
    for field_name, field_type in (
        ("customer_id", T.StringType()),
        ("contract_id", T.StringType()),
        ("doc_type", T.StringType()),
        ("period_start", T.DateType()),
        ("period_end", T.DateType()),
        ("output_path", T.StringType()),
        ("file_sha256", T.StringType()),
        ("status", T.StringType()),
        ("error", T.StringType()),
        ("generated_ts", T.TimestampType()),
    )
])

manifest_df = spark.createDataFrame(manifest_rows, schema=schema)
(
    manifest_df.write
    .mode("append")
    .saveAsTable("tp_finance.audit.generated_documents")
)

display(manifest_df)
print("Done.")