In [1]:
import os
from pathlib import Path

print("cwd:", os.getcwd())
print("cwd exists:", Path(os.getcwd()).exists())
print("files in cwd:", list(Path(os.getcwd()).iterdir())[:10])


cwd: c:\Projects\hotel-refund-intelligence\notebooks
cwd exists: True
files in cwd: [WindowsPath('c:/Projects/hotel-refund-intelligence/notebooks/01_ingestion_profiling.ipynb'), WindowsPath('c:/Projects/hotel-refund-intelligence/notebooks/02_cleaning_validation.ipynb'), WindowsPath('c:/Projects/hotel-refund-intelligence/notebooks/03_analysis_storytelling.ipynb')]


In [2]:
from pathlib import Path
import os

# project root = parent of notebooks folder
BASE_DIR = Path.cwd()
if BASE_DIR.name == "notebooks":
    BASE_DIR = BASE_DIR.parent

# If you run notebook from elsewhere, fallback: search upward for README.md
if not (BASE_DIR / "README.md").exists():
    for p in Path.cwd().parents:
        if (p / "README.md").exists():
            BASE_DIR = p
            break

DATA_RAW = BASE_DIR / "data" / "raw"
DATA_INTERIM = BASE_DIR / "data" / "interim"
DATA_PROCESSED = BASE_DIR / "data" / "processed"
FIG_DIR = BASE_DIR / "reports" / "figures"

print("✅ BASE_DIR set to:", BASE_DIR)
print("✅ DATA_RAW:", DATA_RAW)


✅ BASE_DIR set to: c:\Projects\hotel-refund-intelligence
✅ DATA_RAW: c:\Projects\hotel-refund-intelligence\data\raw


In [4]:
import sys
print(sys.executable)


c:\Projects\hotel-refund-intelligence\.venv\Scripts\python.exe


In [5]:
from pathlib import Path
Path.cwd()


WindowsPath('c:/Projects/hotel-refund-intelligence/notebooks')

In [6]:
from pathlib import Path
import pandas as pd
import numpy as np

# Notebook lives in /notebooks → project root is parent
PROJECT_ROOT = Path.cwd().parent

INTERIM_PATH = PROJECT_ROOT / "data" / "interim" / "combined_raw_with_metadata.parquet"

INTERIM_PATH, INTERIM_PATH.exists()


(WindowsPath('c:/Projects/hotel-refund-intelligence/data/interim/combined_raw_with_metadata.parquet'),
 True)

In [7]:
import pandas as pd
import numpy as np

df = pd.read_parquet(INTERIM_PATH)
df.shape, df.columns[:20]


((133, 56),
 Index(['IS_INTERNAL_YN', 'INTERNAL_DEBIT', 'INTERNAL_CREDIT', 'FIRST',
        'FIRST_DEBIT', 'FIRST_CREDIT', 'SECOND', 'SECOND_DEBIT',
        'SECOND_CREDIT', 'THIRD', 'THIRD_DEBIT', 'THIRD_CREDIT', 'EXP_DATE',
        'RECEIPT_NO', 'GUEST_FULL_NAME', 'TARGET_RESORT', 'TRX_DESC',
        'MARKET_CODE', 'BUSINESS_FORMAT_DATE', 'BUSINESS_TIME'],
       dtype='str'))

In [8]:
unnamed_cols = [c for c in df.columns if str(c).lower().startswith("unnamed:")]
df1 = df.drop(columns=unnamed_cols, errors="ignore")
print("Dropped:", unnamed_cols)
df1.shape


Dropped: ['Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52']


(133, 46)

In [9]:
for c in ["site", "file_month"]:
    if c not in df1.columns:
        raise ValueError(f"Missing required column: {c}")

df1["site"] = df1["site"].astype("string").str.strip()
df1["site"] = df1["site"].replace({"Newheaven": "Newhaven", "newheaven": "Newhaven", "newhaven": "Newhaven"})

df1["site"].value_counts(dropna=False)



site
Brighton    67
Newhaven    66
Name: count, dtype: int64[pyarrow]

In [10]:
if "BUSINESS_FORMAT_DATE" not in df1.columns:
    raise ValueError("Missing BUSINESS_FORMAT_DATE column required for refund indicator rule.")

df1["is_refund"] = df1["BUSINESS_FORMAT_DATE"].astype("string").str.contains("refund", case=False, na=False)
refunds = df1[df1["is_refund"]].copy()

refunds.shape, refunds["is_refund"].value_counts()



((65, 47),
 is_refund
 True    65
 Name: count, dtype: Int64)

In [11]:
refunds = df1[df1["is_refund"]].copy()
refunds.shape


(65, 47)

In [12]:
def parse_dt(s: pd.Series) -> pd.Series:
    return pd.to_datetime(s.astype("string"), errors="coerce", dayfirst=True)

date1 = parse_dt(refunds.get("BUSINESS_DATE"))
date2 = parse_dt(refunds.get("BUSINESS_TIME"))
date3 = parse_dt(refunds.get("BUSINESS_FORMAT_DATE"))

refunds["txn_datetime"] = date1
refunds.loc[refunds["txn_datetime"].isna(), "txn_datetime"] = date2
refunds.loc[refunds["txn_datetime"].isna(), "txn_datetime"] = date3

refunds["txn_date"] = refunds["txn_datetime"].dt.date
refunds["txn_datetime"].isna().mean()




  return pd.to_datetime(s.astype("string"), errors="coerce", dayfirst=True)


np.float64(0.0)

In [13]:
refunds["txn_datetime"].isna().mean(), refunds["txn_date"].isna().mean()


(np.float64(0.0), np.float64(0.0))

In [14]:
if "ROOM" not in refunds.columns:
    raise ValueError("Missing ROOM column required for refund amount rule.")

refunds["refund_amount"] = pd.to_numeric(refunds["ROOM"], errors="coerce").abs()

before = len(refunds)
refunds2 = refunds[refunds["refund_amount"].isna() | (refunds["refund_amount"] <= 1000)].copy()
after = len(refunds2)

print("Before:", before, "After:", after, "Removed:", before - after)
print("Count > 1000 (must be 0):", int((refunds2["refund_amount"] > 1000).sum()))



Before: 65 After: 63 Removed: 2
Count > 1000 (must be 0): 0


In [15]:
raw_rows = len(df)
refund_rows = len(refunds)
analysis_rows = len(refunds2)

recon = pd.DataFrame([
    {"stage":"raw_combined", "rows": raw_rows},
    {"stage":"refund_indicator_true", "rows": refund_rows},
    {"stage":"after_amount_outlier_filter", "rows": analysis_rows},
])

recon


Unnamed: 0,stage,rows
0,raw_combined,133
1,refund_indicator_true,65
2,after_amount_outlier_filter,63


In [16]:
amt = refunds2["refund_amount"].dropna()

sanity = {
    "min": float(amt.min()) if len(amt) else None,
    "p25": float(amt.quantile(0.25)) if len(amt) else None,
    "p50": float(amt.quantile(0.50)) if len(amt) else None,
    "p75": float(amt.quantile(0.75)) if len(amt) else None,
    "p95": float(amt.quantile(0.95)) if len(amt) else None,
    "max": float(amt.max()) if len(amt) else None,
    "count_gt_1000": int((refunds2["refund_amount"] > 1000).sum()),
}
sanity


{'min': 6.0,
 'p25': 10.99,
 'p50': 26.99,
 'p75': 65.99,
 'p95': 150.0,
 'max': 265.97,
 'count_gt_1000': 0}

In [17]:
null_audit = pd.DataFrame({
    "field": ["txn_date", "refund_amount", "site"],
    "null_rate": [
        refunds2["txn_date"].isna().mean(),
        refunds2["refund_amount"].isna().mean(),
        refunds2["site"].isna().mean(),
    ]
})

null_audit


Unnamed: 0,field,null_rate
0,txn_date,0.0
1,refund_amount,0.0
2,site,0.0


In [18]:
key_cols = ["site", "txn_date", "RECEIPT_NO", "refund_amount"]
missing_keys = [c for c in key_cols if c not in refunds2.columns]
if missing_keys:
    raise ValueError(f"Missing key columns needed for duplicate check: {missing_keys}")

dup_mask = refunds2.duplicated(subset=key_cols, keep=False)
dups = refunds2[dup_mask].sort_values(key_cols)

print("Duplicate rows:", len(dups))
dups.head(20)


Duplicate rows: 28


Unnamed: 0,IS_INTERNAL_YN,INTERNAL_DEBIT,INTERNAL_CREDIT,FIRST,FIRST_DEBIT,FIRST_CREDIT,SECOND,SECOND_DEBIT,SECOND_CREDIT,THIRD,...,CASH_ID_USER_NAME,PRINT_CASHIER_DEBIT,PRINT_CASHIER_CREDIT,site,file_month,source_file,is_refund,txn_datetime,txn_date,refund_amount
62,N,-3662.88,-2552.98,,-2545.88,-2552.98,,-2545.88,-2552.98,,...,0,0.0,7076-GIANPAOLO.GENTILE@WHBPI,Brighton,Nov-2025,Brighton_November_refund.csv,True,2025-11-23,2025-11-23,43.96
63,N,-3662.88,-2552.98,,-2545.88,-2552.98,,-2545.88,-2552.98,,...,0,0.0,7076-GIANPAOLO.GENTILE@WHBPI,Brighton,Nov-2025,Brighton_November_refund.csv,True,2025-11-23,2025-11-23,43.96
64,N,-3662.88,-2552.98,,-2545.88,-2552.98,,-2545.88,-2552.98,,...,0,0.0,7076-GIANPAOLO.GENTILE@WHBPI,Brighton,Nov-2025,Brighton_November_refund.csv,True,2025-11-23,2025-11-23,43.96
1,N,-9786.67,-7976.65,220.0,-300.0,0.0,,-300.0,0.0,,...,0,0.0,16176-SHUBHAM.DALVI@WHBPI,Brighton,Dec-2025,Brighton_December_refund.csv,True,2025-12-26,2025-12-26,150.0
2,N,-9786.67,-7976.65,220.0,-300.0,0.0,,-300.0,0.0,,...,0,0.0,16176-SHUBHAM.DALVI@WHBPI,Brighton,Dec-2025,Brighton_December_refund.csv,True,2025-12-26,2025-12-26,150.0
38,N,-1533.94,0.0,,-412.97,0.0,,-412.97,0.0,,...,0,0.0,10332-AYESHA.MANSOOR@WHBPI,Brighton,Jan-2026,Brighton_January_refund.csv,True,2026-01-10,2026-01-10,75.0
40,N,-1533.94,0.0,,-412.97,0.0,,-412.97,0.0,,...,0,0.0,10332-AYESHA.MANSOOR@WHBPI,Brighton,Jan-2026,Brighton_January_refund.csv,True,2026-01-10,2026-01-10,75.0
96,N,-752.55,-311.7,126.0,-10.99,0.0,,-10.99,0.0,,...,0,0.0,8537-HENRIETTA.VADAS@WHBPI,Newhaven,Jan-2026,Newheaven_January_refund.csv,True,2026-01-09,2026-01-09,10.99
98,N,-752.55,-311.7,150.0,-10.99,0.0,,-10.99,0.0,,...,0,0.0,8537-HENRIETTA.VADAS@WHBPI,Newhaven,Jan-2026,Newheaven_January_refund.csv,True,2026-01-09,2026-01-09,10.99
100,N,-752.55,-311.7,154.0,-10.99,0.0,,-10.99,0.0,,...,0,0.0,8537-HENRIETTA.VADAS@WHBPI,Newhaven,Jan-2026,Newheaven_January_refund.csv,True,2026-01-09,2026-01-09,10.99


In [19]:
keep_cols = [
    "site",
    "file_month",
    "source_file",
    "txn_datetime",
    "txn_date",
    "refund_amount",
    "BUSINESS_FORMAT_DATE",
    "RECEIPT_NO",
]

keep_cols_existing = [c for c in keep_cols if c in refunds2.columns]
processed = refunds2[keep_cols_existing].copy()

processed.shape, processed.columns



((63, 8),
 Index(['site', 'file_month', 'source_file', 'txn_datetime', 'txn_date',
        'refund_amount', 'BUSINESS_FORMAT_DATE', 'RECEIPT_NO'],
       dtype='str'))

In [20]:
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

parquet_path = PROCESSED_DIR / "refunds_processed.parquet"
csv_path = PROCESSED_DIR / "refunds_processed.csv"

# Parquet can fail if any remaining object columns exist; make it safe:
obj_cols = processed.select_dtypes(include=["object"]).columns
processed_export = processed.copy()
processed_export[obj_cols] = processed_export[obj_cols].astype("string")

processed_export.to_parquet(parquet_path, index=False)
processed_export.to_csv(csv_path, index=False)

parquet_path.exists(), csv_path.exists(), parquet_path, csv_path


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  obj_cols = processed.select_dtypes(include=["object"]).columns


(True,
 True,
 WindowsPath('c:/Projects/hotel-refund-intelligence/data/processed/refunds_processed.parquet'),
 WindowsPath('c:/Projects/hotel-refund-intelligence/data/processed/refunds_processed.csv'))

In [21]:
val_dir = PROJECT_ROOT / "reports"
val_dir.mkdir(exist_ok=True)

recon.to_csv(val_dir / "validation_row_recon.csv", index=False)
pd.DataFrame([sanity]).to_csv(val_dir / "validation_amount_sanity.csv", index=False)
null_audit.to_csv(val_dir / "validation_null_audit.csv", index=False)

(val_dir / "validation_row_recon.csv").exists()


True