In [1]:
import os
import pandas as pd
import pandera as pa
from pandera import Column, Check, DataFrameSchema

# File paths
base_dir = r"C:\Users\chang\Desktop\MS2 Platform Technology"
raw_path = os.path.join(base_dir, "event_logs.csv")
clean_path = os.path.join(base_dir, "CLEANED_eventlog.csv")

# 1 Load raw data & inspect columns
df = pd.read_csv(raw_path)
print("Columns before normalization:", df.columns.tolist())
# Normalize column names: strip whitespace, lowercase, replace spaces & hyphens with underscores
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(r"[ \-]+", "_", regex=True)
)
print("Columns after normalization:", df.columns.tolist())

# 2 Identify key columns
def find_column(candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

user_col = find_column(["user_id", "userid", "user"])
time_col = find_column(["event_time", "timestamp", "time"])
type_col = find_column(["event_type", "eventtype", "type"])
amt_col  = find_column(["amount", "amt", "value"])

print(f"Detected columns -> user: {user_col}, time: {time_col}, type: {type_col}, amount: {amt_col}")
if not all([user_col, time_col, type_col]):
    raise KeyError("Missing essential columns. Available columns: " + ", ".join(df.columns))

# 3 Initial shape and missing-value overview
print(f"▶️ Raw data shape: {df.shape}")
print("Missing % per column:")
print((df.isnull().mean() * 100).sort_values(ascending=False).head(10))

# 4 Drop high-null & irrelevant columns
high_null = df.columns[df.isnull().mean() > 0.5]
pattern_cols = [c for c in df.columns if c.startswith("col_")]
drop_cols = list(high_null) + pattern_cols
# Ensure we don't drop essential cols by accident
drop_cols = [c for c in drop_cols if c not in [user_col, time_col, type_col, amt_col]]
df = df.drop(columns=drop_cols, errors="ignore")
print(f"After dropping cols, shape: {df.shape}")

# 5 Parse & clean key fields
df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
df = df.dropna(subset=[user_col, time_col, type_col])
df = df.drop_duplicates()
print(f"After timestamp & duplicate clean, shape: {df.shape}")

# 6 Numeric & categorical fixes
if amt_col:
    df[amt_col] = pd.to_numeric(df[amt_col], errors="coerce")
    df = df[df[amt_col].ge(0)]
# Standardize event_type
allowed = ["page_view", "checkout", "wishlist_add", "profile_update"]
df[type_col] = df[type_col].str.strip().str.lower()
df = df[df[type_col].isin(allowed)]
df[type_col] = df[type_col].astype("category")
print(f"After numeric & category clean, shape: {df.shape}")

# 7 Outlier filtering on amount (1.5×IQR)
if amt_col:
    q1, q3 = df[amt_col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    df = df[df[amt_col].between(lower, upper)]
    print(f"After outlier filter, shape: {df.shape}")

# 8 Pandera schema validation
schema_cols = {
    user_col: Column(str, nullable=False),
    time_col: Column(pa.DateTime, nullable=False),
    type_col: Column(pa.Category, Check.isin(allowed)),
}
if amt_col:
    schema_cols[amt_col] = Column(float, Check.ge(0), nullable=True)
schema = DataFrameSchema(schema_cols)
df = schema.validate(df, lazy=True)
print("✔️ Pandera validation passed")

# 9 Save cleaned data
df.to_csv(clean_path, index=False)
print(f"✅ Cleaned data saved to: {clean_path}")


Columns before normalization: ['user_id', 'event_type', 'event_time', 'product_id', 'amount', 'col_6', 'col_7', 'col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'col_13', 'col_14', 'col_15', 'col_16', 'col_17', 'col_18', 'col_19', 'col_20', 'col_21', 'col_22', 'col_23', 'col_24', 'col_25', 'col_26', 'col_27', 'col_28', 'col_29', 'col_30', 'col_31', 'col_32', 'col_33', 'col_34', 'col_35', 'col_36', 'col_37', 'col_38', 'col_39', 'col_40', 'col_41', 'col_42', 'col_43', 'col_44', 'col_45', 'col_46', 'col_47', 'col_48', 'col_49', 'col_50']
Columns after normalization: ['user_id', 'event_type', 'event_time', 'product_id', 'amount', 'col_6', 'col_7', 'col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'col_13', 'col_14', 'col_15', 'col_16', 'col_17', 'col_18', 'col_19', 'col_20', 'col_21', 'col_22', 'col_23', 'col_24', 'col_25', 'col_26', 'col_27', 'col_28', 'col_29', 'col_30', 'col_31', 'col_32', 'col_33', 'col_34', 'col_35', 'col_36', 'col_37', 'col_38', 'col_39', 'col_40', 'col_41', 'col_42', '

top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```



In [3]:
import os
import logging
import pandas as pd
import pandera as pa
from pandera import Column, Check, DataFrameSchema

# Setup minimal logging
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# File paths
base_dir   = r"C:\Users\chang\Desktop\MS2 Platform Technology"
raw_path   = os.path.join(base_dir, "event_logs.csv")

# 1) Load data with error handling
try:
    df = pd.read_csv(raw_path)
    logger.info(f"Loaded raw data with shape {df.shape}")
    error_handling = "Passed"
except Exception as e:
    logger.error(f"Failed to load raw data: {e}")
    df = pd.DataFrame()
    error_handling = f"Failed ({e})"

# Prepare report dict
report = {
    "Missing/Broken columns detected": None,
    "Schema double check": None,
    "Error Handling check": error_handling
}

# 2) Detect missing or broken columns
required = ["user_id", "event_time", "event_type", "amount"]
missing = [col for col in required if col not in df.columns]
if missing:
    report["Missing/Broken columns detected"] = missing
    logger.warning(f"Required columns missing: {missing}")
else:
    report["Missing/Broken columns detected"] = []
    logger.info("All required columns present")

# 3) Define Pandera schema for double-check
schema = DataFrameSchema({
    "user_id":    Column(str, nullable=False),
    "event_time": Column(pa.DateTime, nullable=False),
    "event_type": Column(pa.Category, Check.isin(["page_view","checkout","wishlist_add","profile_update"])),
    "amount":     Column(float, Check.ge(0), nullable=True),
})

# 4) Run schema validation, capture outcome
if not df.empty and not missing:
    try:
        schema.validate(df, lazy=True)
        report["Schema double check"] = "Passed"
        logger.info("Schema validation passed")
    except pa.errors.SchemaErrors as err:
        report["Schema double check"] = f"Failed: {len(err.failure_cases)} errors"
        logger.error("Schema validation failed:\n%s", err.failure_cases)
else:
    report["Schema double check"] = "Skipped"

# 5) Print consolidated report
print("\n=== Pipeline Health Report ===")
for k, v in report.items():
    print(f"{k}: {v}")


INFO: Loaded raw data with shape (2000, 50)
INFO: All required columns present


ERROR: Schema validation failed:
    schema_context      column  \
0           Column  event_type   
667         Column  event_type   
654         Column  event_type   
655         Column  event_type   
656         Column  event_type   
..             ...         ...   
336         Column  event_type   
337         Column  event_type   
338         Column  event_type   
339         Column  event_type   
993         Column  event_type   

                                                 check check_number  \
0    isin(['page_view', 'checkout', 'wishlist_add',...            0   
667  isin(['page_view', 'checkout', 'wishlist_add',...            0   
654  isin(['page_view', 'checkout', 'wishlist_add',...            0   
655  isin(['page_view', 'checkout', 'wishlist_add',...            0   
656  isin(['page_view', 'checkout', 'wishlist_add',...            0   
..                                                 ...          ...   
336  isin(['page_view', 'checkout', 'wishlist_add',...       


=== Pipeline Health Report ===
Missing/Broken columns detected: []
Schema double check: Failed: 994 errors
Error Handling check: Passed
