In [1]:
# Parameters
# papermill: parameters
# tags: [parameters]
# Default values for the notebook parameters. All three default to None;
# the cell that follows assigns the actual values used for a run.

mode = None  # "fake" or "real"
input_path = None  # list of parquet paths, or a single csv path string
output_path = None  # full path of the csv file to write


In [2]:
# Parameters
# Concrete values for one run; they overwrite the None defaults assigned in
# the parameters cell. Presumably this cell is injected by papermill and is
# regenerated on every execution -- TODO confirm before editing by hand.
mode = "fake"
input_path = ["C:\\store\\git\\km-stat-activity\\parquet_dataset\\date=2025-04-21\\profile_guid=011d3524-6fce-4184-bfa3-9707e8c6f18c\\part.6.parquet"]
output_path = "C:\\store\\git\\km-stat-activity\\processed\\fake\\profile_guid=011d3524-6fce-4184-bfa3-9707e8c6f18c\\2025-04-21-processed.csv"


In [3]:

import math
import pandas as pd
from pathlib import Path
import os

# Parameters supplied by papermill:
#   mode: "fake" or "real"
#   input_path: either a list (parquet paths) or a string (csv path)
#   output_path: full path of the csv file to write

# 1) Load the input into `df`; the branch is chosen by the type of input_path.
if not isinstance(input_path, list):
    # --- Real pipeline: read a single CSV ---
    csv_path = Path(input_path)
    # NOTE(review): pd.eval evaluates the text of every "x"/"y" cell as an
    # expression. If the CSV can come from an untrusted source, consider a
    # literal-only parser instead.
    column_parsers = {"x": pd.eval, "y": pd.eval}
    df = pd.read_csv(
        csv_path,
        converters=column_parsers,
        parse_dates=["start_date_time", "end_date_time"],
    )
else:
    # --- Fake pipeline: read every .parquet part and stack them ---
    # input_path is already ["...part1.parquet", "...part2.parquet", ...]
    paths = list(map(Path, input_path))
    df = pd.concat(
        (pd.read_parquet(parquet_file, engine="pyarrow") for parquet_file in paths),
        ignore_index=True,
    )

# 2) Bounding-box calculator
def compute_bbox(xs, ys):
    """Return the axis-aligned bounding box of a set of points.

    Parameters
    ----------
    xs, ys : list, tuple, or array-like exposing ``tolist()``
        X and Y coordinates. Array-likes such as ``numpy.ndarray`` are
        accepted because list-valued parquet columns are delivered as numpy
        arrays rather than Python lists; rejecting them would turn every
        parquet row into an all-zero box.

    Returns
    -------
    tuple
        ``(min_x, min_y, max_x, max_y)``. ``(0, 0, 0, 0)`` is returned when
        either input is not a coordinate sequence (e.g. ``None``, ``NaN``,
        a scalar) or is empty.
    """

    def _as_sequence(values):
        # Lists and tuples are used as they are.
        if isinstance(values, (list, tuple)):
            return values
        # numpy arrays / pandas Series: convert to a plain list. Scalars
        # (including numpy scalars, whose tolist() yields a scalar) are
        # rejected by the list check below.
        to_list = getattr(values, "tolist", None)
        if callable(to_list):
            converted = to_list()
            if isinstance(converted, list):
                return converted
        return None

    xs = _as_sequence(xs)
    ys = _as_sequence(ys)
    # None (not a sequence) and empty sequences both fall back to the zero box.
    if not xs or not ys:
        return (0, 0, 0, 0)
    return (min(xs), min(ys), max(xs), max(ys))

# 3) Add a bounding box and its area for every row
#    a) bounding_box column: (min_x, min_y, max_x, max_y)
#       The two columns are iterated directly instead of using
#       df.apply(axis=1): on a frame with no rows, apply returns a DataFrame
#       rather than a Series, which makes the column assignment fail.
df["bounding_box"] = pd.Series(
    [compute_bbox(xs, ys) for xs, ys in zip(df["x"], df["y"])],
    index=df.index,
    dtype=object,  # keep each box as one tuple per row
)

#    b) split the box into separate columns
bbox_columns = ["min_x", "min_y", "max_x", "max_y"]
bb_df = pd.DataFrame(
    df["bounding_box"].tolist(),
    columns=bbox_columns,
    index=df.index,  # same index as df so the column-wise concat lines up
)
df = pd.concat([df, bb_df], axis=1)

#    c) compute the area: width * height
df["bbox_area"] = (df["max_x"] - df["min_x"]) * (df["max_y"] - df["min_y"])

# Only the columns produced by this step are carried forward.
df_result = df[["bounding_box"] + bbox_columns + ["bbox_area"]]

# 4) Persist the result (df_result is assumed to be computed at this point).
#    If the output file already exists, the new columns are placed next to
#    its columns; rows are paired by position, not by a key.
if os.path.exists(output_path):
    df_existing = pd.read_csv(output_path)
    # Drop columns that an earlier execution of this step already wrote, so a
    # re-run replaces them instead of appending duplicate columns.
    stale_columns = [col for col in df_existing.columns if col in df_result.columns]
    df_kept = df_existing.drop(columns=stale_columns)
    # NOTE(review): if the two frames differ in row count, the shorter one is
    # padded with NaN by the concat below -- confirm the inputs always match.
    df_merged = pd.concat(
        [df_kept.reset_index(drop=True), df_result.reset_index(drop=True)],
        axis=1
    )
else:
    # First write for this output: make sure the target directory exists,
    # otherwise to_csv fails on a missing folder.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df_merged = df_result

df_merged.to_csv(output_path, index=False)
print(f"✅ Bounding‐box eklendi ve kaydedildi: {output_path}")



✅ Bounding‐box eklendi ve kaydedildi: C:\store\git\km-stat-activity\processed\fake\profile_guid=011d3524-6fce-4184-bfa3-9707e8c6f18c\2025-04-21-processed.csv
