In [1]:
# Parameters
# Run configuration for this execution. The paths below are absolute,
# machine-specific Windows paths for a single date / profile_guid partition.
# `mode` is not read by any of the cells shown in this notebook.
mode = "fake"
# List of parquet files; they are read and concatenated only when
# `output_path` does not already exist on disk.
input_path = ["C:\\store\\git\\km-stat-activity\\parquet_dataset\\date=2025-04-24\\profile_guid=ad38431b-2bb8-48f8-a3d5-f98535ea461a\\part.26.parquet"]
# CSV destination for the processed frame; if the file already exists it is
# loaded in place of `input_path` and then overwritten with the recomputed result.
output_path = "C:\\store\\git\\km-stat-activity\\processed\\fake\\profile_guid=ad38431b-2bb8-48f8-a3d5-f98535ea461a\\2025-04-24-processed.csv"


In [2]:
# Parameters
# Fallback defaults: any run parameter that has not been defined yet is bound
# to None, so that later cells can reference the name without a NameError.
# A value that is already defined is left untouched.
mode = globals().get("mode")
input_path = globals().get("input_path")
output_path = globals().get("output_path")


In [3]:
import os
import pandas as pd
from pathlib import Path
import ast

def parse_array_str(arr_str):
    """Parse an integer array into a list of Python ints.

    Accepts either a textual array such as ``"[1 2 3]"`` or ``"[1, 2, 3]"``
    (brackets optional; items separated by commas and/or any whitespace,
    including tabs and newlines), or a non-string iterable of numbers
    (list, tuple, array), whose items are converted with ``int``.

    Parameters
    ----------
    arr_str : str | bytes | iterable | None
        The value to parse.

    Returns
    -------
    list[int]
        The parsed integers. An empty list is returned when the input is
        ``None``, empty, not iterable, or contains any item that cannot be
        converted to ``int`` (parsing is all-or-nothing).
    """
    if arr_str is None:
        return []
    if isinstance(arr_str, (bytes, bytearray)):
        # Iterating bytes would yield byte values, not the encoded numbers.
        arr_str = arr_str.decode("utf-8", errors="replace")
    try:
        if isinstance(arr_str, str):
            # Turn every delimiter into whitespace; split() with no argument
            # then handles runs of spaces, tabs and newlines and drops empties.
            tokens = (
                arr_str.replace('[', ' ').replace(']', ' ').replace(',', ' ').split()
            )
            return [int(tok) for tok in tokens]
        # Already a sequence: convert items directly, no text round trip.
        return [int(item) for item in arr_str]
    except (TypeError, ValueError, OverflowError):
        # Best-effort contract kept from the original: bad input -> empty list.
        return []

# 1. Load df — prefer an existing output_path (already processed CSV),
#    otherwise fall back to input_path.
if output_path is None:
    # The defaults cell may leave this as None; fail with a clear message
    # instead of the opaque TypeError raised by os.path.exists(None).
    raise ValueError("output_path parameter is required but was not provided")

if os.path.exists(output_path):
    df = pd.read_csv(output_path, parse_dates=['start_date_time', 'end_date_time'])
else:
    if input_path is None:
        raise ValueError(
            "input_path parameter is required when output_path does not exist yet"
        )
    if isinstance(input_path, list):
        # A list is treated as parquet parts to be concatenated.
        paths = [Path(p) for p in input_path]
        df = pd.concat([pd.read_parquet(p, engine="pyarrow") for p in paths], ignore_index=True)
    else:
        # A single non-list value is treated as a CSV path.
        df = pd.read_csv(str(input_path), parse_dates=['start_date_time', 'end_date_time'])


def _coords_to_int_list(value):
    """Convert one x/y cell into a list of ints.

    Sequence cells (e.g. arrays coming from parquet) are converted element by
    element (each item passed through ``int``). Going through ``str`` first
    would be lossy: NumPy abbreviates arrays
    longer than its print threshold with ``...``, which cannot be parsed back
    and would silently produce an empty list. Text cells, and anything that
    cannot be converted directly, are handed to ``parse_array_str`` exactly as
    before.
    """
    if not isinstance(value, (str, bytes, bytearray)):
        try:
            return [int(item) for item in value]
        except (TypeError, ValueError, OverflowError):
            pass  # not a clean numeric sequence -> use the text parser below
    return parse_array_str(str(value))


# Parse the x and y columns into lists of ints
df["x"] = df["x"].apply(_coords_to_int_list)
df["y"] = df["y"].apply(_coords_to_int_list)


# 2. Compute the bounding box
def get_bounding_box(x, y):
    """Return ``(min_x, min_y, max_x, max_y)`` for two coordinate sequences.

    Both ``x`` and ``y`` must be non-empty lists or tuples; if either one is
    ``None``, of another type, or empty, the all-zero box ``(0, 0, 0, 0)`` is
    returned instead.
    """
    empty_box = (0, 0, 0, 0)
    for coords in (x, y):
        # None fails the isinstance check, so it needs no separate branch.
        if not isinstance(coords, (list, tuple)) or len(coords) == 0:
            return empty_box
    return (min(x), min(y), max(x), max(y))


# One (min_x, min_y, max_x, max_y) tuple per row. Built with zip instead of
# df.apply(axis=1): row-wise apply is slow and, on an empty frame, can return
# a DataFrame rather than a Series, which breaks the column assignment.
boxes = [get_bounding_box(xs, ys) for xs, ys in zip(df["x"], df["y"])]
df["bounding_box"] = pd.Series(boxes, index=df.index, dtype=object)

# 3. Split the box into its four components and make sure they are numeric.
#    The values come straight from min()/max(), so no text clean-up is needed;
#    to_numeric(errors="coerce") only turns anything unexpected into NaN.
for position, col in enumerate(["min_x", "min_y", "max_x", "max_y"]):
    df[col] = [box[position] for box in boxes]
    df[col] = pd.to_numeric(df[col], errors="coerce")

# 4. Compute bbox_area (width * height of the bounding box)
df["bbox_area"] = (df["max_x"] - df["min_x"]) * (df["max_y"] - df["min_y"])

# 5. Write — create the destination folder first so a fresh run does not fail
#    just because the output directory has not been created yet.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"✔️ Saved CSV with full df including bounding box features: {output_path}")


✔️ Saved CSV with full df including bounding box features: C:\store\git\km-stat-activity\processed\fake\profile_guid=ad38431b-2bb8-48f8-a3d5-f98535ea461a\2025-04-24-processed.csv
