# Fix Multi-line & Missing Commas in x/y and Compute Idle Ratio & Movement Entropy
This notebook:
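1. Reads the previously processed CSV at `output_path` if it exists; otherwise loads the raw data from `input_path` (a list of Parquet files or a single CSV path)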
2. Preprocesses x/y list strings to ensure commas between numbers
3. Uses `ast.literal_eval` to parse into Python lists
4. Computes `mouse_idle_ratio` and `movement_entropy`
5. Writes the full DataFrame, including the two new columns, to `output_path` as CSV

In [1]:
# Parameters
# Concrete values for this run. The following cell only fills in None for
# any of these names that is still undefined.
# NOTE(review): presumably injected by a notebook runner — confirm.

# Run label. None of the cells shown here reads it; the same word also
# appears as a folder name inside output_path.
mode = "fake"
# Input files. A list is read as Parquet part files and concatenated.
input_path = ["C:\\store\\git\\km-stat-activity\\parquet_dataset\\date=2025-04-22\\profile_guid=49994cbe-706e-4ab9-b0e1-02b12ba620b3\\part.13.parquet"]
# Destination CSV. If this file already exists it is loaded instead of
# input_path, and it is overwritten at the end of the notebook.
output_path = "C:\\store\\git\\km-stat-activity\\processed\\fake\\profile_guid=49994cbe-706e-4ab9-b0e1-02b12ba620b3\\2025-04-22-processed.csv"


In [2]:
# Parameters
# Give every run parameter a None default while keeping any value that an
# earlier cell already bound in the notebook namespace.
mode = globals().get("mode")
input_path = globals().get("input_path")
output_path = globals().get("output_path")



In [3]:
import os
import pandas as pd
from pathlib import Path
import ast
import numpy as np

# 1. Data loading
def _read_table(path, default_format):
    """Read one tabular file, picking the reader from the file extension.

    ``.parquet`` / ``.pq`` files are read with pyarrow and ``.csv`` files with
    ``pd.read_csv``. Any other extension falls back to ``default_format``
    (``"parquet"`` or ``"csv"``).
    """
    suffix = Path(path).suffix.lower()
    if suffix in (".parquet", ".pq"):
        file_format = "parquet"
    elif suffix == ".csv":
        file_format = "csv"
    else:
        file_format = default_format

    if file_format == "parquet":
        return pd.read_parquet(path, engine="pyarrow")
    return pd.read_csv(path)


def load_df(input_path, output_path):
    """Load the DataFrame this notebook works on.

    Resolution order:

    1. If ``output_path`` is set and the file exists, that CSV is read, so a
       previous run's result is picked up and extended.
    2. If ``input_path`` is a list or tuple, every entry is read and the
       frames are concatenated with a fresh index. Entries are treated as
       Parquet unless they end in ``.csv``.
    3. Otherwise a truthy ``input_path`` is read as a single file, treated as
       CSV unless it ends in ``.parquet`` / ``.pq``.

    Args:
        input_path: A path, a list/tuple of paths, or None.
        output_path: Path of a previously written CSV, or None.

    Returns:
        pandas.DataFrame with the loaded rows.

    Raises:
        ValueError: If no usable source is given (nothing set, or an empty
            list/tuple of inputs).
    """
    if output_path and os.path.exists(output_path):
        return pd.read_csv(output_path)

    if isinstance(input_path, (list, tuple)):
        if not input_path:
            # pd.concat([]) would raise a ValueError with an unrelated message.
            raise ValueError("input_path is an empty list; nothing to load")
        frames = [_read_table(Path(p), "parquet") for p in input_path]
        return pd.concat(frames, ignore_index=True)

    if input_path:
        return _read_table(input_path, "csv")

    raise ValueError("input_path veya output_path geçerli değil")

df = load_df(input_path, output_path)

# Drop repeated columns, i.e. those whose name ends in ".<digits>" such as
# "x.1" or "x.2" (presumably the suffix pandas adds to duplicate CSV
# headers — confirm).
has_numeric_suffix = df.columns.str.contains(r'\.\d+$')
df = df.loc[:, ~has_numeric_suffix]

# Convert x and y cell values into Python lists
def safe_parse_list(s):
    """Coerce one cell of a coordinate column into a Python list.

    Accepted inputs:

    * a ``list`` is returned unchanged;
    * a NumPy array or a tuple is converted to a list;
    * a string holding a Python literal (``"[1, 2, 3]"``) is parsed with
      ``ast.literal_eval``;
    * a bracketed string of numbers whose commas are missing, possibly
      wrapped over several lines (``"[1 2 3\\n 4 5]"``), is split on
      whitespace and each token is converted to ``int`` or ``float``.

    Anything else (None, NaN, unparsable text, or text with an elided
    ``...`` section) yields an empty list, so no values are invented.

    Args:
        s: Raw cell value.

    Returns:
        list: The parsed values, or ``[]`` when the cell cannot be parsed.
    """
    if isinstance(s, list):
        return s
    if isinstance(s, np.ndarray):
        # tolist() yields native Python scalars; a 0-d array is not a sequence.
        return s.tolist() if s.ndim else []
    if isinstance(s, tuple):
        return list(s)
    if not isinstance(s, str):
        # None, NaN and other scalars carry no coordinate list.
        return []

    text = s.strip()
    try:
        return list(ast.literal_eval(text))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a valid literal, or a literal that is not iterable; try the
        # whitespace-separated form below.
        pass

    if not (text.startswith("[") and text.endswith("]")):
        return []

    numbers = []
    try:
        for token in text[1:-1].replace(",", " ").split():
            try:
                numbers.append(int(token))
            except ValueError:
                numbers.append(float(token))
    except ValueError:
        # A token that is not a number means the cell is not a flat numeric list.
        return []
    return numbers

# Parse both coordinate columns cell by cell into Python lists.
for _coord in ('x', 'y'):
    df[_coord] = df[_coord].apply(safe_parse_list)

# Fraction of steps during which the pointer did not move
def compute_idle_ratio(x, y=None):
    """Return the share of consecutive samples with no movement.

    A step is the difference between two consecutive samples. With only
    ``x`` given, a step is idle when the x difference is zero. When ``y`` is
    also given, a step is idle only if both the x and the y difference are
    zero.

    Args:
        x: Sequence (list, tuple or NumPy array) of x coordinates, or None.
        y: Optional sequence of y coordinates with the same length as ``x``.

    Returns:
        float: Idle steps divided by total steps. Fewer than two samples
        means no step can be observed and is reported as fully idle (1.0).

    Raises:
        ValueError: If ``y`` is given and its shape differs from ``x``.
    """
    if x is None:
        return 1.0
    xs = np.asarray(x)
    # Truth-testing a NumPy array is ambiguous, so check the size explicitly.
    if xs.ndim == 0 or len(xs) < 2:
        return 1.0

    idle = np.diff(xs) == 0
    if y is not None:
        ys = np.asarray(y)
        if ys.shape != xs.shape:
            raise ValueError("x and y must have the same length")
        idle = idle & (np.diff(ys) == 0)

    return float(np.count_nonzero(idle) / len(idle))

# One idle ratio per row, computed from the x coordinates only.
# NOTE(review): purely vertical movement therefore counts as idle — confirm
# this is intended.
idle_ratio_per_row = df["x"].apply(compute_idle_ratio)
df["mouse_idle_ratio"] = idle_ratio_per_row

# Movement entropy
def movement_entropy(x):
    """Return the Shannon entropy (in bits) of the step sizes in ``x``.

    The steps are the differences between consecutive samples. Each distinct
    step value is weighted by how often it occurs, and the entropy of that
    distribution is returned.

    Args:
        x: Sequence (list, tuple or NumPy array) of coordinates, or None.

    Returns:
        float: Entropy in bits, never negative zero. Fewer than two samples
        gives 0.0.
    """
    if x is None:
        return 0.0
    xs = np.asarray(x)
    # Truth-testing a NumPy array is ambiguous, so check the size explicitly.
    if xs.ndim == 0 or len(xs) < 2:
        return 0.0

    _, counts = np.unique(np.diff(xs), return_counts=True)
    probabilities = counts / counts.sum()
    entropy = -np.sum(probabilities * np.log2(probabilities))
    # When every step is identical the sum is 0.0 and its negation is -0.0;
    # adding 0.0 normalises that to +0.0 so it is not saved as "-0.0".
    return float(entropy) + 0.0

# One entropy value per row, computed from the x coordinates.
entropy_per_row = df["x"].apply(movement_entropy)
df["movement_entropy"] = entropy_per_row

# 5. Write the results to CSV
# With output_path unset, to_csv would return the CSV text instead of writing
# a file and the message below would wrongly report success, so stop here.
if not output_path:
    raise ValueError("output_path is not set; cannot save the results")

# to_csv does not create missing folders, so make sure the target folder exists.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# The whole frame is saved, including the two new feature columns.
df_result = df
df_result.to_csv(output_path, index=False)
print(f"✔️ Saved CSV with full df including idle ratio and entropy: {output_path}")


✔️ Saved CSV with full df including idle ratio and entropy: C:\store\git\km-stat-activity\processed\fake\profile_guid=49994cbe-706e-4ab9-b0e1-02b12ba620b3\2025-04-22-processed.csv
