In [1]:
import pandas as pd

# Clean the watch-history export and write the result to a new CSV.
#
# The transformation itself lives in `clean_watch_history` so it can be
# exercised on an in-memory DataFrame; the file I/O stays at the bottom.

input_path = "watch_history_cleaned.csv"
output_path = "watch_history_fixed.csv"

# Columns that are coerced to float; unparseable entries become NaN.
numeric_cols = ["watch_duration_minutes", "progress_percentage"]


def _require_columns(frame: pd.DataFrame, names: list[str]) -> None:
    """Raise ValueError naming every column in *names* absent from *frame*."""
    missing = [name for name in names if name not in frame.columns]
    if missing:
        raise ValueError(f"Missing column: {', '.join(missing)}")


def clean_watch_history(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a watch-history table.

    Steps applied:
      1. Every column in ``numeric_cols`` is converted to float
         (values that cannot be parsed become NaN).
      2. ``watch_date`` is parsed as a datetime and rendered as
         ``YYYY:MM:DD`` text (unparseable dates become missing).
      3. ``is_download`` "true"/"false" (any case, surrounding
         whitespace ignored) becomes 1/0 in a nullable integer column;
         anything else becomes missing.
      4. ``quality`` is lower-cased and "4k" is rewritten to "fk";
         missing values stay missing.

    Args:
        frame: Table containing the columns listed above. It is not
            modified.

    Returns:
        A new DataFrame with the converted columns.

    Raises:
        ValueError: If any required column is absent. All missing
            columns are reported together, before any conversion runs.
    """
    _require_columns(
        frame, [*numeric_cols, "watch_date", "is_download", "quality"]
    )
    cleaned = frame.copy()

    # 1. Numeric columns -> float
    for name in numeric_cols:
        cleaned[name] = pd.to_numeric(cleaned[name], errors="coerce").astype(float)

    # 2. watch_date -> "YYYY:MM:DD"
    # NOTE(review): no explicit `format` is passed, so pandas has to infer
    # the date layout; depending on the data this may emit a warning or
    # coerce some rows to NaT. Confirm the input layout and pass it
    # explicitly once known.
    parsed_dates = pd.to_datetime(cleaned["watch_date"], errors="coerce")
    cleaned["watch_date"] = parsed_dates.dt.strftime("%Y:%m:%d")

    # 3. is_download true/false -> 1/0
    # The nullable "Int64" dtype keeps the values integral even when some
    # rows are missing; a plain float column would be written as 1.0/0.0.
    flags = cleaned["is_download"].astype(str).str.strip().str.lower()
    cleaned["is_download"] = flags.map({"true": 1, "false": 0}).astype("Int64")

    # 4. quality: lower-case, 4k -> fk
    # astype(str) can turn a missing value into literal text, so rows that
    # were missing in the input are reset to missing afterwards.
    quality = cleaned["quality"]
    normalized = quality.astype(str).str.lower().replace({"4k": "fk"})
    cleaned["quality"] = normalized.where(quality.notna())

    return cleaned


# In a notebook cell or a direct script run __name__ is "__main__", so the
# pipeline below still executes as before; the guard only prevents an import
# of this code from touching the filesystem.
if __name__ == "__main__":
    # Load dataset
    df = clean_watch_history(pd.read_csv(input_path))

    # Save cleaned file
    df.to_csv(output_path, index=False)

    print("✅ Data cleaning complete")
    print(f"Saved to: {output_path}")


  df["watch_date"] = pd.to_datetime(


✅ Data cleaning complete
Saved to: watch_history_fixed.csv
