Repair Movies_Cleaned Data

In [None]:
import pandas as pd

# Step 1: load the raw export, stating the UTF-8 encoding explicitly.
df = pd.read_csv(
    "movies_cleaned.csv",
    encoding="utf-8",
)

# Step 2: normalise the boolean flag columns to strict 0 / 1 integers.
bool_cols = ["is_netflix_original", "content_warning"]

for col in bool_cols:
    # Text spellings are mapped to *string* digits so the column stays textual
    # until pd.to_numeric runs; replacing strings with ints directly can emit
    # a silent-downcast FutureWarning on recent pandas releases.
    as_digits = (
        df[col]
        .astype(str)
        .str.strip()
        .str.lower()
        .replace({"true": "1", "false": "0", "yes": "1", "no": "0"})
    )
    df[col] = (
        # Missing or unrecognised entries end up as NaN and are treated as 0.
        pd.to_numeric(as_digits, errors="coerce")
        .fillna(0)
        .astype(int)
        # Collapse any other non-zero number (e.g. 2 or -1) to 1 so the
        # column really contains only 0 and 1.
        .ne(0)
        .astype(int)
    )

# Step 3: rewrite the date column as "YYYY-MM-DD" text.
# Entries that cannot be parsed become NaT and are therefore left missing.
df["added_to_platform"] = pd.to_datetime(
    df["added_to_platform"],
    errors="coerce",
).dt.strftime("%Y-%m-%d")

# Step 4: coerce the numeric columns to real numbers.
numeric_cols = [
    "duration_minutes",
    "imdb_rating",
    "production_budget",
    "box_office_revenue",
    "number_of_seasons",
    "number_of_episodes",
]

# errors="coerce" turns every non-numeric entry ("Unknown", "unknown", empty
# text, ...) into NaN, so no separate placeholder-stripping pass is needed.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Step 5: parse the release year; unparseable values become missing.
# The nullable "Int64" dtype keeps valid years as whole numbers even when
# some rows are missing.
df["release_year"] = (
    df["release_year"]
    .pipe(pd.to_numeric, errors="coerce")
    .astype("Int64")
)

# Step 6: drop every column whose name starts with "Unnamed" (typically an
# index that was accidentally written out by an earlier save).
df = df.drop(columns=df.columns[df.columns.str.contains("^Unnamed")])

# Step 7: write the repaired table as UTF-8 CSV without the DataFrame index.
df.to_csv("movies_mysql_ready.csv", index=False, encoding="utf-8")

print("✅ CSV fully repaired and safe for MySQL import.")
