Repair Movies_Cleaned Data

In [13]:
import pandas as pd

# -------------------------------
# Read the source CSV
# -------------------------------
input_path = "movies_cleaned.csv"
df = pd.read_csv(input_path)

# -------------------------------
# Standardize the headers:
# trim surrounding whitespace, turn spaces
# into underscores, then lowercase
# -------------------------------
df.columns = df.columns.str.strip().str.replace(" ", "_").str.lower()

# Helper to safely get column name
def col(name):
    """Return the normalized label for ``name`` after checking it exists in ``df``.

    The requested name is normalized the same way the frame's headers were
    (surrounding whitespace trimmed, spaces replaced by underscores,
    lowercased), so ``"IMDB Rating"`` and ``"imdb_rating"`` resolve to the
    same column.

    Parameters
    ----------
    name : str
        Column name in any casing / spacing.

    Returns
    -------
    str
        The normalized column label, guaranteed to be in ``df.columns``.

    Raises
    ------
    ValueError
        If the normalized name is not a column of the module-level ``df``.
    """
    normalized = name.strip().replace(" ", "_").lower()
    if normalized not in df.columns:
        raise ValueError(f"Missing column: {normalized}")
    return normalized

# -------------------------------
# 1. imdb_rating → float
#    (values that cannot be parsed become NaN)
# -------------------------------
df[col("imdb_rating")] = (
    df[col("imdb_rating")]
    .pipe(pd.to_numeric, errors="coerce")
    .astype(float)
)

# -------------------------------
# 2. box_office_revenue → float
# -------------------------------
_revenue_raw = df[col("box_office_revenue")]

if pd.api.types.is_numeric_dtype(_revenue_raw) and not pd.api.types.is_bool_dtype(_revenue_raw):
    # Already numeric: a round-trip through text would drop minus signs and
    # mangle exponent notation, so the values are used as they are.
    _revenue_clean = _revenue_raw
else:
    _revenue_text = _revenue_raw.astype(str).str.strip()
    # Drop currency symbols, thousands separators and any other non-numeric text.
    _revenue_digits = _revenue_text.str.replace(r"[^\d.]", "", regex=True)
    # Keep a leading minus so an invalid negative amount stays visible
    # instead of silently turning positive.
    _revenue_clean = _revenue_digits.where(
        ~_revenue_text.str.startswith("-"), "-" + _revenue_digits
    )

# Anything that still is not a number (empty text, "1.2.3", ...) becomes NaN.
df[col("box_office_revenue")] = pd.to_numeric(
    _revenue_clean, errors="coerce"
).astype(float)

# -------------------------------
# 3. is_netflix_original → 1 / 0
# -------------------------------
# A 1/0 column that contains blanks is read as float, so its text form is
# "1.0"/"0.0"; those spellings are mapped too, and padding is stripped.
# Unrecognized values become missing; the nullable Int64 dtype keeps the
# recognized ones as integer 1/0 instead of float 1.0/0.0.
df[col("is_netflix_original")] = (
    df[col("is_netflix_original")]
    .astype(str)
    .str.strip()
    .str.lower()
    .map({
        "true": 1, "false": 0,
        "yes": 1, "no": 0,
        "1": 1, "0": 0,
        "1.0": 1, "0.0": 0,
    })
    .astype("Int64")
)

# -------------------------------
# 4. Added_to_platform
# DD:MM:YYYY → YYYY:MM:DD
# -------------------------------
_added_text = df[col("added_to_platform")].astype(str).str.strip()

# Colon-separated dates look like clock times to the generic parser and are
# coerced to NaT, so the documented DD:MM:YYYY layout is parsed explicitly.
_added_colon = pd.to_datetime(
    _added_text, format="%d:%m:%Y", errors="coerce"
).dt.strftime("%Y:%m:%d")

# Values in any other layout still go through the day-first generic parser.
_added_generic = pd.to_datetime(
    _added_text, dayfirst=True, errors="coerce"
).dt.strftime("%Y:%m:%d")

# Values neither pass can parse stay missing.
df[col("added_to_platform")] = _added_colon.fillna(_added_generic)

# -------------------------------
# 5. Content_warning → 1 / 0
# -------------------------------
# A 1/0 column that contains blanks is read as float, so its text form is
# "1.0"/"0.0"; those spellings are mapped too, and padding is stripped.
# Unrecognized values become missing; the nullable Int64 dtype keeps the
# recognized ones as integer 1/0 instead of float 1.0/0.0.
df[col("content_warning")] = (
    df[col("content_warning")]
    .astype(str)
    .str.strip()
    .str.lower()
    .map({
        "true": 1, "false": 0,
        "yes": 1, "no": 0,
        "1": 1, "0": 0,
        "1.0": 1, "0.0": 0,
    })
    .astype("Int64")
)

# -------------------------------
# 6. Rating (numeric → string code)
# -------------------------------
# Keys are the text form of the numeric rating found in the data.
rating_map = {
    "0": "G",
    "7": "PG",
    "10": "PG",
    "13": "PG-13",
    "16": "PG-13",
    "18": "R"
}

# A numeric column containing blanks is read as float ("13.0"), which would
# miss every key above; padding is stripped and a trailing ".0" removed
# before the lookup. Values without a key become missing.
df[col("rating")] = (
    df[col("rating")]
    .astype(str)
    .str.strip()
    .str.replace(r"\.0+$", "", regex=True)
    .map(rating_map)
)

# -------------------------------
# Write the repaired frame to disk
# (row index is not written)
# -------------------------------
output_path = "movies_fixed.csv"
df.to_csv(output_path, index=False)

# Report completion and where the file went.
print("✅ Data cleaned successfully", f"Saved to: {output_path}", sep="\n")


✅ Data cleaned successfully
Saved to: movies_fixed.csv
