In [1]:
import pandas as pd
import sys

# Path of the CSV that main() loads with pd.read_csv.
INPUT_FILE = "recommendation_logs_cleaned.csv"
# Path main() writes the converted CSV to (without the index column).
OUTPUT_FILE = "recommendation_logs_mysql_ready.csv"

# Columns the input CSV must provide before any conversion is attempted.
_REQUIRED_COLUMNS = (
    "recommendation_id",
    "user_id",
    "movie_id",
    "recommendation_date",
    "recommendation_type",
    "recommendation_score",
    "was_clicked",
    "position_in_list",
    "device_type",
    "time_of_day",
    "algorithm_version",
)


def _null_columns(df):
    """Return the names of the columns of *df* holding at least one null."""
    has_null = df.isnull().any()
    return list(has_null[has_null].index)


def _clicked_as_int(clicked):
    """Convert the ``was_clicked`` flags to integers restricted to 0 and 1.

    Args:
        clicked: the ``was_clicked`` column as loaded from the CSV.

    Returns:
        A Series of integers containing only 0 and 1.

    Raises:
        ValueError: if a value cannot be cast to int, or if the cast would
            not yield exactly 0 or 1 without loss (for example 2, -1 or 0.5).
    """
    as_int = clicked.astype(int)
    valid = as_int.isin([0, 1])
    if pd.api.types.is_float_dtype(clicked):
        # astype(int) truncates toward zero, so 0.5 would silently become 0.
        valid &= clicked == as_int
    if not valid.all():
        rejected = clicked[~valid].unique()[:5].tolist()
        raise ValueError(
            f"was_clicked must contain only True/False or 1/0, found: {rejected}"
        )
    return as_int


def _repair_for_mysql(df):
    """Return a copy of *df* with the date and boolean columns converted.

    ``recommendation_date`` is parsed as DD-MM-YYYY and re-emitted as
    YYYY-MM-DD text; ``was_clicked`` is converted to 0/1 integers. All other
    columns, and the column order, are left as they are.

    Raises:
        ValueError: if a required column is missing, any column contains a
            null, a date does not match DD-MM-YYYY, or ``was_clicked`` holds
            a value that is not boolean-like.
    """
    missing = set(_REQUIRED_COLUMNS) - set(df.columns)
    if missing:
        # Sorted so the message is stable from run to run.
        raise ValueError(f"Missing columns: {sorted(missing)}")

    # Reject nulls before converting: otherwise a null in was_clicked fails
    # inside astype(int) with an error that does not name the column.
    null_columns = _null_columns(df)
    if null_columns:
        raise ValueError(f"Null values found in columns: {null_columns}")

    # DATE: DD-MM-YYYY -> YYYY-MM-DD (errors="raise" aborts on a bad date).
    iso_dates = pd.to_datetime(
        df["recommendation_date"],
        format="%d-%m-%Y",
        errors="raise",
    ).dt.strftime("%Y-%m-%d")

    # BOOLEAN for MySQL: True/False -> 1/0.
    repaired = df.assign(
        recommendation_date=iso_dates,
        was_clicked=_clicked_as_int(df["was_clicked"]),
    )

    # Defensive re-check in case a conversion introduced a null.
    if repaired.isnull().any().any():
        raise ValueError("Null values found after cleaning")

    return repaired


def main():
    """Convert the CSV at INPUT_FILE and save the result to OUTPUT_FILE.

    Loads the input, validates and converts it (dates to YYYY-MM-DD,
    ``was_clicked`` to 0/1), writes the result without the index column and
    prints the final dtypes and row count.

    Any failure is reported on stdout and the process exits with status 1;
    the output file is only written once every check has passed.
    """
    try:
        # 1. Load data
        df = pd.read_csv(INPUT_FILE)
        print("✅ File loaded successfully")

        # 2-5. Validate columns, repair date/boolean columns, final validation
        df = _repair_for_mysql(df)

        # 6. Save MySQL-ready CSV
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"✅ Data repaired and saved as: {OUTPUT_FILE}")

        # 7. Quick sanity check
        print("\n📊 Final Data Types:")
        print(df.dtypes)

        print("\n📈 Row count:", len(df))

    except Exception as e:
        # Top-level script boundary: report the failure and signal it through
        # the exit status instead of dumping a traceback.
        print("❌ ERROR:", e)
        sys.exit(1)

# Run the conversion only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()


✅ File loaded successfully
✅ Data repaired and saved as: recommendation_logs_mysql_ready.csv

📊 Final Data Types:
recommendation_id        object
user_id                  object
movie_id                 object
recommendation_date      object
recommendation_type      object
recommendation_score    float64
was_clicked               int64
position_in_list          int64
device_type              object
time_of_day              object
algorithm_version        object
dtype: object

📈 Row count: 50000
