In [2]:
# Cell 1: Import necessary libraries
import pandas as pd
import numpy as np
import os

# Set up display options for pandas DataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("Libraries imported successfully.")

# Cell 2: Define file paths
# Assuming CSV is in the same directory as the notebook
raw_csv_file = 'raw_sales.csv'
cleaned_csv_file = 'cleaned_sales.csv'

print(f"Input CSV file: {raw_csv_file}")
print(f"Output cleaned CSV file: {cleaned_csv_file}")

# Cell 3: Function to extract dataset from CSV
def extract_data_from_csv(file_path):
    """
    Extracts data from a CSV file into a pandas DataFrame.
    """
    try:
        df = pd.read_csv(file_path)
        print(f"\nSuccessfully extracted data from {file_path}:")
        print(df)
        return df
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Please ensure it exists.")
        return None
    except Exception as e:
        print(f"An error occurred while reading the CSV: {e}")
        return None

# Cell 4: Create User Defined Function (UDF) to clean the data
def clean_sales_data(df):
    """
    Cleans the sales DataFrame by converting data types and fixing missing/invalid values.
    """
    if df is None:
        return None

    print("\n--- Starting Data Cleaning ---")

    # 1. Convert order_id, product_id, quantity to integers
    # Use errors='coerce' to turn non-convertible values into NaN
    # Then fill NaN with 0 before converting to integer type (Int64 to handle NaN during conversion, then int)
    for col in ['order_id', 'product_id', 'quantity']:
        # First, attempt to convert to numeric, coercing errors to NaN
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # Fill NaN values with 0
        df[col] = df[col].fillna(0)
        # Convert to appropriate integer type (using 'Int64' to allow for NaN if needed later, then to standard int)
        # For simplicity and given the 'fill with 0' instruction, we'll go directly to int if possible.
        # Handle potential negative product_id or quantity: for this task, converting to 0 for simplicity.
        if col == 'product_id' or col == 'quantity':
            df[col] = df[col].apply(lambda x: max(0, int(x))) # Ensure non-negative and integer
        else:
            df[col] = df[col].astype(int) # Standard int for order_id


    # 2. Convert price_per_unit to float
    df['price_per_unit'] = pd.to_numeric(df['price_per_unit'], errors='coerce')
    df['price_per_unit'] = df['price_per_unit'].fillna(0.0) # Fill NaN with 0.0 for float

    # 3. Convert order_date to datetime
    # Use errors='coerce' to turn unparseable dates into NaT (Not a Time)
    df['order_date'] = pd.to_datetime(df['order_date'], errors='coerce')
    # The prompt says "fix missing values with blank/0". For dates, NaN (NaT) is the equivalent of blank.
    # If a specific default date was required, we would fill NaT here. For now, leave as NaT.

    print("\n--- Data Cleaning Complete. Cleaned DataFrame Head: ---")
    print(df.head())
    print("\n--- Data Types After Cleaning ---")
    print(df.dtypes)
    return df

# Cell 5: Create UDF to calculate total_price
def calculate_total_price(quantity, price_per_unit):
    """
    Calculates total price (quantity * price_per_unit).
    Handles non-numeric inputs by returning 0 if calculation is not possible.
    """
    try:
        return quantity * price_per_unit
    except TypeError:
        return 0 # Return 0 if inputs are not numeric for calculation

# Cell 6: Main execution flow
if __name__ == "__main__":
    # Task 1: Extract the dataset
    raw_sales_df = extract_data_from_csv(raw_csv_file)

    if raw_sales_df is not None:
        # Task 2: Clean the data using UDF
        cleaned_sales_df = clean_sales_data(raw_sales_df.copy()) # Use .copy() to avoid modifying original DataFrame

        if cleaned_sales_df is not None:
            # Task 3 & 4: Create total_price column using UDF
            # Apply the UDF row-wise
            cleaned_sales_df['total_price'] = cleaned_sales_df.apply(
                lambda row: calculate_total_price(row['quantity'], row['price_per_unit']),
                axis=1
            )
            print("\n--- DataFrame with 'total_price' column ---")
            print(cleaned_sales_df)

            # Task 5: Save the cleaned DataFrame
            try:
                cleaned_sales_df.to_csv(cleaned_csv_file, index=False)
                print(f"\nCleaned DataFrame saved successfully to '{cleaned_csv_file}'")
            except Exception as e:
                print(f"An error occurred while saving the cleaned CSV: {e}")

Libraries imported successfully.
Input CSV file: raw_sales.csv
Output cleaned CSV file: cleaned_sales.csv

Successfully extracted data from raw_sales.csv:
   order_id  product_id  quantity  price_per_unit  order_date
0         1         101       2.0           20.00  2025/06/01
1         2         102      -1.0           15.50  2025/06/01
2         3         103       1.0           35.00  2025-06-01
3         4         104       3.0           20.00  2025-06-02
4         5         105       NaN           99.00  06-03-2025
5         6         106       2.0           25.99  2025-06-03

--- Starting Data Cleaning ---

--- Data Cleaning Complete. Cleaned DataFrame Head: ---
   order_id  product_id  quantity  price_per_unit order_date
0         1         101         2            20.0 2025-06-01
1         2         102         0            15.5 2025-06-01
2         3         103         1            35.0        NaT
3         4         104         3            20.0        NaT
4         5      