In [32]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [33]:
# Load the raw data
def load_data(file_path):
    """Read the raw sales CSV and hand it back as a DataFrame.

    Args:
        file_path: Location of the CSV; passed straight to ``pd.read_csv``.

    Returns:
        The parsed contents as a ``pandas.DataFrame``.
    """
    raw_frame = pd.read_csv(file_path)
    return raw_frame

In [34]:
# Clean the data
def clean_data(df):
    """Clean and preprocess the sales data.

    The frame is de-duplicated, rows with missing values are removed,
    ``Category`` is stripped and title-cased, ``Date`` (``%Y-%m-%d``) and
    ``Time`` (``%H:%M``) are parsed, and ``Price``, ``Quantity`` and
    ``Total_Spent`` are coerced to numeric. A final pass removes rows whose
    numeric values could not be parsed as well as rows that only became
    duplicates after normalisation.

    Args:
        df: Raw sales data with the columns ``Category``, ``Date``, ``Time``,
            ``Price``, ``Quantity`` and ``Total_Spent``.

    Returns:
        A new ``pandas.DataFrame`` without missing values or duplicate rows.
        The input frame is not modified.

    Raises:
        KeyError: If one of the required columns is absent.
        ValueError: If a ``Date`` or ``Time`` value does not match its format.
    """
    # Take an explicit copy so the column assignments below operate on an
    # independent frame (avoids SettingWithCopyWarning on filtered results).
    cleaned = df.drop_duplicates().dropna().copy()

    # Standardize category names
    cleaned['Category'] = cleaned['Category'].str.strip().str.title()

    # Convert 'Date' column to datetime
    cleaned['Date'] = pd.to_datetime(cleaned['Date'], format="%Y-%m-%d")

    # Convert 'Time' column to a proper time format
    cleaned['Time'] = pd.to_datetime(cleaned['Time'], format="%H:%M").dt.time

    # Ensure numeric columns are correctly typed; unparseable values become NaN
    for column in ('Price', 'Quantity', 'Total_Spent'):
        cleaned[column] = pd.to_numeric(cleaned[column], errors='coerce')

    # errors='coerce' may have introduced NaN, and category normalisation may
    # have made previously distinct rows identical, so filter once more.
    return cleaned.dropna().drop_duplicates()

In [35]:
# Save cleaned data to a new CSV file
def save_cleaned_data(df, output_path):
    """Save the cleaned data to a new CSV file.

    Args:
        df: The cleaned ``pandas.DataFrame`` to persist.
        output_path: Destination for the CSV. May be a filesystem path
            (``str`` or ``os.PathLike``) or any writable buffer accepted by
            ``DataFrame.to_csv``.

    Raises:
        OSError: If the destination cannot be created or written.
    """
    # to_csv does not create missing directories; make sure the parent
    # exists when we were given a filesystem path (buffers are left alone).
    if isinstance(output_path, (str, os.PathLike)):
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    df.to_csv(output_path, index=False)
    print(f"Cleaned data saved to {output_path}")

In [36]:
# Main script logic
def main(input_file="../data/raw/tillys_mock_sales_data.csv",
         output_file="../data/processed/cleaned_sales_data.csv"):
    """Run the full pipeline: load the raw CSV, clean it, save the result.

    Args:
        input_file: Path of the raw sales CSV to read. Relative paths are
            resolved against the current working directory.
        output_file: Path the cleaned CSV is written to.

    Returns:
        None. Progress messages are printed to stdout.
    """
    # Load raw data
    print("Loading raw data...")
    raw_data = load_data(input_file)

    # Clean the data
    print("Cleaning data...")
    cleaned_data = clean_data(raw_data)

    # Save cleaned data
    print("Saving clean data...")
    save_cleaned_data(cleaned_data, output_file)

In [37]:
# Run the pipeline when this cell/script is executed directly; the guard
# keeps `main()` from firing if the code is imported as a module.
if __name__ == "__main__":
    main()

Loading raw data...
Cleaning data...
Saving clean data...
Cleaned data saved to ../data/processed/cleaned_sales_data.csv
