# Data Cleaning and Loading

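This notebook loads the three CSV files (sales data, stores data, and features data) and cleans them: it parses the date columns, removes exact duplicate rows, and fills missing values in the features data with column means.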

## Import Required Libraries

In [None]:
import pandas as pd

## Load CSV Files

In [None]:
# Read the three source CSVs (paths are relative to the notebook's working
# directory) in sales -> stores -> features order, one DataFrame per file.
sales_data, stores_data, features_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'DATA/sales data-set.csv',
        'DATA/stores data-set.csv',
        'DATA/Features data set.csv',
    )
)

print("Files loaded successfully!")

## Convert Date Columns to DateTime Format

In [None]:
# Parse the day/month/year strings in the 'Date' column of the sales and
# features frames into pandas datetime values. The explicit format avoids
# per-value format inference; stores_data is not touched in this cell.
for dated_frame in (sales_data, features_data):
    dated_frame['Date'] = pd.to_datetime(dated_frame['Date'], format='%d/%m/%Y')

print("Dates converted to datetime format!")

## Check for Duplicates

In [None]:
# Count fully duplicated rows in each dataset. duplicated() flags every repeat
# of an earlier row (the first occurrence is not flagged), so each sum is the
# number of rows a later drop_duplicates() call would remove.
print("Checking for duplicates...")
sales_dupes = sales_data.duplicated().sum()
stores_dupes = stores_data.duplicated().sum()
features_dupes = features_data.duplicated().sum()

# One print call, one report line per dataset.
print(
    f"  sales_data: {sales_dupes} duplicates",
    f"  stores_data: {stores_dupes} duplicates",
    f"  features_data: {features_dupes} duplicates",
    sep="\n",
)

## Remove Duplicates

In [None]:
# Drop exact duplicate rows, but only from the frames where the duplicate
# counts computed earlier are non-zero. The first occurrence of each row is
# kept, and the number removed is reported from the pre-computed count.
if sales_dupes:
    sales_data = sales_data.drop_duplicates(keep='first')
    print(f"  Removed {sales_dupes} duplicates from sales_data")

if stores_dupes:
    stores_data = stores_data.drop_duplicates(keep='first')
    print(f"  Removed {stores_dupes} duplicates from stores_data")

if features_dupes:
    features_data = features_data.drop_duplicates(keep='first')
    print(f"  Removed {features_dupes} duplicates from features_data")

## Clean Features Data - Handle Missing Values

In [None]:
# Impute missing values in the MarkDown, CPI and Unemployment columns with each
# column's own mean.
#
# The fill is written as an explicit assignment back onto `features_data`
# rather than `features_data[col].fillna(..., inplace=True)`. The in-place form
# mutates a temporary Series: pandas 2.2 emits a chained-assignment
# FutureWarning for it, and with Copy-on-Write (the default from pandas 3.0) it
# leaves `features_data` unchanged, so the NaNs would silently survive.
#
# NOTE(review): a missing MarkDown value may mean "no markdown that week", in
# which case 0 could be a better fill than the mean -- confirm before modelling.
columns_to_impute = [
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'CPI', 'Unemployment',
]

# DataFrame.mean() gives one mean per column (NaNs are skipped); passing that
# Series to fillna() fills each column with its matching mean.
column_means = features_data[columns_to_impute].mean()
features_data[columns_to_impute] = features_data[columns_to_impute].fillna(column_means)

print("Data cleaned successfully!")

## Data Summary

In [None]:
# Report the shape and column names of each cleaned dataset, in the same
# sales -> stores -> features order used throughout the notebook.
for summary_heading, summary_frame in (
    ("\nSALES DATA:", sales_data),
    ("\nSTORES DATA:", stores_data),
    ("\nFEATURES DATA:", features_data),
):
    print(summary_heading)
    print(f"  Shape: {summary_frame.shape}")
    print(f"  Columns: {list(summary_frame.columns)}")