In [1]:
import sys
import os
import pandas as pd
import datetime
import time
import inspect
import json
import ipykernel
import requests

# Alias for the datetime class. Not referenced in this cell; kept in case
# later cells rely on `dt`.
from datetime import datetime as dt

# Single source of truth for the name this notebook logs under.
NOTEBOOK_NAME = "01_data_cleaning.ipynb"

# Start timing (wall-clock); the end of the cell subtracts this value to
# report the total elapsed seconds.
start_time = time.time()


# Locations are relative and are resolved against the current working
# directory below, so they only point at the right place when the kernel's
# working directory is the folder this notebook lives in.
curr_dir = os.getcwd()
util_functions_dir = "../../../utility functions"
data_file = "../../../generic_data_files/pan_shop_and_sales.csv"
cleaned_data_dir = "../../../cleaned_data_files/"


# Make the shared utility modules importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
UTILITY_FUNCTIONS_DIR_PATH = os.path.abspath(os.path.join(curr_dir, util_functions_dir))
if UTILITY_FUNCTIONS_DIR_PATH not in sys.path:
    sys.path.append(UTILITY_FUNCTIONS_DIR_PATH)

# Absolute path of the raw CSV to clean
DATA_FILE_PATH = os.path.abspath(os.path.join(curr_dir, data_file))

# Project-local helpers, expected to live in the utility functions folder;
# they must be imported only after the sys.path update above.
from data_utils import preprocess_data, save_cleaned_data  # type: ignore
from log_utils import log, set_log_source  # type: ignore

# Register the notebook name with the logger once, reusing the constant so
# the name is defined in exactly one place.
set_log_source(NOTEBOOK_NAME)

log("🔄 Starting data cleaning process...")

# Record the resolved locations so a wrong working directory is easy to spot
# in the log output.
log(f"Current Working Dir: {curr_dir}")
log(f"Util Functions Dir: {util_functions_dir}")

log(f"Util Functions Dir Path: {UTILITY_FUNCTIONS_DIR_PATH}")
log(f"Data File Path: {DATA_FILE_PATH}")

# Run the shared preprocessing pipeline on the raw CSV. Per the original
# note it covers: load, standardize columns, duplicates, null check and
# null handling (implemented in data_utils, not visible here).
df_cleaned = preprocess_data(DATA_FILE_PATH)

# Resolve the output folder and create it if it does not exist yet.
CLEANED_DATA_DIR = os.path.abspath(os.path.join(curr_dir, cleaned_data_dir))
os.makedirs(CLEANED_DATA_DIR, exist_ok=True)

# A per-run timestamp in the file name keeps every export unique, so an
# earlier cleaned file is never overwritten.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
source_stem = os.path.splitext(os.path.basename(data_file))[0]
cleaned_file_name = f"cleaned_{source_stem}_{timestamp}.csv"

# Persist the cleaned frame and record where it went.
CLEANED_DATA_PATH = os.path.join(CLEANED_DATA_DIR, cleaned_file_name)
save_cleaned_data(df_cleaned, CLEANED_DATA_PATH)
log(f"Cleaned Data File Saved to Path: : {CLEANED_DATA_PATH}")

# Quick spot-check of the result: one randomly chosen row of the transposed
# describe() summary, then the total count of remaining null cells.
summary_by_column = df_cleaned.describe().T
log(summary_by_column.sample(1))
remaining_nulls = df_cleaned.isnull().sum().sum()
log(remaining_nulls)

# Stop the clock and report the elapsed wall-clock time.
end_time = time.time()
log("🔄 Ending Cleaning process..")
log(f"✅ Total time taken: {end_time - start_time:.2f} seconds")

2025-09-03 20:54:51 | 01_data_cleaning.ipynb | 🔄 Starting data cleaning process...
2025-09-03 20:54:51 | 01_data_cleaning.ipynb | Current Working Dir: e:\Git-Repos\artificial-intelligence-latest\exploratory data analysis\pan_shop_sales\notebooks
2025-09-03 20:54:51 | 01_data_cleaning.ipynb | Util Functions Dir: ../../../utility functions
2025-09-03 20:54:51 | 01_data_cleaning.ipynb | Util Functions Dir Path: e:\Git-Repos\artificial-intelligence-latest\utility functions
2025-09-03 20:54:51 | 01_data_cleaning.ipynb | Data File Path: e:\Git-Repos\artificial-intelligence-latest\generic_data_files\pan_shop_and_sales.csv
2025-09-03 20:54:51 | data_utils.py | 🚀 Starting full data preprocessing pipeline
2025-09-03 20:54:51 | data_utils.py | Loading data file : pan_shop_and_sales.csv - Start
2025-09-03 20:54:51 | data_utils.py | 📥 Reading CSV: e:\Git-Repos\artificial-intelligence-latest\generic_data_files\pan_shop_and_sales.csv
2025-09-03 20:54:51 | data_utils.py | ✅ Loaded CSV with shape: (773

In [2]:
# Environment sanity check: print the path of the Python interpreter that is
# running this kernel (e.g. to confirm the expected virtualenv is in use).
# `sys` is already imported in the first cell; it is imported again here so
# this cell can be run on its own.
import sys

print(sys.executable)

e:\Git-Repos\artificial-intelligence-latest\.venv\Scripts\python.exe
