#### Data Columns - Breakup


 Data : Pan Shop Sales Analysis


In [1]:
import os, sys
import time


# Label handed to the logging helper so messages can be traced to this notebook.
NOTEBOOK_NAME = "02_data_correlation_analysis.ipynb"

# Wall-clock reference point for the total-run-time report.
start_time = time.time()

# Resolve the shared utility-functions folder relative to the working directory.
util_functions_dir = "../../../utility functions"
curr_dir = os.getcwd()
UTILITY_FUNCTIONS_DIR_PATH = os.path.abspath(
    os.path.join(curr_dir, util_functions_dir)
)

# Make the folder importable, without registering it a second time on re-runs.
if sys.path.count(UTILITY_FUNCTIONS_DIR_PATH) == 0:
    sys.path.append(UTILITY_FUNCTIONS_DIR_PATH)


from data_utils import get_dataframe_by_partial_file_name, get_cat_and_con_cols_list
from log_utils import log, set_log_source
from correlation_utils import imp_cat_cols, get_imp_con_cols

# Register this notebook's name with the logging helper once, up front.
set_log_source(NOTEBOOK_NAME)
log("🔄 Starting data correlation analysis...")

# Column the correlation analysis is run against.
target = "profit"

# Cleaned data always lives in this folder, so the location is hard-coded.
data_dir = "../../../cleaned_data_files/"
partial_file_name = "cleaned_pan_shop_and"

# DataFrame for correlation analysis, looked up by partial file name.
df = get_dataframe_by_partial_file_name(data_dir, partial_file_name)

# Nothing below can work without the target column: report it and stop.
if target not in df.columns:
    log(f"❌ Target column '{target}' not found in the DataFrame!")
    sys.stderr.write(f"[ERROR] Target column '{target}' not found!" + "\n")
    sys.exit(1)

# Split the DataFrame's columns into categorical and continuous groups.
cat_cols, con_cols = get_cat_and_con_cols_list(df)
log(f" Categorical Columns : {cat_cols}")
log(f" Continuous Columns : {con_cols}")


# Correlation cut-off handed to get_imp_con_cols together with the target.
corr_threshold = 0.4
imp_con_corr = get_imp_con_cols(df, target, corr_threshold)

# Names of the continuous columns returned by get_imp_con_cols.
imp_con_cols_list = []

if imp_con_corr.empty:
    log(f"⚠️ No cont features found with corr > {corr_threshold} to target: '{target}'")
else:
    log(f"✅ Important continuous columns (corr > {corr_threshold}):")
    # Record each returned column and log its value alongside it.
    for col_name, corr_value in imp_con_corr.items():
        imp_con_cols_list.append(col_name)
        log(f"   - {col_name}: {corr_value:.2f}")


# Collect the categorical columns that imp_cat_cols reports as important.
# NOTE(review): despite its name, `p_val_threshold` is described by the log
# messages below as a Cramér's V cut-off (≥), not a p-value — confirm against
# imp_cat_cols before relying on either reading.
p_val_threshold = 0.2
cramers_df = imp_cat_cols(df, cat_cols, target, p_val_threshold)
imp_cat_cols_list = []

if cramers_df.empty:
    log(f"⚠️ No categorical features with Cramér’s V ≥ {p_val_threshold}")
else:
    log(f"✅ Important categorical features (Cramér’s V ≥ {p_val_threshold}):")
    # Walk the two result columns in lockstep: feature name and its score.
    for feature, cramers_v in zip(cramers_df["Feature"], cramers_df["Cramers_V"]):
        imp_cat_cols_list.append(feature)
        log(f"   - {feature}: {cramers_v:.2f}")

# Report what each selection step kept.
log(f"Imp Continuous columns list : {imp_con_cols_list}")
log(f"Imp categorical columns list : {imp_cat_cols_list}")

# Merge both selections and drop duplicates while keeping a stable order
# (categorical first, then continuous, each in first-seen order).
# list(set(...)) would order the names by string hash, which changes between
# interpreter runs and made the feature list non-reproducible.
features_list = list(dict.fromkeys(imp_cat_cols_list + imp_con_cols_list))
log(f"Imp features columns list : {features_list}")

# Record the paths this run used, for traceability in the log.
log(f"Current Working Dir: {curr_dir}")
log(f"Util Functions Dir: {util_functions_dir}")
log(f"Data File Dir: {data_dir}")


# End Time
end_time = time.time()
log("🔄 Completed data correlation analysis...")
log(f"✅ Total time taken: {end_time - start_time:.2f} seconds")

2025-09-04 02:36:05 | 02_data_correlation_analysis.ipynb | 🔄 Starting data correlation analysis...
2025-09-04 02:36:05 | data_utils.py | 🚀 Fetching data frame from partial file name
2025-09-04 02:36:05 | data_utils.py | 📄 Matched file: cleaned_pan_shop_and_sales_20250903_223450.csv
2025-09-04 02:36:05 | data_utils.py | 📥 Reading CSV: ../../../cleaned_data_files/cleaned_pan_shop_and_sales_20250903_223450.csv
2025-09-04 02:36:05 | data_utils.py | ✅ Loaded CSV with shape: (7736, 17)
2025-09-04 02:36:05 | data_utils.py | ⏱️ Time taken to read CSV: 0.04 seconds
2025-09-04 02:36:05 | data_utils.py | ✅ Fetching DataFrame completed in 0.04 seconds
2025-09-04 02:36:05 | data_utils.py | 🚀 Fetching cat and cont columns from dataframe 
2025-09-04 02:36:05 | data_utils.py | 🚀 Fetched cat and cont columns from dataframe 
2025-09-04 02:36:05 | 02_data_correlation_analysis.ipynb |  Categorical Columns : ['name', 'timings', 'owner', 'location', 'city', 'state', 'date', 'time', 'item', 'item_description