In [None]:
# Import necessary libraries for data manipulation and visualization
import logging
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import LabelEncoder

# Notebook setup: silence warnings, configure file logging, and create the
# EDA helper object (`eda`) that the remaining cells use.

# Suppress warnings for clean output
warnings.filterwarnings("ignore")

# Set up logger.
# logging.basicConfig opens the log file straight away and raises
# FileNotFoundError when the target directory is missing (e.g. on a fresh
# checkout), so make sure the directory exists before configuring logging.
LOG_FILE = "../logs/eda.log"
Path(LOG_FILE).parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(filename=LOG_FILE, level=logging.INFO)

# Import the EDA class from mainEda.py
from mainEda import EDA

# Initialize the EDA class with the paths to the train, test, and store CSVs
eda = EDA(
    train_path='../data/train.csv',
    test_path='../data/test.csv',
    store_path='../data/store.csv',
)


In [None]:
# Read the train, test, and store CSV files through the EDA helper, in that
# order, and bind the results to train_data, test_data, and store_data.
train_data, test_data, store_data = (
    eda.load_data(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/store.csv',
    )
)


In [None]:
# Preview the first few rows of the store dataset.
#
# store_data is already loaded from '../data/store.csv' by the dataset-loading
# cell above, so it is reused here instead of reading the same file from disk
# a second time.
store_data.head()


In [None]:
# Overview of the store dataset via the EDA helper.
# NOTE(review): according to the original notebook notes, data_overview prints
# the shape, columns, data types, and descriptive statistics; the method lives
# in mainEda.py, so confirm the exact output there.
eda.data_overview(store_data)


In [None]:
# Missing-value check on the store dataset via the EDA helper.
# NOTE(review): the original notes say this both reports and visualizes the
# missing values; the implementation is in mainEda.py — confirm there.
eda.check_missing_values(store_data)


In [None]:
# Step 1: let the EDA helper handle missing values (it takes no arguments, so
# it works on the data the helper holds internally).
# Step 2: re-run the missing-value check on eda.store to see what remains.
# NOTE(review): this cell inspects eda.store, while the earlier missing-value
# check inspected the separately loaded store_data — confirm that both refer
# to the same data.
eda.handle_missing_values()
eda.check_missing_values(eda.store)


In [None]:
# Outlier inspection for the 'Sales' and 'Customers' columns of the training
# data held by the EDA helper. The title string indicates a boxplot, which is
# useful for spotting extreme values in these two columns.
eda.visualize_outliers(
    eda.train,
    ['Sales', 'Customers'],
    "Boxplot of Sales and Customers",
)


In [None]:
# Distribution plots produced through the EDA helper:
#   1. 'Sales' in the training set (50 bins).
#   2. 'Promo' in the training set and in the test set, drawn with the same
#      title but different colors/labels so the two can be compared.
eda.plot_distribution(
    eda.train,
    'Sales',
    'Sales Distribution in Training Set',
    bins=50,
)

# Promo: training set (blue) ...
eda.plot_distribution(
    eda.train,
    'Promo',
    'Promo Distribution: Training vs Test Set',
    bins=30,
    color='blue',
    label='Train',
    kde=True,
)
# ... versus test set (green).
eda.plot_distribution(
    eda.test,
    'Promo',
    'Promo Distribution: Training vs Test Set',
    bins=30,
    color='green',
    label='Test',
    kde=True,
)


In [None]:
# Store counts per 'StoreType', plotted from the store data held by the EDA
# helper.
# NOTE(review): the original notes describe the output as a countplot; the
# plotting code is in mainEda.py — confirm there.
eda.distribution_plot(
    'StoreType',
    eda.store,
    "Distribution of Stores by StoreType",
)


In [None]:
# Store counts per 'Assortment' type, plotted from the store data held by the
# EDA helper.
# NOTE(review): the original notes describe the output as a countplot; the
# plotting code is in mainEda.py — confirm there.
eda.distribution_plot(
    'Assortment',
    eda.store,
    "Distribution of Stores by Assortment",
)
