In [None]:
# Import necessary libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import warnings
import logging

# Suppress warnings for clean output.
# NOTE: "ignore" with no category silences every warning, deprecations included.
warnings.filterwarnings("ignore")

# Set up logger: INFO-and-above records are written to a file.
# logging.basicConfig opens the log file right away and raises
# FileNotFoundError when the parent directory is missing, so the directory
# is created first (a no-op when it already exists).
import os

LOG_FILE = "../logs/eda.log"
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
logging.basicConfig(filename=LOG_FILE, level=logging.INFO)

# Import the project-local EDA class from mainEda.py
from mainEda import EDA

# Initialize the EDA class with the paths to the three data files
eda = EDA(train_path='../data/train.csv', test_path='../data/test.csv', store_path='../data/store.csv')

# Purpose of this cell:
# Configure warning suppression and file logging (creating the log directory
# on demand), then build the `eda` helper object that the later cells call.


In [None]:
# Read the three CSV files through the EDA helper, in train/test/store order.
train_data, test_data, store_data = (
    eda.load_data(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv', '../data/store.csv')
)

# Purpose of this cell:
# Bind the results of eda.load_data for the train, test, and store files to
# train_data, test_data, and store_data respectively.


In [None]:
# Load the store file through the EDA helper.
# NOTE(review): the previous cell already binds store_data from this same
# path, so the reload looks redundant when the cells are run in order.
store_csv_path = '../data/store.csv'
store_data = eda.load_data(store_csv_path)

# Preview the first rows; left as the final expression so the notebook
# renders it. (Assumes load_data returns a pandas DataFrame - TODO confirm.)
store_data.head()

# Purpose of this cell:
# Rebind store_data from the store CSV and show a quick preview of it.


In [None]:
# Data overview of the store table
eda.data_overview(store_data)

# Purpose of this cell:
# Pass store_data to EDA.data_overview. Per the original notebook notes the
# helper prints the shape, columns, data types, and descriptive statistics of
# the dataset; its implementation lives in mainEda.py and is not shown here.


In [None]:
# Check for missing values in the store data
eda.check_missing_values(store_data)

# Purpose of this cell:
# Pass store_data to EDA.check_missing_values. Per the original notebook notes
# the helper checks and visualizes missing values; the implementation is in
# mainEda.py and is not visible from this notebook.


In [None]:
# Handle missing values using the EDA class. The call takes no arguments, so
# it presumably operates on data the EDA instance already holds - TODO confirm
# against mainEda.py.
eda.handle_missing_values()

# Re-run the missing-value check, this time on the instance's own store table
eda.check_missing_values(eda.store)

# Purpose of this cell:
# Call handle_missing_values, then check_missing_values on eda.store to see
# whether any missing values remain.
# NOTE(review): the earlier check ran on the separately loaded store_data,
# while this one runs on eda.store; confirm both refer to the same data so the
# before/after comparison is meaningful.


In [None]:
# Columns to inspect for extreme values, and the plot title.
outlier_columns = ['Sales', 'Customers']
outlier_title = "Boxplot of Sales and Customers"

# Draw the outlier plot for the training data via the EDA helper.
eda.visualize_outliers(eda.train, outlier_columns, outlier_title)

# Purpose of this cell:
# Ask EDA.visualize_outliers to plot the Sales and Customers columns of
# eda.train so that extreme values in either column stand out.


In [None]:
# Sales distribution for the training set, using 50 bins.
eda.plot_distribution(eda.train, 'Sales', 'Sales Distribution in Training Set', bins=50)

# Promo distribution, once for the training set and once for the test set.
# Both calls share the title and the bin/KDE settings, so those are named once.
# NOTE(review): whether the two calls overlay on one figure or draw separate
# figures depends on plot_distribution in mainEda.py - not visible here.
promo_title = 'Promo Distribution: Training vs Test Set'
promo_plot_opts = {'bins': 30, 'kde': True}
eda.plot_distribution(eda.train, 'Promo', promo_title, color='blue', label='Train', **promo_plot_opts)
eda.plot_distribution(eda.test, 'Promo', promo_title, color='green', label='Test', **promo_plot_opts)

# Purpose of this cell:
# Plot the Sales distribution of the training set, then plot the Promo column
# for both the training and test sets so the two can be compared.


In [None]:
# Column to plot and the title for the store-type figure.
store_type_column = 'StoreType'
store_type_title = "Distribution of Stores by StoreType"

# Plot the column from the EDA instance's store table.
eda.distribution_plot(store_type_column, eda.store, store_type_title)

# Purpose of this cell:
# Use EDA.distribution_plot to show how the stores are spread across the
# StoreType values (per the original notes, rendered as a count plot).


In [None]:
# Column to plot and the title for the assortment figure.
assortment_column = 'Assortment'
assortment_title = "Distribution of Stores by Assortment"

# Plot the column from the EDA instance's store table.
eda.distribution_plot(assortment_column, eda.store, assortment_title)

# Purpose of this cell:
# Use EDA.distribution_plot to show how the stores are spread across the
# Assortment values (per the original notes, rendered as a count plot).


In [None]:
# Column to plot and the title for the competition-distance figure.
competition_column = 'CompetitionDistance'
competition_title = 'Distribution of CompetitionDistance'

# Request the 'histplot' variant explicitly instead of the helper's default.
eda.distribution_plot(competition_column, eda.store, competition_title, plot_type='histplot')

# Purpose of this cell:
# Use EDA.distribution_plot with plot_type='histplot' to show how the
# CompetitionDistance values are distributed across stores.


In [None]:
# Analyze sales during holidays using the EDA class
eda.holiday_sales_analysis()

# Purpose of this cell:
# Run EDA.holiday_sales_analysis. Per the original notebook notes it analyzes
# sales behavior before, during, and after state holidays and draws a bar plot
# of average sales per state holiday; the implementation is in mainEda.py and
# is not shown here.


In [None]:
# Visualize seasonal sales trends using the EDA class
eda.seasonal_sales_trends()

# Purpose of this cell:
# Run EDA.seasonal_sales_trends. Per the original notebook notes it looks for
# seasonal patterns in sales (for example peaks around Christmas or Easter) and
# draws a boxplot of the monthly sales distribution; the implementation is in
# mainEda.py and is not shown here.


In [None]:
# Keep only the two columns whose relationship is being examined.
sales_and_customers = eda.train[['Sales', 'Customers']]

# Hand the two-column frame to the EDA helper for the correlation analysis.
eda.correlation_analysis(sales_and_customers)

# Purpose of this cell:
# Investigate how Sales and Customers in the training data relate to each
# other via EDA.correlation_analysis (per the original notes, shown as a
# heatmap).


In [None]:
# Label encode columns and get the correlation matrix using the EDA class
correlation_matrix = eda.label_encode_columns()

# Display the correlation matrix (a bare expression, so the notebook renders it)
correlation_matrix

# Purpose of this cell:
# Store the return value of EDA.label_encode_columns in correlation_matrix and
# display it. Per the original notebook notes the helper label encodes the
# categorical columns 'StoreType', 'Assortment', and 'PromoInterval' and
# computes the correlation matrix over all columns; the implementation is in
# mainEda.py and is not shown here.


In [None]:
# Recompute the correlation matrix through the EDA helper.
# NOTE(review): the previous cell already calls label_encode_columns; confirm
# in mainEda.py that running it a second time has no unwanted side effects.
correlation_matrix = eda.label_encode_columns()

# Draw the annotated heatmap on an explicitly created 12x8 figure and axes.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Correlation between features")
plt.show()

# Purpose of this cell:
# Obtain the correlation matrix from EDA.label_encode_columns and render it as
# an annotated coolwarm heatmap to inspect relationships between features.


In [None]:
# Analyze the effect of promotions on sales using the EDA class
eda.promo_analysis()

# Purpose of this cell:
# Run EDA.promo_analysis. Per the original notebook notes it analyzes how
# promotions impact sales and customer behavior and draws a bar plot of average
# sales with versus without a promotion; the implementation is in mainEda.py
# and is not shown here.


In [None]:
# Analyze Promo2 against CompetitionDistance using the EDA class
eda.promo2_analysis()

# Purpose of this cell:
# Run EDA.promo2_analysis. Per the original notebook notes it relates Promo2
# to CompetitionDistance and prints the average CompetitionDistance for stores
# with and without Promo2 (not the effect of promotions on sales).
# NOTE(review): the implementation is in mainEda.py and is not shown here;
# confirm this description against it.


In [None]:
# Visualize the distribution of Promo2 using the EDA class
eda.promo2_distribution_analysis()

# Purpose of this cell:
# Run EDA.promo2_distribution_analysis. Per the original notebook notes it
# draws a count plot showing how many stores participate in Promo2; the
# implementation is in mainEda.py and is not shown here.


In [None]:
# Visualize the distribution of Promo2 using the EDA class
# NOTE(review): this cell repeats the preceding cell's call exactly; it looks
# like an accidental duplicate and is a candidate for removal.
eda.promo2_distribution_analysis()

# Purpose of this cell:
# Run EDA.promo2_distribution_analysis. Per the original notebook notes it
# draws a count plot showing how many stores participate in Promo2; the
# implementation is in mainEda.py and is not shown here.


In [None]:
# Analyze the effect of assortment types on sales using the EDA class
eda.assortment_sales_analysis()

# Purpose of this cell:
# Run EDA.assortment_sales_analysis. Per the original notebook notes it checks
# the impact of the different store assortments on sales and draws a boxplot of
# sales by assortment type; the implementation is in mainEda.py and is not
# shown here.
