# 02 - Retail Data Study.ipynb
## Objective: Study the Retail Dataset

### Import libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ydata_profiling as pp
from ydata_profiling import ProfileReport
from feature_engine.discretisation import ArbitraryDiscretiser
import plotly.express as px
import numpy as np

### Load Data

In [None]:
# Merged retail dataset; the path is relative to the notebook's working directory.
data_path = "outputs/merged_data.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

### Convert "Store" and "Dept" into categorical data types, and Date into datetime

In [None]:
# Store and Dept are identifiers, not quantities, so they are held as categories.
for id_column in ('Store', 'Dept'):
    df[id_column] = df[id_column].astype('category')

# Parse the Date column into datetime values.
df['Date'] = pd.to_datetime(df['Date'])

### Generate Pandas Profile Report

In [None]:
# Build an exploratory profile of the whole dataframe.
profile = ProfileReport(
    df,
    explorative=True,
    title="Retail Data Profile Report",
)
# Embed the rendered report inline in the notebook.
profile.to_notebook_iframe()

### Correlation Study
Pearson correlation computed on the data before any imputation (pandas excludes NA values pairwise)

In [None]:
# Restrict the correlation to numeric columns: since pandas 2.0 `DataFrame.corr`
# no longer drops non-numeric columns silently and raises when it meets string
# data, and this frame holds category and datetime columns (Store, Dept, Date).
# NOTE: pandas excludes NA values pairwise when computing each coefficient.
pearson_corr = df.corr(method='pearson', numeric_only=True)
print("Pearson Correlation (including NA values):\n", pearson_corr)

### Spearman correlation computed on the data before any imputation (pandas excludes NA values pairwise)

In [None]:
# Rank-based (Spearman) correlation over the numeric columns only: since
# pandas 2.0 `DataFrame.corr` no longer drops non-numeric columns silently and
# raises when it meets string data.
# NOTE: pandas excludes NA values pairwise when computing each coefficient.
spearman_corr = df.corr(method='spearman', numeric_only=True)
print("Spearman Correlation (including NA values):\n", spearman_corr)

### Select top 5 correlated features with the target variable "Weekly_Sales"

In [None]:
# Absolute Pearson correlation of every column with the target, strongest first.
correlations = pearson_corr['Weekly_Sales'].abs().sort_values(ascending=False)

# Remove the target by label instead of slicing off position 0: a feature tied
# at |r| = 1 could otherwise sort ahead of the target and leak "Weekly_Sales"
# into the feature list. Undefined (NaN) correlations are dropped as well so
# they can never be reported as "top" features.
top_5_features = (
    correlations
    .drop(labels='Weekly_Sales')
    .dropna()
    .head(5)
    .index
    .to_list()
)
print(f"Top 5 Correlated Features: {top_5_features}")

### Exploratory Data Analysis (EDA) on selected top 5 correlated features

In [None]:
# Keep only the selected features plus the target for the EDA plots below.
eda_columns = [*top_5_features, 'Weekly_Sales']
df_eda = df.filter(items=eda_columns)
print("EDA Data Preview:\n", df_eda.head())

### Plot distributions of Weekly_Sales and top 5 correlated variables

In [None]:
sns.set_style('whitegrid')


def plot_distribution(df, feature, target='Weekly_Sales'):
    """Plot ``feature`` against ``target`` in a new 10x6 figure.

    Categorical features (object, string, category or boolean dtype) are drawn
    as one box per level; every other dtype is drawn as a scatter plot.

    Args:
        df: DataFrame containing both the ``feature`` and ``target`` columns.
        feature: Name of the column placed on the x axis.
        target: Name of the column placed on the y axis.
    """
    column = df[feature]
    dtype_checks = pd.api.types
    # A plain `dtype == 'object'` test misses the `category` dtype (used for
    # Store and Dept in this notebook) as well as booleans and pandas string
    # dtypes, all of which read better as box plots than as scatter plots.
    is_categorical = (
        isinstance(column.dtype, pd.CategoricalDtype)
        or dtype_checks.is_object_dtype(column)
        or dtype_checks.is_string_dtype(column)
        or dtype_checks.is_bool_dtype(column)
    )

    plt.figure(figsize=(10, 6))
    if is_categorical:
        sns.boxplot(x=column, y=df[target])
    else:
        sns.scatterplot(x=column, y=df[target])
    plt.title(f"Distribution of {feature} vs {target}")
    plt.show()


for feature in top_5_features:
    plot_distribution(df_eda, feature)

### Pie chart of "Weekly_Sales" by Store Type

In [None]:
# Total Weekly_Sales per store type, flattened back into a two-column frame.
type_totals = df.groupby('Type')['Weekly_Sales'].sum()
sales_by_store_type = type_totals.reset_index()

fig = px.pie(
    sales_by_store_type,
    names='Type',
    values='Weekly_Sales',
    title="Weekly Sales by Store Type",
)
fig.show()

In [None]:
### Pie chart of "Weekly_Sales" by Store

In [None]:
# Total Weekly_Sales per store. "Store" is a categorical column, so `observed`
# is passed explicitly: relying on the default triggers a pandas FutureWarning
# because that default is changing. Only stores that occur in the data are
# grouped.
sales_by_store = (
    df.groupby('Store', observed=True)['Weekly_Sales']
    .sum()
    .reset_index()
)
fig = px.pie(sales_by_store, values='Weekly_Sales', names='Store', title="Weekly Sales by Store")
fig.show()

### Parallel Plot of top 5 features + Weekly_Sales

In [None]:
# One axis per selected feature; each line is coloured by its Weekly_Sales value.
fig = px.parallel_coordinates(
    df_eda,
    dimensions=top_5_features,
    color="Weekly_Sales",
    title="Parallel Plot: Top 5 Correlated Features vs Weekly_Sales",
    color_continuous_scale=px.colors.diverging.Tealrose,
)
fig.show()

In [None]:
### Conclusion

In [None]:
# Summary of the study, printed one statement per line.
conclusion_lines = (
    "Conclusions: ",
    f"The top 5 features most correlated with Weekly Sales are {top_5_features}.",
    "We have visualized their relationships with the target variable and conducted distribution analysis.",
    "There are many missing values which should be handled.",
)
for conclusion_line in conclusion_lines:
    print(conclusion_line)