# Exploratory Data Analysis (EDA) for Rossmann Pharmaceutical Data Analysis
## What this notebook does:

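- Load the data
- Clean the data (missing values and outliers)
- Engineer holiday and time features
- Perform basic statistics on the data (sales correlation, promotion effect)
- Visualize the sales distribution and correlations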

In [1]:
# Import necessary libraries
import sys
import os
import pandas as pd

In [None]:
# Put the parent of the working directory on the import path; the
# `from scripts...` imports used by this notebook rely on it being findable.
# NOTE: this assumes the notebook is started from a direct child folder of
# the project root - confirm if the notebook is ever moved or launched
# from a different location.
current_dir = os.getcwd()
print(current_dir)

# The parent of the working directory is treated as the project root.
parent_dir = os.path.dirname(current_dir)
print(parent_dir)

# Re-running this cell must not stack duplicate entries on sys.path, so the
# directory is only prepended when it is not already registered.
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

In [3]:
# Import the scripts
from scripts.data_cleaning import handle_missing_values, remove_outliers
from scripts.feature_engineering import add_holiday_flag, create_time_features
from scripts.analysis import sales_correlation, promo_effect
from scripts.visualizations import plot_sales_distribution, plot_correlation

In [None]:
# Read the training and test CSV files, in that order, into `train` and `test`
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)


In [None]:
# Clean the training data: run the missing-value handler first, then feed
# its result to the outlier removal for the 'Sales' column
train = remove_outliers(handle_missing_values(train), 'Sales')

In [6]:
# Derive extra features: flag the listed holiday dates first, then pass the
# flagged data on to the time-feature helper
holidays = ['2015-12-25', '2015-01-01']
train = create_time_features(add_holiday_flag(train, holidays))

In [None]:
# Summary figures: report the value returned by sales_correlation, then the
# pair of values returned by promo_effect
sales_customer_corr = sales_correlation(train)
print("Sales-Customer Correlation:", sales_customer_corr)

promo_sales, no_promo_sales = promo_effect(train)
print(f"Promo Sales: {promo_sales}, No Promo Sales: {no_promo_sales}")

In [None]:

# Visualizations
# Both helpers are imported from scripts.visualizations and receive the
# `train` data as prepared by the earlier cells. Judging by their names they
# presumably draw the sales distribution and a correlation plot - confirm
# against that module.
plot_sales_distribution(train)
# Kept as the cell's final expression: a notebook echoes a non-None return
# value of the last expression, so wrapping or reordering these calls could
# change what the cell displays.
plot_correlation(train)