# Data Collection

## 1.1 Import Libraries


In [None]:
# Import necessary libraries
import os
import sys
import joblib
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Append the correct path for custom module imports
sys.path.append(os.path.abspath('../src'))

# Import custom modules
from preprocess import Preprocessor
from model import ModelTrainer


## 1.2 Import Warnings and Logging Libraries


In [None]:
# Warnings and logging
import warnings
import logging

# Silence every warning for the rest of the session to keep cell output readable.
# NOTE: this blanket filter also hides deprecation warnings; narrow it when debugging.
warnings.filterwarnings("ignore")

# Make the project directories importable. Entries already present on sys.path are
# skipped so that re-running this cell (or having added '../src' earlier in the
# notebook) does not pile up duplicate search-path entries.
for _extra_dir in ('../src', '../data', '../notebook/models'):
    _extra_path = os.path.abspath(_extra_dir)
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

# Optional custom logging (currently commented out)
# from custom_logging import info_logger, error_logger


## 1.3 Commented Out Imports for Future Reference


In [None]:
# from data_processing import load_data, clean_data, handle_missing_values
# from sale_analysis import (
#     plot_sales_distribution,
#     compare_sales_holidays,
#     seasonal_behavior,
#     correlation_analysis,
#     promo_effect,
#     effective_promo_deployment,
#     customer_behavior_trends,
#     weekday_openings,
#     assortment_type_impact,
#     competitor_distance_impact,
#     new_competitor_effects, 
#     plot_promo_distribution,
#     plot_sales_during_holidays,
#     plot_sales_customers_corr, 
#     plot_store_corr
# )


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and echo the full path of every file found,
# so the available dataset files show up in the cell output.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


## 1.4 Load the Data


In [None]:
# All three Rossmann CSV files live in the same Kaggle input directory
dataset_dir = '/kaggle/input/rossmann-store-sales'
train_path = f'{dataset_dir}/train.csv'
test_path = f'{dataset_dir}/test.csv'
store_path = f'{dataset_dir}/store.csv'

# Build the project's Preprocessor from the three file paths (train, test, store order)
preprocessor = Preprocessor(train_path, test_path, store_path)

# load_data() hands back three objects, unpacked here as train, test and store data
train_data, test_data, store_data = preprocessor.load_data()


# Data Preprocessing

## 2.1 Summary of Dataset


In [None]:
# Summarize the loaded datasets using the project's Preprocessor instance.
# NOTE(review): presumably prints or returns an overview of the train/test/store data;
# confirm the exact output against the Preprocessor implementation.
preprocessor.summarize_data()


## 2.2 Descriptive Statistics for Training Data


In [None]:
# Display descriptive statistics for the training dataset.
# describe() is the cell's final expression, so the notebook renders its result.
train_data.describe()


## 2.3 Data Types and Descriptive Statistics for Test and Store Data


In [None]:
# Print the column data types of the test dataset first, then of the store dataset
for frame in (test_data, store_data):
    print(frame.dtypes)

# Descriptive statistics for the store dataset; as the cell's final expression
# the result is rendered by the notebook
store_data.describe()


## 2.4 Data Cleaning

### 2.4.1 Checking Missing Values in Training Data


In [None]:
# Check for missing values in the training dataset: isna() flags missing entries
# and sum() totals the flags (per column, assuming train_data is a pandas DataFrame).
train_data.isna().sum()


### 2.4.2 Checking Missing Values in Testing Data


In [None]:
# Check for missing values in the testing dataset: isna() flags missing entries
# and sum() totals the flags (per column, assuming test_data is a pandas DataFrame).
test_data.isna().sum()


### 2.4.3 Checking Missing Values in Store Data


In [None]:
# Count the missing values in the store dataset (isna() is the same check as
# isnull(), assuming store_data is a pandas DataFrame)
store_data.isna().sum()
