# Imports

In [0]:
!pip install -r requirements.txt

In [0]:
import pandas as pd
import numpy as np
import os
import zipfile
import getpass

--- SECTION 1: KAGGLE SETUP FOR DATABRICKS ---

In [0]:
# Download the raw Kaggle competition files into DBFS, unless they are
# already there.
#
# Kaggle credentials are deliberately NOT stored in this notebook. They are
# taken from the KAGGLE_USERNAME / KAGGLE_KEY environment variables when those
# are already set, and otherwise requested interactively; the API key is read
# with getpass so it is not echoed into the cell output.
# NOTE(review): an API key used to be embedded in this cell. Treat that key as
# compromised and regenerate it in the Kaggle account settings.
# NOTE(review): on Databricks a secret scope (dbutils.secrets) is presumably
# the better source for these values -- confirm and wire in if available.
DATA_DIR = '/dbfs/FileStore/walmart_project/data/raw'
COMPETITION_NAME = 'walmart-recruiting-store-sales-forecasting'

# exist_ok=True already makes this a no-op when the directory is present,
# so no separate existence check is needed.
os.makedirs(DATA_DIR, exist_ok=True)

# The presence of train.csv is used as the marker for "raw data downloaded".
if not os.path.exists(os.path.join(DATA_DIR, 'train.csv')):
    print("Raw data not found. Downloading from Kaggle...")

    # Credentials are only needed when a download actually happens. They are
    # placed in the environment before the Kaggle client is imported and
    # authenticated below.
    if not os.environ.get('KAGGLE_USERNAME'):
        os.environ['KAGGLE_USERNAME'] = input("Kaggle username: ").strip()
    if not os.environ.get('KAGGLE_KEY'):
        os.environ['KAGGLE_KEY'] = getpass.getpass("Kaggle API key: ").strip()

    from kaggle.api.kaggle_api_extended import KaggleApi

    api = KaggleApi()
    api.authenticate()

    api.competition_download_files(COMPETITION_NAME, path=DATA_DIR, quiet=True)

    # The competition is delivered as one archive named after the competition,
    # which in turn contains individually zipped CSV files.
    master_zip_path = os.path.join(DATA_DIR, f'{COMPETITION_NAME}.zip')
    with zipfile.ZipFile(master_zip_path, 'r') as z:
        z.extractall(DATA_DIR)
    for item in ['train.csv.zip', 'test.csv.zip', 'features.csv.zip']:
        with zipfile.ZipFile(os.path.join(DATA_DIR, item), 'r') as z:
            z.extractall(DATA_DIR)
    print("Data successfully downloaded and unzipped to DBFS.")
else:
    print("Raw data already exists in DBFS. Skipping download.")

# LOAD, MERGE, AND PROCESS DATA

In [0]:
from src.preprocessing import advanced_feature_engineering

# Read the three raw CSV files from the DBFS raw-data directory, in the order
# train -> features -> stores.
train_df, features_df, stores_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ('train.csv', 'features.csv', 'stores.csv')
)

# Left-join the features onto the training rows (keyed on Store, Date and
# IsHoliday), then left-join the store table (keyed on Store).
raw_train_data = (
    train_df
    .merge(features_df, on=['Store', 'Date', 'IsHoliday'], how='left')
    .merge(stores_df, on='Store', how='left')
)

# Run the project's feature-engineering step on the merged training frame.
print("Applying feature engineering...")
train_processed = advanced_feature_engineering(raw_train_data)
print("Processing complete.")

# SAVE PROCESSED DATA TO DBFS

In [0]:
# Write the processed train and test sets as CSV files under PROCESSED_DIR.
PROCESSED_DIR = '/dbfs/FileStore/walmart_project/data/processed'

# exist_ok=True makes directory creation idempotent, so a separate
# "does it exist?" check beforehand is redundant (and race-prone).
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Save the train set (without the DataFrame index column).
train_processed.to_csv(os.path.join(PROCESSED_DIR, 'train_processed_final.csv'), index=False)

# Build the test set with the same two left joins used for the training data
# (features on Store/Date/IsHoliday, then stores on Store), run the same
# feature-engineering function on it, and save the result.
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
raw_test_data = (
    test_df
    .merge(features_df, on=['Store', 'Date', 'IsHoliday'], how='left')
    .merge(stores_df, on='Store', how='left')
)
test_processed = advanced_feature_engineering(raw_test_data)
test_processed.to_csv(os.path.join(PROCESSED_DIR, 'test_processed_final.csv'), index=False)

print(f"Final processed datasets have been saved to the '{PROCESSED_DIR}' directory in DBFS.")
