# Imports

In [None]:
!pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np
import os
import zipfile
import getpass

# DOWNLOAD AND UNZIP RAW DATA

In [None]:
# Download the Kaggle competition archive into DATA_DIR and unpack it.
# The whole step is skipped when DATA_DIR already contains train.csv.
DATA_DIR = 'data/raw'
COMPETITION_NAME = 'walmart-recruiting-store-sales-forecasting'

# exist_ok avoids the separate "does it exist?" check and its race window.
os.makedirs(DATA_DIR, exist_ok=True)

if not os.path.exists(os.path.join(DATA_DIR, 'train.csv')):
    print("Raw data not found. Attempting to download from Kaggle...")

    # Universal Authentication Block
    try:
        # This works on a local machine where kaggle.json is in ~/.kaggle/
        from kaggle.api.kaggle_api_extended import KaggleApi
        api = KaggleApi()
        api.authenticate()
        print("Kaggle API authenticated successfully (Local).")
    except (OSError, ImportError):
        # This block will be triggered in Google Colab
        # (presumably because the package or the credentials file is missing
        # there -- verify against the kaggle client version in use).
        print("Local authentication failed. Attempting Colab setup...")
        try:
            from google.colab import files
            # Prompt user to upload kaggle.json
            if not os.path.exists('kaggle.json'):
                print("Please upload your kaggle.json file.")
                files.upload()

            # Create the Kaggle config directory and move the token into it,
            # readable by the current user only.
            kaggle_dir = os.path.expanduser('~/.kaggle')
            kaggle_token = os.path.join(kaggle_dir, 'kaggle.json')
            os.makedirs(kaggle_dir, exist_ok=True)
            os.rename('kaggle.json', kaggle_token)
            os.chmod(kaggle_token, 0o600)

            from kaggle.api.kaggle_api_extended import KaggleApi
            api = KaggleApi()
            api.authenticate()
            print("Kaggle API authenticated successfully (Colab).")
        except Exception as e:
            # Best effort: report the failure and fall through to the
            # "could not authenticate" branch below.
            print(f"Colab authentication failed. Error: {e}")
            api = None

    # Download and Unzip if authentication was successful
    if api:
        print(f"Downloading data for competition '{COMPETITION_NAME}'...")
        api.competition_download_files(COMPETITION_NAME, path=DATA_DIR, quiet=True)

        master_zip_path = os.path.join(DATA_DIR, f'{COMPETITION_NAME}.zip')
        with zipfile.ZipFile(master_zip_path, 'r') as z:
            z.extractall(DATA_DIR)

        # Unpack the per-file archives shipped inside the master archive.
        # An archive that is absent is skipped: the master archive may already
        # hold the plain CSV. The check after the loop still fails fast if a
        # CSV ends up missing.
        nested_archives = ['train.csv.zip', 'test.csv.zip', 'features.csv.zip']
        for item in nested_archives:
            nested_zip_path = os.path.join(DATA_DIR, item)
            if not os.path.exists(nested_zip_path):
                continue
            with zipfile.ZipFile(nested_zip_path, 'r') as z:
                z.extractall(DATA_DIR)

        # Each '<name>.csv.zip' is expected to yield '<name>.csv'.
        missing_csvs = [
            item[:-len('.zip')]
            for item in nested_archives
            if not os.path.exists(os.path.join(DATA_DIR, item[:-len('.zip')]))
        ]
        if missing_csvs:
            raise FileNotFoundError(
                f"Expected file(s) not found in '{DATA_DIR}' after extraction: {missing_csvs}"
            )
        print("Data successfully downloaded and unzipped.")
    else:
        print("FATAL: Could not authenticate with Kaggle. Cannot proceed.")
else:
    print("Raw data already exists. Skipping download.")

Kaggle API authenticated successfully.
Data files already exist. Skipping download.


# LOAD, MERGE, AND PROCESS DATA

In [None]:
# Import the project's feature-engineering helper, then load the raw CSVs,
# join them into one training frame and run the helper on it.
try:
    from src.preprocessing import advanced_feature_engineering
except ImportError:
    print("ERROR: Could not import 'advanced_feature_engineering'.")
    print("Please ensure you are running this notebook from your project's root directory and that 'src/preprocessing.py' exists.")
    # Re-raise instead of calling exit(): exit() is an interactive helper that
    # asks a notebook kernel to shut down (losing all state) and hides the
    # original traceback. Re-raising stops the cell and shows the real cause.
    raise

# Load raw data from the 'data/raw' folder
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
features_df = pd.read_csv(os.path.join(DATA_DIR, 'features.csv'))
stores_df = pd.read_csv(os.path.join(DATA_DIR, 'stores.csv'))

# Merge data: left joins keep every training row; features are matched on
# (Store, Date, IsHoliday) and store attributes on Store.
raw_train_data = train_df.merge(features_df, on=['Store', 'Date', 'IsHoliday'], how='left')
raw_train_data = raw_train_data.merge(stores_df, on='Store', how='left')

# Apply the advanced feature engineering
print("Applying feature engineering...")
train_processed = advanced_feature_engineering(raw_train_data)
print("Processing complete.")

# SAVE PROCESSED DATA

In [None]:
# Output folder for the engineered datasets; created on first run.
PROCESSED_DIR = 'data/processed'
if not os.path.exists(PROCESSED_DIR):
    os.makedirs(PROCESSED_DIR)

# Write the engineered training frame before touching the test data.
train_output_path = os.path.join(PROCESSED_DIR, 'train_processed_final.csv')
train_processed.to_csv(train_output_path, index=False)

# Build the raw test frame with the same left joins applied to the training
# data: features on (Store, Date, IsHoliday), store attributes on Store.
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
raw_test_data = (
    test_df
    .merge(features_df, how='left', on=['Store', 'Date', 'IsHoliday'])
    .merge(stores_df, how='left', on='Store')
)

# The test set has no target column. A placeholder 'Weekly_Sales' is added
# for the feature-engineering call and removed again from its result.
raw_test_data['Weekly_Sales'] = 0
test_processed = advanced_feature_engineering(raw_test_data)
test_processed.drop(columns='Weekly_Sales', inplace=True)

test_output_path = os.path.join(PROCESSED_DIR, 'test_processed_final.csv')
test_processed.to_csv(test_output_path, index=False)

print(f"Final processed datasets have been saved to the '{PROCESSED_DIR}' directory.")
print("You are now ready to run your modeling notebooks.")