In [1]:
# train_and_save_model.ipynb (FINAL PRODUCTION VERSION)

import pandas as pd
from joblib import dump
import os
import numpy as np
from datetime import timedelta
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import warnings

# Suppress every warning raised during the rest of the run.
warnings.filterwarnings('ignore')

# Raise the pandas display limits so long/wide frames are printed in full.
for _opt_name, _opt_value in (('display.max_rows', 5000), ('display.max_columns', 50)):
    pd.set_option(_opt_name, _opt_value)

# --- Cell 1: Import Libraries and Setup ---

# Relative locations of the output artifact and the input workbook.
MODEL_PATH = 'models/inventory_model.pkl'
DATA_FILE_PATH = 'data/Training_Data_Final.xlsx'

# Predictor columns fed to the model (the per-SKU Usage_Qty is the target).
X_features = [
    'day_index',
    'month_num',
    'year_num',
    'Patient_Count',
    'lag1',
    'Total_SKU_Usage',
]

# Make sure both working folders exist before anything reads or writes them.
for _folder in ('models', 'data'):
    os.makedirs(_folder, exist_ok=True)

print("--- 1. PROJECT SETUP ---")
print(f"Data file expected at: {DATA_FILE_PATH}")
print(f"Model will be saved to: {MODEL_PATH}")

# --- Cell 2: Load and prepare the data (prediction target: per-SKU Usage_Qty) ---
#
# Produces the module-level names used by the later cells:
#   df_train         - one row per (Date, SKU) with time features and lag1
#   base_date        - earliest date; day_index is counted from it
#   max_date         - latest date present in the training rows
#   final_item_list  - list of SKUs seen in the data
#
# NOTE(review): any exception raised while loading or transforming the workbook
# is caught below and replaced by synthetic mock data, so the artifact saved
# later may have been trained on mock rows. Check the printed
# "FATAL DATA ERROR" banner before shipping a model.

# Placeholder defaults; both branches below overwrite them.
df_train = pd.DataFrame()
base_date = pd.to_datetime('2024-01-01')

try:
    # 1. Load the final training-data workbook.
    df_raw = pd.read_excel(DATA_FILE_PATH)

    # 2. Cleansing and feature engineering.
    df_raw.columns = df_raw.columns.astype(str)
    df_raw['Usage_Qty'] = pd.to_numeric(df_raw['Usage_Qty'], errors='coerce')
    # Dates stored as text would break the `.dt` accessors and the date
    # subtraction further down (and silently trigger the mock fallback).
    # Coerce them here; unparseable values become NaT and are removed by the
    # dropna below. An already-datetime column passes through unchanged.
    df_raw['Date'] = pd.to_datetime(df_raw['Date'], errors='coerce')

    # Columns that must be present for a row to be kept (SOH is not included).
    subset_for_dropna = ['Date', 'Usage_Qty', 'Patient_Count', 'SKU', 'Total_SKU_Usage', 'Lead_Time_Days', 'Safety_Stock_Qty', 'Unit_Cost']

    df_raw = df_raw.dropna(subset=subset_for_dropna)
    df_raw = df_raw[df_raw['Usage_Qty'] > 0]

    print("\n--- 2A. RAW DATA CLEANED ---")
    print(f"Total cleaned rows (Pre-Aggregation): {df_raw.shape[0]}")

    # 3. Enforce one row per [Date + SKU]: duplicated pairs have their
    #    Usage_Qty summed, while the other columns keep the first value seen.
    agg_dict = {
        'Usage_Qty': 'sum',
        'Patient_Count': 'first',
        'Total_SKU_Usage': 'first',
        'Lead_Time_Days': 'first',
        'Safety_Stock_Qty': 'first',
        'Unit_Cost': 'first',
    }

    # sort=True is the pandas default; it is spelled out because the lag
    # feature below relies on each SKU's rows being in ascending Date order.
    df_raw = df_raw.groupby(['Date', 'SKU'], as_index=False, sort=True).agg(agg_dict)

    # 4. Time features.
    base_date = df_raw['Date'].min()
    df_raw['day_index'] = (df_raw['Date'] - base_date).dt.days
    df_raw['month_num'] = df_raw['Date'].dt.month
    df_raw['year_num'] = df_raw['Date'].dt.year

    # 5. Lag feature: the previous recorded Usage_Qty of the same SKU.
    #    (The original note calls this "last month's usage"; that only holds
    #    if every SKU has one row per consecutive month - TODO confirm.)
    df_raw['lag1'] = df_raw.groupby('SKU')['Usage_Qty'].shift(1)

    # 6. Final preparation: the first row of every SKU has no lag value.
    df_train = df_raw.dropna(subset=['lag1'])

    # Fail before deriving any metadata from a frame with no rows.
    if df_train.empty:
        raise ValueError("Data frame is empty after processing. Cannot train model.")

    # 7. Record the final metadata.
    final_item_list = df_raw['SKU'].unique().tolist()
    max_date = df_train['Date'].max()

    print("\n--- 2B. FINAL TRAINING DATA SAMPLE (INDIVIDUAL SKU) ---")

    # [1] Show the first five rows as a sample.
    print("\n[--- SAMPLE DATA HEAD ---]")
    print(df_train[['Date', 'SKU', 'Usage_Qty', 'Patient_Count', 'Total_SKU_Usage', 'lag1']].head())

    # [2] Show the final size and date range.
    print("\n[--- FINAL DATA SHAPE ---]")
    print(f"Final training shape: {df_train.shape}")
    print(f"Base Date for modeling: {base_date.strftime('%Y-%m-%d')}")
    print(f"Max Date for training: {max_date.strftime('%Y-%m-%d')}")

    print(f"\n✅ Loaded and transformed data successfully from {DATA_FILE_PATH}")

except Exception as e:
    # Deliberate best-effort: keep the pipeline (and the API that loads the
    # saved model) working even when the real data cannot be used.
    print(f"\n⚠️ **FATAL DATA ERROR** ({type(e).__name__}: {e}). Using Mock Data for training instead.")

    # Fallback mock data: 100 daily rows for each of two SKUs.
    np.random.seed(42)
    days = pd.date_range(start='2024-01-01', periods=100)
    base_date = pd.to_datetime('2024-01-01')
    max_date = days.max()
    final_item_list = ['10300', '42132']
    df_mock = pd.DataFrame({
        'Date': np.repeat(days, 2),
        'SKU': np.tile(final_item_list, 100),
        'Usage_Qty': np.random.randint(10, 50, size=200) + np.arange(200) * 0.1,
        'Patient_Count': np.repeat(1500 + np.random.randint(-100, 100, size=100), 2)
    }).sort_values(by='Date')
    df_mock['Total_SKU_Usage'] = df_mock.groupby('Date')['Usage_Qty'].transform('sum')
    df_mock['day_index'] = (df_mock['Date'] - df_mock['Date'].min()).dt.days
    df_mock['month_num'] = df_mock['Date'].dt.month
    df_mock['year_num'] = df_mock['Date'].dt.year
    df_mock['lag1'] = df_mock.groupby('SKU')['Usage_Qty'].shift(1)
    # Drops the first row of each SKU, which has no lag value.
    df_train = df_mock.dropna()

# --- Cell 3: Train the model (XGBoost regressor) ---

# The target is the per-SKU usage quantity; the predictors are the columns
# listed in X_features.
y_train = df_train['Usage_Qty']
X_train = df_train[X_features]

# Hyperparameters gathered in one place, then unpacked into the estimator.
_xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
)
model = xgb.XGBRegressor(**_xgb_params)
model.fit(X_train, y_train)

# Scored on the same rows the model was fitted on, so this is an in-sample
# figure rather than a hold-out evaluation.
r_squared = model.score(X_train, y_train)
print(f"Model Trained. R-squared (simple): {r_squared:.4f}")

# --- Cell 4: Persist the model together with its key metadata ---

# Bundle the fitted estimator with the values gathered during data preparation
# so that everything is written to a single joblib file at MODEL_PATH.
model_metadata = dict(
    model=model,
    base_date=base_date,
    max_date=max_date,
    item_list=final_item_list,
    features=X_features,
)
dump(model_metadata, MODEL_PATH)

print(f"Model and Metadata saved successfully to {MODEL_PATH}")

--- 1. PROJECT SETUP ---
Data file expected at: data/Training_Data_Final.xlsx
Model will be saved to: models/inventory_model.pkl

--- 2A. RAW DATA CLEANED ---
Total cleaned rows (Pre-Aggregation): 1331

--- 2B. FINAL TRAINING DATA SAMPLE (INDIVIDUAL SKU) ---

[--- SAMPLE DATA HEAD ---]
         Date             SKU  Usage_Qty  Patient_Count  Total_SKU_Usage  lag1
40 2023-02-01           10300          1           1193               47   1.0
41 2023-02-01           10304          1           1193               47   1.0
44 2023-02-01  42131611000009          1           1193               47   1.0
46 2023-02-01  42132205000055          1           1193               47   1.0
47 2023-02-01  42132205000057          1           1193               47   1.0

[--- FINAL DATA SHAPE ---]
Final training shape: (1103, 12)
Base Date for modeling: 2023-01-01
Max Date for training: 2025-09-01

✅ Loaded and transformed data successfully from data/Training_Data_Final.xlsx
Model Trained. R-squared (simp