In [4]:
# train_and_save_model.ipynb (FINAL VERSION - Complete Code)

import pandas as pd
from joblib import dump
import os
import numpy as np
from datetime import timedelta
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import warnings

# Silence all library warnings so the notebook output stays readable.
# NOTE(review): this is a blanket filter, so deprecation and pandas
# chained-assignment warnings are hidden as well.
warnings.filterwarnings('ignore')

# Display limits were raised above the 1,105 rows mentioned in the original
# note, so that printing the training frame shows every row.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 50)  # 50 columns, just in case

# --- Cell 1: Import Libraries and Setup ---

DATA_FILE_PATH = 'data/Training_Data_Final.xlsx'
MODEL_PATH = 'models/inventory_model.pkl'
X_features = ['day_index', 'month_num', 'year_num', 'Patient_Count', 'lag1', 'Total_SKU_Usage']

# Create the parent folder of each path. Deriving the folder from the constant
# (instead of repeating the literal name) keeps the two in sync when a path is
# edited. A bare filename has an empty dirname, which os.makedirs rejects, so
# that case is skipped.
for _artifact_path in (MODEL_PATH, DATA_FILE_PATH):
    _parent_dir = os.path.dirname(_artifact_path)
    if _parent_dir:
        os.makedirs(_parent_dir, exist_ok=True)

print("--- 1. PROJECT SETUP ---")
print(f"Data file expected at: {DATA_FILE_PATH}")
print(f"Model will be saved to: {MODEL_PATH}")

# --- Cell 2: Load and prepare data (prediction target: Usage_Qty per SKU) ---


def add_time_and_lag_features(df, origin):
    """Return a date-ordered copy of *df* with calendar and lag features.

    Args:
        df: Frame with a datetime ``Date`` column plus ``SKU`` and
            ``Usage_Qty`` columns.
        origin: Timestamp from which ``day_index`` is counted.

    Returns:
        A new DataFrame sorted by ``Date`` with ``day_index``, ``month_num``,
        ``year_num`` and ``lag1`` added. ``lag1`` is the ``Usage_Qty`` of the
        previous row of the same SKU and is NaN on each SKU's first row.
    """
    # shift(1) looks at the previous *row* within each SKU, so the rows must be
    # in chronological order first. mergesort is stable: rows sharing a date
    # keep their incoming relative order.
    out = df.sort_values(by='Date', kind='mergesort')
    out['day_index'] = (out['Date'] - origin).dt.days
    out['month_num'] = out['Date'].dt.month
    out['year_num'] = out['Date'].dt.year
    out['lag1'] = out.groupby('SKU')['Usage_Qty'].shift(1)
    return out


df_train = pd.DataFrame()
base_date = pd.to_datetime('2024-01-01')
# Set to True when the except-branch below substitutes synthetic rows for the
# real file, so later code can tell a mock-trained model from a real one.
using_mock_data = False

try:
    # 1. Load the consolidated training data file.
    df_raw = pd.read_excel(DATA_FILE_PATH)

    # 2. Cleansing: normalise column labels and column types. Unparseable
    #    dates/numbers become NaT/NaN and are removed by the dropna below.
    df_raw.columns = df_raw.columns.astype(str)
    df_raw['Date'] = pd.to_datetime(df_raw['Date'], errors='coerce')
    numeric_cols = ['Usage_Qty', 'Patient_Count', 'Total_SKU_Usage']
    df_raw[numeric_cols] = df_raw[numeric_cols].apply(pd.to_numeric, errors='coerce')

    df_raw = df_raw.dropna(subset=['Date', 'Usage_Qty', 'Patient_Count', 'SKU', 'Total_SKU_Usage'])
    df_raw = df_raw[df_raw['Usage_Qty'] > 0]

    print("\n--- 2A. RAW DATA CLEANED ---")
    print(f"Total cleaned rows: {df_raw.shape[0]}")

    # 3./4. Time features and the per-SKU lag (usage of the previous record).
    base_date = df_raw['Date'].min()
    df_raw = add_time_and_lag_features(df_raw, base_date)

    # 5. Final preparation: the first row of every SKU has no lag value.
    df_train = df_raw.dropna(subset=['lag1'])

    if df_train.empty:
        raise ValueError("Data frame is empty after processing. Cannot train model.")

    # 6. Record the final metadata.
    final_item_list = df_train['SKU'].unique().tolist()
    max_date = df_train['Date'].max()

    print("\n--- 2B. FINAL TRAINING DATA SAMPLE (INDIVIDUAL SKU) ---")

    # Print every training row, as requested.
    print("\n[--- ALL TRAINING DATA ROWS ---]")
    print(df_train[['Date', 'SKU', 'Usage_Qty', 'Patient_Count', 'Total_SKU_Usage', 'lag1']])

    print("\n[--- FINAL DATA SHAPE ---]")
    print(f"Final training shape: {df_train.shape}")
    print(f"Base Date for modeling: {base_date.strftime('%Y-%m-%d')}")
    print(f"Max Date for training: {max_date.strftime('%Y-%m-%d')}")

    print(f"\n✅ Loaded and transformed data successfully from {DATA_FILE_PATH}")

except Exception as e:
    # Deliberate best-effort fallback: any failure above (missing file, missing
    # column, empty result, ...) switches the notebook to synthetic data.
    print(f"\n⚠️ **FATAL DATA ERROR** ({type(e).__name__}: {e}). Using Mock Data for training instead.")
    using_mock_data = True

    # Fallback mock data: 100 daily records for two SKUs, seeded for
    # reproducibility.
    np.random.seed(42)
    days = pd.date_range(start='2024-01-01', periods=100)
    base_date = pd.to_datetime('2024-01-01')
    max_date = days.max()
    final_item_list = ['10300', '42132']
    df_mock = pd.DataFrame({
        'Date': np.repeat(days, 2),
        'SKU': np.tile(final_item_list, 100),
        'Usage_Qty': np.random.randint(10, 50, size=200) + np.arange(200) * 0.1,
        'Patient_Count': np.repeat(1500 + np.random.randint(-100, 100, size=100), 2)
    }).sort_values(by='Date')
    # NOTE(review): this is the same-day sum of Usage_Qty over all SKUs, so the
    # feature contains the target value of the row it belongs to.
    df_mock['Total_SKU_Usage'] = df_mock.groupby('Date')['Usage_Qty'].transform('sum')
    df_mock = add_time_and_lag_features(df_mock, df_mock['Date'].min())
    df_train = df_mock.dropna()

# --- Cell 3: Train the model (XGBoost Regressor) ---
print("\n--- 3. MODEL TRAINING ---")

# Predictors are the columns listed in X_features; the target is the
# per-SKU usage quantity.
X_train = df_train[X_features]
y_train = df_train['Usage_Qty']

print(f"XGBoost features used: {X_features}")
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

# Squared-error gradient-boosted regressor with a fixed seed, fitted on all
# available rows using every CPU core.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
)
model.fit(X_train, y_train)

# This R-squared is measured on the same rows the model was fitted on, so it
# describes in-sample fit only.
r_squared = model.score(X_train, y_train)
print(f"Model Trained. R-squared (simple): {r_squared:.4f}")

# --- Cell 4: Save the model and its key metadata ---
print("\n--- 4. SAVING MODEL ---")

# Bundle the fitted estimator with the values recorded during data
# preparation (date range, SKU list, feature order) into a single object and
# serialise it with joblib.
model_metadata = dict(
    model=model,
    base_date=base_date,
    max_date=max_date,
    item_list=final_item_list,
    features=X_features,
)
dump(model_metadata, MODEL_PATH)

print(f"Model and Metadata saved successfully to {MODEL_PATH}")

--- 1. PROJECT SETUP ---
Data file expected at: data/Training_Data_Final.xlsx
Model will be saved to: models/inventory_model.pkl

⚠️ **FATAL DATA ERROR** (FileNotFoundError: [Errno 2] No such file or directory: 'data/Training_Data_Final.xlsx'). Using Mock Data for training instead.

--- 3. MODEL TRAINING ---
XGBoost features used: ['day_index', 'month_num', 'year_num', 'Patient_Count', 'lag1', 'Total_SKU_Usage']
X_train shape: (198, 6), y_train shape: (198,)
Model Trained. R-squared (simple): 0.9487

--- 4. SAVING MODEL ---
Model and Metadata saved successfully to models/inventory_model.pkl
