# 2. การประมวลผลข้อมูล (Data Preprocessing)
## ขั้นตอนการประมวลผลข้อมูลสำหรับ Crypto Trading

### เป้าหมาย:
- ทำความสะอาดข้อมูล
- คำนวณ Technical Indicators
- Normalize ข้อมูล
- แบ่งข้อมูลเป็น Training/Validation/Testing
- บันทึกข้อมูลที่ประมวลผลแล้ว

## Cell 1: Import Libraries และตั้งค่าโฟลเดอร์ข้อมูล

In [None]:
import sys
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle
from datetime import datetime
import ta  # Technical Analysis library
from sklearn.preprocessing import MinMaxScaler

# Import config
from config import *

# Directory layout: raw CSV input is read from RAW_DIR, every output of this
# notebook is written to PROCESSED_DIR.
RAW_DIR = "raw_data"
PROCESSED_DIR = "processed_data"

# exist_ok=True creates the directory only when it is missing, without the
# race window of a separate os.path.exists() check followed by makedirs().
os.makedirs(PROCESSED_DIR, exist_ok=True)

print("📁 Setup directories completed")
print("📊 Starting Data Preprocessing Process")

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

## Cell 2: โหลดและทำความสะอาดข้อมูล

In [None]:
def load_raw_data(raw_dir=None):
    """Load the most recently modified raw CSV file into a DataFrame.

    Args:
        raw_dir: Directory searched for ``.csv`` files. When omitted, the
            module-level ``RAW_DIR`` is used.

    Returns:
        The CSV contents as a DataFrame whose ``timestamp`` column has been
        converted with ``pd.to_datetime``.

    Raises:
        FileNotFoundError: If the directory contains no ``.csv`` file.
    """
    if raw_dir is None:
        raw_dir = RAW_DIR

    print("📂 Loading raw data...")

    csv_files = [f for f in os.listdir(raw_dir) if f.endswith('.csv')]
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {raw_dir} directory")

    # Pick the newest file by modification time. getctime() is the inode
    # change time on POSIX systems (it moves on chmod/rename), so it does not
    # reliably identify the most recent data. The file name is a tie-breaker
    # so the choice does not depend on os.listdir() ordering.
    latest_file = max(
        csv_files,
        key=lambda name: (os.path.getmtime(os.path.join(raw_dir, name)), name),
    )
    file_path = os.path.join(raw_dir, latest_file)

    df = pd.read_csv(file_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    print(f"✅ Loaded {len(df)} rows from {latest_file}")
    print(f"📅 Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    print(f"📊 Symbols: {df['tic'].unique()}")

    return df

def clean_data(df):
    """Deduplicate, sort, fill gaps and clip outliers in raw OHLCV data.

    Steps, in order:
      1. Drop fully duplicated rows.
      2. Sort by ``timestamp`` then ``tic`` and reset the index.
      3. Forward-fill missing open/high/low/close values within each symbol
         and replace missing ``volume`` with 0.
      4. Clip open/high/low/close/volume to ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``,
         with the quartiles computed separately for each symbol.

    Args:
        df: DataFrame with ``timestamp``, ``tic``, ``open``, ``high``,
            ``low``, ``close`` and ``volume`` columns.

    Returns:
        A new, cleaned DataFrame; the input frame is not modified.
    """
    print("🧹 Cleaning data...")

    # Remove duplicate rows
    initial_len = len(df)
    df = df.drop_duplicates()
    print(f"  Removed {initial_len - len(df)} duplicate rows")

    # Order chronologically (symbols interleaved within each timestamp)
    df = df.sort_values(['timestamp', 'tic']).reset_index(drop=True)

    # Report and repair missing values
    missing_values = df.isnull().sum()
    if missing_values.any():
        print("\n⚠️ Found missing values:")
        print(missing_values[missing_values > 0])

        for col in df.columns:
            if not df[col].isnull().any():
                continue
            if col in ['open', 'high', 'low', 'close']:
                # Forward-fill inside each symbol so a gap is never filled
                # with another symbol's price. GroupBy.ffill() replaces the
                # deprecated groupby(...).fillna(method='ffill').
                df[col] = df.groupby('tic')[col].ffill()
            elif col == 'volume':
                # A missing volume is treated as "no trades"
                df[col] = df[col].fillna(0)

    # Clip outliers using per-symbol IQR bounds. Bounds pooled over all
    # symbols would clip an entire symbol whose price scale differs from the
    # others.
    numeric_columns = ['open', 'high', 'low', 'close', 'volume']
    for col in numeric_columns:
        by_symbol = df.groupby('tic')[col]
        q1 = by_symbol.quantile(0.25)
        q3 = by_symbol.quantile(0.75)
        iqr = q3 - q1
        # Broadcast each symbol's bounds back onto its rows
        lower_bound = df['tic'].map(q1 - 1.5 * iqr)
        upper_bound = df['tic'].map(q3 + 1.5 * iqr)

        too_low = df[col] < lower_bound
        too_high = df[col] > upper_bound
        outlier_count = int((too_low | too_high).sum())
        if outlier_count > 0:
            print(f"\n⚠️ Found {outlier_count} outliers in {col}")
            # Replace each outlier with the nearest bound of its own symbol
            df[col] = df[col].mask(too_low, lower_bound).mask(too_high, upper_bound)

    print("✅ Data cleaning completed")
    return df

# Load the newest raw file and clean it in one step
df = clean_data(load_raw_data())

# Summary of the cleaned data set
print(
    "\n📊 Cleaned data summary:",
    f"  Total rows: {len(df)}",
    f"  Date range: {df['timestamp'].min()} to {df['timestamp'].max()}",
    f"  Symbols: {df['tic'].unique()}",
    sep="\n",
)

## Cell 3: คำนวณ Technical Indicators

In [None]:
def calculate_technical_indicators(df):
    """Add technical-indicator columns, computed separately for each symbol.

    Columns added: ``sma_20``, ``sma_50``, ``ema_20``, ``macd`` (trend);
    ``rsi``, ``stoch``, ``williams_r`` (momentum); ``bb_high``, ``bb_low``,
    ``atr`` (volatility); ``obv``, ``vwap`` (volume).

    Args:
        df: DataFrame with ``timestamp``, ``tic``, ``high``, ``low``,
            ``close`` and ``volume`` columns. Rows of each symbol are used in
            the order they appear, so the frame is expected to be sorted by
            time already.

    Returns:
        A new DataFrame containing the input columns plus the indicator
        columns, sorted by ``timestamp`` then ``tic`` with a fresh index.

    NOTE(review): the windowed indicators presumably produce NaN for their
    warm-up rows and nothing here removes them -- confirm that downstream
    steps handle those rows.
    """
    print("📈 Calculating technical indicators...")

    # Collect per-symbol frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows on
    # every iteration.
    per_symbol_frames = []

    for symbol in df['tic'].unique():
        print(f"\nProcessing {symbol}...")
        symbol_data = df[df['tic'] == symbol].copy()
        high = symbol_data['high']
        low = symbol_data['low']
        close = symbol_data['close']
        volume = symbol_data['volume']

        # Trend indicators
        symbol_data['sma_20'] = ta.trend.sma_indicator(close, window=20)
        symbol_data['sma_50'] = ta.trend.sma_indicator(close, window=50)
        symbol_data['ema_20'] = ta.trend.ema_indicator(close, window=20)
        symbol_data['macd'] = ta.trend.macd_diff(close)

        # Momentum indicators
        symbol_data['rsi'] = ta.momentum.rsi(close, window=14)
        symbol_data['stoch'] = ta.momentum.stoch(high, low, close)
        symbol_data['williams_r'] = ta.momentum.williams_r(high, low, close)

        # Volatility indicators
        symbol_data['bb_high'] = ta.volatility.bollinger_hband(close)
        symbol_data['bb_low'] = ta.volatility.bollinger_lband(close)
        symbol_data['atr'] = ta.volatility.average_true_range(high, low, close)

        # Volume indicators
        symbol_data['obv'] = ta.volume.on_balance_volume(close, volume)
        symbol_data['vwap'] = ta.volume.volume_weighted_average_price(high, low, close, volume)

        per_symbol_frames.append(symbol_data)

        print(f"✅ Added {len(symbol_data)} rows with indicators")

    if per_symbol_frames:
        processed_df = pd.concat(per_symbol_frames)
    else:
        # No symbols at all: return the (empty) input shape instead of
        # failing on the sort below.
        processed_df = df.copy()

    # Restore chronological order across symbols
    processed_df = processed_df.sort_values(['timestamp', 'tic']).reset_index(drop=True)

    print("\n✅ Technical indicators calculation completed")
    # Count the columns actually added rather than subtracting a hard-coded
    # number of base columns.
    print(f"📊 Total indicators: {len(processed_df.columns) - len(df.columns)}")

    return processed_df

# Compute the technical indicators for every symbol
df = calculate_technical_indicators(df)

# Report the resulting column count and which columns are indicators
print(
    "\n📊 Indicators summary:",
    f"  Total columns: {len(df.columns)}",
    f"  Indicators: {[col for col in df.columns if col not in ('timestamp', 'tic', 'open', 'high', 'low', 'close', 'volume')]}",
    sep="\n",
)

## Cell 4: Normalize ข้อมูล

In [None]:
def normalize_data(df):
    """Min-max scale every numeric column to [0, 1], separately per symbol.

    Args:
        df: DataFrame with ``timestamp`` and ``tic`` columns; every other
            column is treated as numeric and scaled.

    Returns:
        A new DataFrame with the scaled numeric columns followed by
        ``timestamp`` and ``tic``, sorted by ``timestamp`` then ``tic`` with
        a fresh index.

    NOTE(review): each scaler is fitted on a symbol's full history, and this
    notebook calls normalize_data() before split_data(), so validation/test
    rows influence how training rows are scaled. The fitted scalers are also
    not returned, so the scaling cannot be inverted later.
    """
    print("📊 Normalizing data...")

    # Columns that must pass through unscaled
    non_numeric_cols = ['timestamp', 'tic']
    numeric_cols = [col for col in df.columns if col not in non_numeric_cols]

    normalized_frames = []

    for symbol in df['tic'].unique():
        print(f"\nNormalizing {symbol}...")
        symbol_data = df[df['tic'] == symbol]

        # A fresh scaler per symbol: fit_transform() refits on every call
        scaler = MinMaxScaler()
        normalized_numeric = scaler.fit_transform(symbol_data[numeric_cols])

        # Reuse the symbol's own index. With a default RangeIndex the
        # assignment of timestamp/tic below would align on mismatched labels
        # and yield NaN or wrong values whenever symbols are interleaved.
        normalized_symbol_data = pd.DataFrame(
            normalized_numeric,
            columns=numeric_cols,
            index=symbol_data.index,
        )
        normalized_symbol_data[non_numeric_cols] = symbol_data[non_numeric_cols]

        normalized_frames.append(normalized_symbol_data)

        print(f"✅ Normalized {len(symbol_data)} rows")

    if normalized_frames:
        # Single concat instead of growing a frame inside the loop
        normalized_data = pd.concat(normalized_frames)
    else:
        # No symbols: return an empty frame with the expected columns
        normalized_data = pd.DataFrame(columns=numeric_cols + non_numeric_cols)

    # Restore chronological order across symbols
    normalized_data = normalized_data.sort_values(['timestamp', 'tic']).reset_index(drop=True)

    print("\n✅ Data normalization completed")
    return normalized_data

# Scale all numeric columns per symbol
df = normalize_data(df)

# Report the shape and the scaled columns
print(
    "\n📊 Normalized data summary:",
    f"  Total rows: {len(df)}",
    f"  Total columns: {len(df.columns)}",
    f"  Numeric columns: {[col for col in df.columns if col not in ('timestamp', 'tic')]}",
    sep="\n",
)

## Cell 5: แบ่งข้อมูลและบันทึก

In [None]:
def split_data(df):
    """Split the data chronologically into training/validation/testing sets.

    The 70:15:15 cut points are placed on distinct ``timestamp`` values, so
    all rows sharing a timestamp (one per symbol) land in the same split.
    When every timestamp is unique this equals a plain positional split of a
    time-sorted frame. If the frame has no ``timestamp`` column, rows are
    split by position.

    Args:
        df: DataFrame to split.

    Returns:
        Tuple ``(train_df, val_df, test_df)``, each with a fresh index and
        rows in their original relative order.
    """
    print("📊 Splitting data...")

    total_len = len(df)

    if 'timestamp' in df.columns:
        # Dense rank: 0 for the earliest timestamp, 1 for the next distinct
        # one, and so on; rows sharing a timestamp share a rank.
        position = df['timestamp'].rank(method='dense') - 1
        unit_count = df['timestamp'].nunique()
    else:
        position = pd.Series(range(total_len), index=df.index)
        unit_count = total_len

    # 70:15:15 split; the test set takes whatever remains after rounding
    train_end = int(unit_count * 0.7)
    val_end = train_end + int(unit_count * 0.15)

    in_train = position < train_end
    in_val = (position >= train_end) & (position < val_end)

    train_df = df[in_train].reset_index(drop=True)
    val_df = df[in_val].reset_index(drop=True)
    # Everything not in train/val, including rows with a missing timestamp
    test_df = df[~(in_train | in_val)].reset_index(drop=True)

    # Avoid dividing by zero when reporting on an empty frame
    denominator = total_len or 1

    print("✅ Data split completed:")
    print(f"  Training set: {len(train_df)} rows ({len(train_df)/denominator*100:.1f}%)")
    print(f"  Validation set: {len(val_df)} rows ({len(val_df)/denominator*100:.1f}%)")
    print(f"  Testing set: {len(test_df)} rows ({len(test_df)/denominator*100:.1f}%)")

    return train_df, val_df, test_df

def save_processed_data(df, train_df, val_df, test_df, output_dir=None):
    """Persist the processed data set and its train/validation/test splits.

    Two sets of files are written:
      * timestamped pickles (``processed_crypto_data_<ts>.pkl``,
        ``train_data_<ts>.pkl``, ``val_data_<ts>.pkl``, ``test_data_<ts>.pkl``)
        that keep a snapshot of every run;
      * un-timestamped CSVs (``processed_crypto_data.csv``, ``train_data.csv``,
        ``val_data.csv``, ``test_data.csv``) that always hold the latest run.

    Args:
        df: The full processed DataFrame.
        train_df: Training split.
        val_df: Validation split.
        test_df: Testing split.
        output_dir: Target directory, created if missing. When omitted, the
            module-level ``PROCESSED_DIR`` is used.
    """
    if output_dir is None:
        output_dir = PROCESSED_DIR
    os.makedirs(output_dir, exist_ok=True)

    print("💾 Saving processed data...")

    # One timestamp shared by every snapshot file of this run
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Snapshot of the full data set
    pickle_file = os.path.join(output_dir, f"processed_crypto_data_{timestamp}.pkl")
    with open(pickle_file, 'wb') as f:
        pickle.dump(df, f)
    print(f"✅ Saved full data to {pickle_file}")

    # Snapshots of the individual splits
    split_frames = {'train': train_df, 'val': val_df, 'test': test_df}
    split_files = []
    for key, frame in split_frames.items():
        split_file = os.path.join(output_dir, f"{key}_data_{timestamp}.pkl")
        with open(split_file, 'wb') as f:
            pickle.dump(frame, f)
        split_files.append(split_file)

    print("✅ Saved split data to:")
    for split_file in split_files:
        print(f"  - {split_file}")

    # "Latest" copies. An explicit name -> frame mapping replaces the former
    # eval() lookup of local variable names.
    latest_frames = {
        'processed_crypto_data': df,
        'train_data': train_df,
        'val_data': val_df,
        'test_data': test_df,
    }

    latest_csv_files = []
    for stem, frame in latest_frames.items():
        # Remove any un-timestamped .pkl of the same name: the latest copies
        # are written as CSV only, so such a pickle would be stale.
        stale_pickle = os.path.join(output_dir, f"{stem}.pkl")
        if os.path.exists(stale_pickle):
            os.remove(stale_pickle)

        csv_file = os.path.join(output_dir, f"{stem}.csv")
        frame.to_csv(csv_file, index=False)
        latest_csv_files.append(csv_file)

    print("\n✅ Saved latest files to:")
    for csv_file in latest_csv_files:
        print(f"  - {csv_file}")

# Split chronologically, then persist the full set together with the splits
train_df, val_df, test_df = split_data(df)
save_processed_data(df, train_df, val_df, test_df)

# Final report for the whole preprocessing run
print(
    "\n   Data preprocessing completed successfully!",
    "📊 Final data summary:",
    f"  Total rows: {len(df)}",
    f"  Total columns: {len(df.columns)}",
    f"  Date range: {df['timestamp'].min()} to {df['timestamp'].max()}",
    f"  Symbols: {df['tic'].unique()}",
    sep="\n",
)