# In [None]:
# Template notebook for Data Preparation - Milestone 1
# Member: [NAME] - Dataset: [DATASET NAME]

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

# Import utilities (after the file has been created)
import sys
sys.path.append('../src/')
from visualization_utils import create_standard_plots, plot_class_distribution, plot_split_distributions

# ==========================================
# 1. LOAD AND EXPLORE THE DATA
# ==========================================

print("="*50)
print("1. LOAD V√Ä KH√ÅM PH√Å D·ªÆ LI·ªÜU")
print("="*50)

# TODO: Load your dataset. This must define `data` before the lines below run.
# data = pd.read_csv('path/to/your/dataset.csv')

# Show basic information about the dataset
# (assumes `data` is a pandas DataFrame, as in the TODO above)
print("Dataset shape:", data.shape)
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
data.info()
print("\nFirst 5 rows:")
print(data.head())

# Check for missing values: number of nulls per column
print("\nMissing values:")
print(data.isnull().sum())

# Descriptive statistics
print("\nDescriptive statistics:")
print(data.describe())

# ==========================================
# 2. DATA PREPROCESSING
# ==========================================

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 50, "2. DATA PREPROCESSING", "=" * 50, sep="\n")

# TODO: Handle missing values, if there are any
# data = data.dropna()  # or fillna()

# TODO: Encode categorical features (if there are any)
# Example for Palmer Penguins:
# categorical_cols = ['sex', 'island']
# data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# TODO: Prepare the features and the target (must define `X` and `y`)
# X = data.drop('target_column', axis=1)
# y = data['target_column']

# Report the dimensions of the features and of the target
for shape_label, shaped in (("Features shape:", X), ("Target shape:", y)):
    print(shape_label, shaped.shape)

# ==========================================
# 3. VISUALIZE CLASS DISTRIBUTION
# ==========================================

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 50, "3. VISUALIZE CLASS DISTRIBUTION", "=" * 50, sep="\n")

# Placeholders to fill in; both are reused by later sections of the script
dataset_name = "[T√äN DATASET C·ª¶A B·∫°N]"  # Change this name
target_col = "[T√äN C·ªòT TARGET]"  # Change this name

# Plot the original data with the shared helper from visualization_utils.
# The last argument is presumably the path the figure is saved to - confirm
# against the helper; the folder name is the lower-cased dataset name with
# spaces replaced by underscores.
original_title = f'{dataset_name} - Original Data'
original_plot_path = f'../visualizations/{dataset_name.lower().replace(" ", "_")}/original_distribution.png'
plot_class_distribution(data, target_col, original_title, original_plot_path)

# ==========================================
# 4. SPLIT THE DATA USING 4 RATIOS
# ==========================================

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 50, "4. CHIA D·ªÆ LI·ªÜU THEO 4 T·ª∂ L·ªÜ", "=" * 50, sep="\n")

# Split ratios: label -> fraction of the samples given to the training part
split_ratios = {'40/60': 0.4, '60/40': 0.6, '80/20': 0.8, '90/10': 0.9}

# Containers for the resulting splits, keyed by ratio label
data_splits, target_splits = {}, {}

# Create one split per ratio
for ratio_name, train_size in split_ratios.items():
    print(f"\nSplitting data with ratio {ratio_name}...")

    # stratify=y requests a stratified split on the target; random_state=42
    # makes the shuffle reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=42, shuffle=True
    )

    # Store the feature parts and the target parts under the ratio label
    data_splits[ratio_name] = (X_train, X_test)
    target_splits[ratio_name] = (y_train, y_test)

    # Report the size of each part (count and share of all samples) and the
    # per-class counts of each part
    print(
        f"Train size: {len(X_train)} ({len(X_train)/len(X)*100:.1f}%)",
        f"Test size: {len(X_test)} ({len(X_test)/len(X)*100:.1f}%)",
        f"Train class distribution:\n{y_train.value_counts()}",
        f"Test class distribution:\n{y_test.value_counts()}",
        sep="\n",
    )

# ==========================================
# 5. VISUALIZE SPLIT DISTRIBUTIONS
# ==========================================

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 50, "5. VISUALIZE SPLIT DISTRIBUTIONS", "=" * 50, sep="\n")

# Collect the (train target, test target) pair of every ratio for plotting
splits_for_viz = {ratio: tuple(target_splits[ratio]) for ratio in split_ratios}

# Draw the comparison chart with the shared helper from visualization_utils.
# The last argument is presumably the path the figure is saved to - confirm
# against the helper.
split_plot_path = f'../visualizations/{dataset_name.lower().replace(" ", "_")}/split_distributions.png'
plot_split_distributions(splits_for_viz, dataset_name, split_plot_path)

# ==========================================
# 6. SAVE RESULTS
# ==========================================

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 50, "6. L∆ØU K·∫æT QU·∫¢", "=" * 50, sep="\n")

# Create the output directory (no error if it already exists)
import os
output_dir = f'../results/{dataset_name.lower().replace(" ", "_")}'
os.makedirs(output_dir, exist_ok=True)

# Persist the splits so they can be reused in Milestone 2
import pickle

# Write everything into a single pickle file: the splits per ratio, the
# unsplit features/target, and some metadata describing the dataset.
with open(f'{output_dir}/data_splits.pkl', 'wb') as pkl_file:
    split_payload = {
        'data_splits': data_splits,
        'target_splits': target_splits,
        'original_data': (X, y),
        'dataset_info': {
            'name': dataset_name,
            'shape': data.shape,
            'features': list(X.columns),
            'target': target_col,
            'classes': list(y.unique()),
        },
    }
    pickle.dump(split_payload, pkl_file)

print(f"‚úÖ Data splits saved to {output_dir}/data_splits.pkl")

# Build the summary report: one row per split ratio with the train/test sizes
# and the number of samples of every class in each part.
#
# The class labels are read from the full target `y` instead of being
# hard-coded as 0 and 1, so targets with string labels or more than two
# classes get real counts rather than 'N/A'. For a target whose labels are the
# integers 0 and 1 the columns are unchanged: 'Train Class 0', 'Train Class 1',
# 'Test Class 0', 'Test Class 1'.
class_labels = list(y.unique())
try:
    class_labels = sorted(class_labels)
except TypeError:
    # Labels of mixed types cannot be ordered; keep their order of appearance.
    pass

summary_stats = []
for ratio_name in split_ratios:
    y_train, y_test = target_splits[ratio_name]
    summary_row = {
        'Split Ratio': ratio_name,
        'Train Size': len(y_train),
        'Test Size': len(y_test),
    }
    # Train columns first, then test columns, one per class label. A class
    # that does not occur in a part is reported as 0.
    for part_name, part in (('Train', y_train), ('Test', y_test)):
        for class_label in class_labels:
            summary_row[f'{part_name} Class {class_label}'] = int((part == class_label).sum())
    summary_stats.append(summary_row)

# Save the report next to the pickled splits
summary_df = pd.DataFrame(summary_stats)
summary_df.to_csv(f'{output_dir}/split_summary.csv', index=False)
print(f"‚úÖ Summary saved to {output_dir}/split_summary.csv")

# Display summary
print("\nSplit Summary:")
print(summary_df)

# Completion banner: blank line, rule, title, rule
print("\n" + "=" * 50, "MILESTONE 1 COMPLETED ‚úÖ", "=" * 50, sep="\n")

# Recap of the run, one item per line, closed by a rule
print(
    f"Dataset: {dataset_name}",
    f"Original samples: {len(data)}",
    f"Features: {X.shape[1]}",
    f"Classes: {len(y.unique())}",
    f"Splits created: {len(split_ratios)}",
    f"Output directory: {output_dir}",
    "=" * 50,
    sep="\n",
)

# ==========================================
# 7. CHECKLIST FOR THE COORDINATOR
# ==========================================

# Print the checklist heading, its seven items and the closing reminder, one
# per line; the heading and the reminder are each preceded by a blank line.
# NOTE(review): the non-ASCII text in these strings looks mis-encoded (UTF-8
# read with the wrong codec) - confirm the file's encoding.
print(
    "\nüìã CHECKLIST CHO COORDINATOR:",
    "‚ñ° Dataset ƒë∆∞·ª£c load th√†nh c√¥ng",
    "‚ñ° Missing values ƒë∆∞·ª£c x·ª≠ l√Ω",
    "‚ñ° Categorical features ƒë∆∞·ª£c encode (n·∫øu c√≥)",
    "‚ñ° 4 t·ª∑ l·ªá split ƒë∆∞·ª£c t·∫°o v·ªõi stratified sampling",
    "‚ñ° Class distributions ƒë∆∞·ª£c visualize",
    "‚ñ° K·∫øt qu·∫£ ƒë∆∞·ª£c l∆∞u v√†o ƒë√∫ng th∆∞ m·ª•c",
    "‚ñ° Summary statistics ƒë∆∞·ª£c t·∫°o",
    "\nüîÑ G·ª≠i k·∫øt qu·∫£ n√†y cho Coordinator ƒë·ªÉ review!",
    sep="\n",
)