# Phase 3: Model Building
# Xây dựng Mô hình

## Mục tiêu / Objectives:
1. Build 5+ classification models
2. Build 5+ regression models
3. Define hyperparameter grids
4. Prepare for training and evaluation

---

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
import sys
import os

# Scikit-learn
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# XGBoost
import xgboost as xgb

# TensorFlow/Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Add src to path
# The path is relative, so it resolves against the current working directory.
# NOTE(review): presumably the notebook is launched from its own folder - confirm.
sys.path.append('../src')
# Project-local helpers; presumably located in ../src, so this import has to
# come after the sys.path change above.
from data_split import load_split_data, split_features_target

# NOTE(review): this silences every warning category for the rest of the
# session, including any deprecation or convergence warnings raised by the
# libraries imported above - consider narrowing it.
warnings.filterwarnings('ignore')

# Report the library versions in use so runs can be compared later.
print('Libraries imported successfully!')
print(f'TensorFlow version: {tf.__version__}')
print(f'XGBoost version: {xgb.__version__}')

## 1. Load Train and Test Data

In [None]:
# Fetch the train split, the test split and the encoders object returned by
# the project helper for the processed-data directory.
train_df, test_df, encoders = load_split_data('../data/processed')

# Report the shape of each split, one per line.
print(
    f'Train set: {train_df.shape}',
    f'Test set: {test_df.shape}',
    sep='\n',
)

## 2. Classification Models

### 2.1 Logistic Regression (Baseline)

In [None]:
# TODO: Implement Logistic Regression
# Candidate values per hyperparameter for the baseline classifier
# (5 x 2 x 2 = 20 combinations). Presumably consumed by GridSearchCV in a
# later phase - no estimator is created in this cell.
lr_params = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)

# Echo the grid so the cell output records what will be searched.
print(
    'Logistic Regression model configured',
    f'Hyperparameter grid: {lr_params}',
    sep='\n',
)

### 2.2 Random Forest Classifier

In [None]:
# TODO: Implement Random Forest Classifier
# Candidate values per hyperparameter for the random forest classifier
# (3 x 4 x 3 x 3 = 108 combinations). Presumably consumed by GridSearchCV in
# a later phase - no estimator is created in this cell.
rf_clf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Echo the grid so the cell output records what will be searched.
print(
    'Random Forest Classifier configured',
    f'Hyperparameter grid: {rf_clf_params}',
    sep='\n',
)

### 2.3 XGBoost Classifier

In [None]:
# TODO: Implement XGBoost Classifier
# Candidate values per hyperparameter for the XGBoost classifier
# (3 x 4 x 3 x 4 = 144 combinations). Presumably consumed by GridSearchCV in
# a later phase - no estimator is created in this cell.
xgb_clf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7, 9],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.7, 0.8, 0.9, 1.0],
)

# Echo the grid so the cell output records what will be searched.
print(
    'XGBoost Classifier configured',
    f'Hyperparameter grid: {xgb_clf_params}',
    sep='\n',
)

### 2.4 Support Vector Machine

In [None]:
# TODO: Implement SVM
# Candidate values per hyperparameter for the support vector classifier
# (4 x 3 x 2 = 24 combinations). Presumably consumed by GridSearchCV in a
# later phase - no estimator is created in this cell.
svm_params = dict(
    C=[0.1, 1, 10, 100],
    kernel=['rbf', 'poly', 'sigmoid'],
    gamma=['scale', 'auto'],
)

# Echo the grid so the cell output records what will be searched.
print(
    'SVM configured',
    f'Hyperparameter grid: {svm_params}',
    sep='\n',
)

### 2.5 Neural Network Classifier

In [None]:
# TODO: Implement Neural Network
def build_classification_nn(input_dim, num_classes, *,
                            loss='categorical_crossentropy'):
    """Build and compile a feed-forward softmax classifier.

    The network is three ReLU hidden layers (128, 64, 32 units) with dropout
    after the first two, followed by a softmax output layer.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of units in the softmax output layer.
        loss: Loss passed to ``model.compile``. The default,
            ``'categorical_crossentropy'``, expects one-hot encoded targets;
            pass ``'sparse_categorical_crossentropy'`` when the targets are
            integer class indices.

    Returns:
        The compiled ``keras.Sequential`` model (Adam optimizer, accuracy
        metric).
    """
    model = keras.Sequential([
        # Declare the input shape with an explicit Input object rather than
        # the legacy `input_dim` keyword on the first Dense layer, which
        # newer Keras versions warn about.
        keras.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(32, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ])

    model.compile(
        optimizer='adam',
        loss=loss,
        metrics=['accuracy'],
    )

    return model

print('Neural Network Classifier architecture defined')

## 3. Regression Models

### 3.1 Linear Regression (Baseline)

In [None]:
# TODO: Implement Linear Regression
# Baseline regressor. Unlike the other regression cells in this notebook, no
# hyperparameter grid is defined here, and no estimator is instantiated yet;
# the cell only prints a status line.
print('Linear Regression model configured')

### 3.2 Random Forest Regressor

In [None]:
# TODO: Implement Random Forest Regressor
# Candidate values per hyperparameter for the random forest regressor
# (3 x 4 x 3 x 3 = 108 combinations). Presumably consumed by GridSearchCV in
# a later phase - no estimator is created in this cell.
rf_reg_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Echo the grid so the cell output records what will be searched.
print(
    'Random Forest Regressor configured',
    f'Hyperparameter grid: {rf_reg_params}',
    sep='\n',
)

### 3.3 XGBoost Regressor

In [None]:
# TODO: Implement XGBoost Regressor
# Candidate values per hyperparameter for the XGBoost regressor
# (3 x 3 x 3 x 3 = 81 combinations). Presumably consumed by GridSearchCV in
# a later phase - no estimator is created in this cell.
xgb_reg_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.7, 0.8, 0.9],
)

# Echo the grid so the cell output records what will be searched.
print(
    'XGBoost Regressor configured',
    f'Hyperparameter grid: {xgb_reg_params}',
    sep='\n',
)

### 3.4 Gradient Boosting Regressor

In [None]:
# TODO: Implement Gradient Boosting
# Candidate values per hyperparameter for the gradient boosting regressor
# (3 x 3 x 3 x 3 = 81 combinations). Presumably consumed by GridSearchCV in
# a later phase - no estimator is created in this cell.
gb_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.7, 0.8, 0.9],
)

# Echo the grid so the cell output records what will be searched.
print(
    'Gradient Boosting Regressor configured',
    f'Hyperparameter grid: {gb_params}',
    sep='\n',
)

### 3.5 Neural Network Regressor

In [None]:
# TODO: Implement Neural Network Regressor
def build_regression_nn(input_dim):
    """Build and compile a feed-forward network with a single linear output.

    The network is three ReLU hidden layers (128, 64, 32 units) with dropout
    after the first two, followed by one linear output unit.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled ``keras.Sequential`` model (Adam optimizer, MSE loss,
        MAE metric).
    """
    model = keras.Sequential([
        # Declare the input shape with an explicit Input object rather than
        # the legacy `input_dim` keyword on the first Dense layer, which
        # newer Keras versions warn about.
        keras.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation='linear'),
    ])

    model.compile(
        optimizer='adam',
        loss='mse',
        metrics=['mae'],
    )

    return model

print('Neural Network Regressor architecture defined')

## 4. Summary

In [None]:
# Banner: an 80-character rule above and below the title.
print('=' * 80, 'MODEL BUILDING SUMMARY', '=' * 80, sep='\n')

# Classification line-up, in the order the cells appear in this notebook.
print(
    '\nClassification Models:',
    '1. Logistic Regression (Baseline)',
    '2. Random Forest Classifier',
    '3. XGBoost Classifier',
    '4. Support Vector Machine',
    '5. Neural Network Classifier',
    sep='\n',
)

# Regression line-up, in the order the cells appear in this notebook.
print(
    '\nRegression Models:',
    '1. Linear Regression (Baseline)',
    '2. Random Forest Regressor',
    '3. XGBoost Regressor',
    '4. Gradient Boosting Regressor',
    '5. Neural Network Regressor',
    sep='\n',
)

# Closing status and pointer to the next phase.
print(
    '\n✓ MODEL BUILDING PHASE COMPLETE!',
    'Next: Proceed to Phase 4 - Training & Evaluation',
    sep='\n',
)