In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import sys
import os
from itertools import product

import xgboost as xgb
sys.path.append(os.path.abspath(".."))
from src.train_xgboost import decode_labels, XGBoost_Classifier

In [25]:
path = '../data/raw/weather_classification_data.csv'
df = pd.read_csv(path)

# Đếm số giá trị thiếu mỗi cột
missing_counts = df.isnull().sum()

print("Số giá trị thiếu theo từng cột:")
print(missing_counts)

Số giá trị thiếu theo từng cột:
Temperature             0
Humidity                0
Wind Speed              0
Precipitation (%)       0
Cloud Cover             0
Atmospheric Pressure    0
UV Index                0
Season                  0
Visibility (km)         0
Location                0
Weather Type            0
dtype: int64


In [None]:
# # Giả sử model dự đoán ra:
# y_pred = [0, 2, 1]
# # decode lại kết quả gốc
# decoded = decode_labels(y_pred, 'Weather Type', label_encoders)
# print("\nGiải mã dự đoán:", decoded)

In [42]:
# Chuẩn bị dữ liệu
X = df.drop(columns=['Weather Type'])
y = LabelEncoder().fit_transform(df['Weather Type'].values)

# Xác định các cột categorical theo tên cột
categorical_features = ['Season', 'Location', 'Cloud Cover']
categorical_cols = [i for i, feature in enumerate(X.columns) if feature in categorical_features]

# Chia train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y, test_size=0.2, random_state=36, stratify=y
)


In [28]:
# tạo param grid để tìm tham số tối ưu
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.3],
    'gamma': [0, 0.1, 0.5, 1],
    'reg_lambda': [1.0, 2.0, 5.0],
    'n_estimators': [50]
}

best_acc = -1
best_params = None
count = 1
param_combinations = list(product(
    param_grid['max_depth'],
    param_grid['learning_rate'],
    param_grid['gamma'],
    param_grid['reg_lambda'],
    param_grid['n_estimators']
))
combination_count = len(param_combinations)

for count, (max_depth, lr, gamma, reg_lambda, n_estimators) in enumerate(param_combinations, start = 1):
    print(f"Testing {count}/{combination_count} combination: depth={max_depth}, lr={lr}, gamma={gamma}, lambda={reg_lambda}")
    model = XGBoost_Classifier(
        n_classes=len(set(y)),
        n_estimators=n_estimators,
        learning_rate=lr,
        max_depth=max_depth,
        reg_lambda=reg_lambda,
        gamma=gamma,
        categorical_cols=categorical_cols
    )
    model.fit(X_train, y_train, categorical_cols=categorical_cols)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(" → accuracy:", acc)

    if acc > best_acc:
        best_acc = acc
        best_params = (max_depth, lr, gamma, reg_lambda)

print("\n✅ Best params:", best_params)
print("✅ Best accuracy:", best_acc)

Testing 1/108 combination: depth=3, lr=0.05, gamma=0, lambda=1.0
Building tree: 1/50
Building tree: 2/50
Building tree: 3/50
Building tree: 4/50
Building tree: 5/50
Building tree: 6/50
Building tree: 7/50
Building tree: 8/50
Building tree: 9/50
Building tree: 10/50
Building tree: 11/50
Building tree: 12/50
Building tree: 13/50
Building tree: 14/50
Building tree: 15/50
Building tree: 16/50
Building tree: 17/50
Building tree: 18/50
Building tree: 19/50
Building tree: 20/50
Building tree: 21/50
Building tree: 22/50
Building tree: 23/50
Building tree: 24/50
Building tree: 25/50
Building tree: 26/50
Building tree: 27/50
Building tree: 28/50
Building tree: 29/50
Building tree: 30/50
Building tree: 31/50
Building tree: 32/50
Building tree: 33/50
Building tree: 34/50
Building tree: 35/50
Building tree: 36/50
Building tree: 37/50
Building tree: 38/50
Building tree: 39/50
Building tree: 40/50
Building tree: 41/50
Building tree: 42/50
Building tree: 43/50
Building tree: 44/50
Building tree: 45/50

In [None]:
model = XGBoost_Classifier(
    n_classes=len(np.unique(y)),
    n_estimators=50,
    learning_rate=best_params[1],
    max_depth=best_params[0],
    min_samples_split=2,
    reg_lambda=best_params[3],
    gamma=best_params[2],
    categorical_cols=categorical_cols
)
model.fit(X_train, y_train, categorical_cols=categorical_cols)


Building tree: 1/50
Building tree: 2/50
Building tree: 3/50
Building tree: 4/50
Building tree: 5/50
Building tree: 6/50
Building tree: 7/50
Building tree: 8/50
Building tree: 9/50
Building tree: 10/50
Building tree: 11/50
Building tree: 12/50
Building tree: 13/50
Building tree: 14/50
Building tree: 15/50
Building tree: 16/50
Building tree: 17/50
Building tree: 18/50
Building tree: 19/50
Building tree: 20/50
Building tree: 21/50
Building tree: 22/50
Building tree: 23/50
Building tree: 24/50
Building tree: 25/50
Building tree: 26/50
Building tree: 27/50
Building tree: 28/50
Building tree: 29/50
Building tree: 30/50
Building tree: 31/50
Building tree: 32/50
Building tree: 33/50
Building tree: 34/50
Building tree: 35/50
Building tree: 36/50
Building tree: 37/50
Building tree: 38/50
Building tree: 39/50
Building tree: 40/50
Building tree: 41/50
Building tree: 42/50
Building tree: 43/50
Building tree: 44/50
Building tree: 45/50
Building tree: 46/50
Building tree: 47/50
Building tree: 48/50
B

In [44]:
# Dự đoán nhãn
y_pred = model.predict(X_test)

# Dự đoán xác suất mỗi class
y_proba = model.predict_proba(X_test)

# Accuracy
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", acc)


Test Accuracy: 0.9125


In [None]:
y = LabelEncoder().fit_transform(df['Weather Type'])

# Chia train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Chuyển cột categorical sang category
categorical_features = ['Season', 'Location', 'Cloud Cover']
for col in categorical_features:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')

dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

# Cấu hình params
params = {
    'objective': 'multi:softmax',  # multi-class classification
    'num_class': len(set(y)),
    'eval_metric': 'mlogloss',
    'learning_rate': 0.3,
    'max_depth': 5,
    'reg_lambda': 1.0,  # L2 regularization (lambda)
    'gamma': 0.5         # minimum loss reduction to make a split
}

num_round = 50

bst = xgb.train(params, dtrain, num_round)

# Dự đoán
y_pred = bst.predict(dtest)
acc = accuracy_score(y_test, y_pred)
print("XGBoost chính thức accuracy:", acc)

XGBoost chính thức accuracy: 0.9162878787878788
