In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

# 1. Load dữ liệu
df = pd.read_csv("diabetes_012_health_indicators_BRFSS2015.csv")

# 2. Tiền xử lý dữ liệu
df = df.dropna()
df['Diabetes_012'] = df['Diabetes_012'].astype('category')

# 3. Chia X, y
X = df.drop("Diabetes_012", axis=1)
y = df["Diabetes_012"]

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 5. Chuẩn hóa dữ liệu
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. SMOTE để cân bằng dữ liệu
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

# 7. Mô hình XGBoost sử dụng GPU
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric="mlogloss",
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    gpu_id=0,
    random_state=42
)

# 8. Các mô hình khác
rf = RandomForestClassifier(random_state=42)
lr = LogisticRegression(solver='liblinear', max_iter=1000)

# 9. Ensemble model
ensemble = VotingClassifier(estimators=[
    ('xgb', xgb),
    ('rf', rf),
    ('lr', lr)
], voting='soft')

# 10. GridSearchCV để tuning
param_grid = {
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 5, 7],
    'xgb__learning_rate': [0.05, 0.1],
    'rf__n_estimators': [100],
    'rf__max_depth': [10],
    'lr__C': [1.0]
}

grid = GridSearchCV(ensemble, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid.fit(X_train_bal, y_train_bal)

# 11. Dự đoán
y_pred = grid.predict(X_test_scaled)

# 12. Kết quả
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))
print("Best Params:", grid.best_params_)

# 13. Dự đoán mẫu mới (nếu cần)
sample = np.array([[1, 1, 1, 30, 1, 0, 0, 1, 1, 1]])
sample_scaled = scaler.transform(sample)
print("Prediction for new sample:", grid.predict(sample_scaled))


[WinError 2] The system cannot find the file specified
  File "c:\Users\ndat4\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\ndat4\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ndat4\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\ndat4\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fo

=== Classification Report ===
              precision    recall  f1-score   support

         0.0       0.91      0.87      0.89     42741
         1.0       0.06      0.00      0.00       926
         2.0       0.40      0.54      0.46      7069

    accuracy                           0.81     50736
   macro avg       0.46      0.47      0.45     50736
weighted avg       0.82      0.81      0.81     50736

Best Params: {'lr__C': 1.0, 'rf__max_depth': 10, 'rf__n_estimators': 100, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 7, 'xgb__n_estimators': 200}




ValueError: X has 10 features, but StandardScaler is expecting 21 features as input.

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("alexteboul/diabetes-health-indicators-dataset")

print("Path to dataset files:", path)


ModuleNotFoundError: No module named 'kagglehub'