In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.datasets import make_blobs
import numpy as np

# Load the iris dataset and reduce it to a binary problem:
# samples whose target is class 1 are positive, all others negative.
iris = load_iris()
X = iris.data
y = iris.target == 1

# Ensemble learning: three different base models combined by voting.
log_clf = LogisticRegression()
svm_clf = SVC()
# Seed the tree: scikit-learn permutes the candidate features at every split,
# so an unseeded tree (and any ensemble built on it) can change between runs.
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=1)

# Hard voting: the ensemble predicts the class chosen by the majority of the
# base models.
voting_clf = VotingClassifier(
  estimators=[('lr', log_clf), ('svc', svm_clf), ('tree_clf', tree_clf)],
  voting='hard'
)

# Shared evaluation scheme: 3 stratified folds repeated 5 times,
# i.e. 15 accuracy values per evaluated model.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=5, random_state=1)

In [7]:
# Evaluate each base model on its own, using the shared cross-validation scheme.
# Each entry pairs the report template with the estimator it describes.
single_model_reports = [
    ('Logistic Regression Mean Accuracy: {:.03f}', log_clf),
    ('SVM Mean Mean Accuracy: {:.03f}', svm_clf),
    ('DecisionTree Classifier Mean Accuracy: {:.03f}', tree_clf),
]
for report_template, estimator in single_model_reports:
    scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print(report_template.format(np.mean(scores)))

Logistic Regression Mean Accuracy: 0.713
SVM Mean Mean Accuracy: 0.941
DecisionTree Classifier Mean Accuracy: 0.943


In [8]:
# Evaluate the combined (voting) model with the same cross-validation scheme
scores = cross_val_score(
    estimator=voting_clf,
    X=X,
    y=y,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
print('Voting Classifier Mean Accuracy: {:.03f}'.format(scores.mean()))

Voting Classifier Mean Accuracy: 0.949


In [9]:
# Bagging (bootstrap aggregating) of decision trees
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
  DecisionTreeClassifier(),
  n_estimators=200,   # number of trees in the ensemble
  max_samples=100,    # each tree is trained on 100 drawn samples
  bootstrap=True,     # samples are drawn with replacement
  n_jobs=-1,
  random_state=1      # seed the resampling so the scores are reproducible
)

# Fit on the full data set so that `bag_clf` itself is left trained.
# cross_val_score below fits clones of the estimator, so this call does not
# influence the reported scores.
bag_clf.fit(X, y)

scores = cross_val_score(bag_clf, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# Report under the name of the model that was actually evaluated.
print('Bagging Classifier Mean Accuracy: {:.03f}'.format(np.mean(scores)))

Logistic Regression Mean Accuracy: 0.945


In [15]:
# Random forest evaluated on a single hold-out split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Shallow, strongly constrained forest; seeded so the reported accuracy is
# reproducible from run to run.
rdf_clf = RandomForestClassifier(
    max_depth = 3,
    max_leaf_nodes = 16,       # not binding here: a depth-3 tree has at most 8 leaves
    min_samples_split = 10,
    min_samples_leaf = 10,
    random_state = 1
)
# Hold out 30% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Train the model on the training split
rdf_clf.fit(X_train, y_train)

# Predict on the test split; accuracy_score expects (y_true, y_pred)
y_pred = rdf_clf.predict(X_test)
scores = accuracy_score(y_test, y_pred)
print('RandomForest Accuracy: {:.03f}'.format(scores))

RandomForest Accuracy: 0.956
