In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784')
X, y = mnist.data.astype('float64'), mnist.target.astype('int')

In [4]:
from sklearn.model_selection import train_test_split
X, y = mnist['data'], mnist['target']

# Convert labels to integers
y = y.astype(int)

# Split the data into training, validation, and test sets
# Use 50,000 instances for training, 10,000 for validation, and 10,000 for testing
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.1, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_openml

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Train Random Forest classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
y_val_pred_rf = cross_val_predict(rf_clf, X_val_scaled, y_val, cv=3)
accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
print(f"Random Forest Accuracy on Validation Set: {accuracy_rf:.2f}")

# Train Extra-Trees classifier
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
y_val_pred_extra_trees = cross_val_predict(extra_trees_clf, X_val_scaled, y_val, cv=3)
accuracy_extra_trees = accuracy_score(y_val, y_val_pred_extra_trees)
print(f"Extra-Trees Classifier Accuracy on Validation Set: {accuracy_extra_trees:.2f}")

# Train SVM classifier
svm_clf = SVC(gamma='scale')
y_val_pred_svm = cross_val_predict(svm_clf, X_val_scaled, y_val, cv=3)
accuracy_svm = accuracy_score(y_val, y_val_pred_svm)
print(f"SVM Classifier Accuracy on Validation Set: {accuracy_svm:.2f}")

Random Forest Accuracy on Validation Set: 0.92
Extra-Trees Classifier Accuracy on Validation Set: 0.93
SVM Classifier Accuracy on Validation Set: 0.90


In [9]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_openml

# Train individual classifiers
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma='scale')

# Create a Voting Classifier
voting_clf = VotingClassifier(
    estimators=[('rf', rf_clf), ('extra_trees', extra_trees_clf), ('svm', svm_clf)],
    voting='hard'  # Use 'soft' for soft voting
)

# Train the Voting Classifier
voting_clf.fit(X_train_scaled, y_train)

# Evaluate on the validation set
y_val_pred_voting = voting_clf.predict(X_val_scaled)
accuracy_voting = accuracy_score(y_val, y_val_pred_voting)
print(f"Voting Classifier Accuracy on Validation Set: {accuracy_voting:.2f}")

# Evaluate on the test set
y_test_pred_voting = voting_clf.predict(X_test_scaled)
accuracy_test_voting = accuracy_score(y_test, y_test_pred_voting)
print(f"Voting Classifier Accuracy on Test Set: {accuracy_test_voting:.2f}")



Voting Classifier Accuracy on Validation Set: 0.97
Voting Classifier Accuracy on Test Set: 0.97


In [12]:
import numpy as np
_val_pred_rf = cross_val_predict(rf_clf, X_val_scaled, y_val, cv=3)
y_val_pred_extra_trees = cross_val_predict(extra_trees_clf, X_val_scaled, y_val, cv=3)
y_val_pred_svm = cross_val_predict(svm_clf, X_val_scaled, y_val, cv=3)

# Create a new training set with predictions
X_train_with_preds = np.c_[X_train_scaled[:len(y_val_pred_rf)], y_val_pred_rf, y_val_pred_extra_trees, y_val_pred_svm]

# Display the shape of the new training set
print(f"Shape of the new training set: {X_train_with_preds.shape}")

Shape of the new training set: (3500, 787)


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fit individual classifiers
rf_clf.fit(X_train, y_train)
extra_trees_clf.fit(X_train, y_train)
svm_clf.fit(X_train_scaled, y_train)

# Get predictions from individual classifiers on the validation set
rf_preds = rf_clf.predict(X_val)
extra_trees_preds = extra_trees_clf.predict(X_val)
svm_preds = svm_clf.predict(X_val_scaled)

# Create a new training set with predictions from individual classifiers
X_train_with_preds = np.c_[X_train_scaled, rf_preds, extra_trees_preds, svm_preds]

# Train the ensemble classifier (RandomForestClassifier) on the new training set
ensemble_clf = RandomForestClassifier(n_estimators=100, random_state=42)
ensemble_clf.fit(X_train_with_preds, y_val)

# Make predictions on the test set
X_test_preds = np.c_[X_test_scaled, rf_clf.predict(X_test_scaled), extra_trees_clf.predict(X_test_scaled), svm_clf.predict(X_test_scaled)]
y_test_pred_ensemble = ensemble_clf.predict(X_test_preds)

# Evaluate the accuracy of the ensemble classifier on the test set
accuracy_ensemble = accuracy_score(y_test, y_test_pred_ensemble)
print(f"Accuracy of the ensemble classifier on the test set: {accuracy_ensemble}")


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 63000 and the array at index 1 has size 3500