In [14]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [15]:
# Load the mushroom CSV into a DataFrame and preview its first rows.
mushrooms_csv = '../Jobsheet6_EnsembleLearning/Data/mushrooms.csv'
df = pd.read_csv(mushrooms_csv)

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [16]:
# Count missing (NaN) entries per column.
# NOTE(review): this only detects real NaN values; if the CSV marks missing
# data with a placeholder string (e.g. '?'), those entries are not counted
# here - verify against the raw file.
df.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [17]:
# Separate the target column from the feature columns.
target_column = 'class'
y = df[target_column]                  # target: only the 'class' column
X = df.drop(columns=[target_column])   # features: every column except 'class'

In [18]:
# Encode the target: fit a LabelEncoder on y and keep the integer codes.
ec_y = LabelEncoder()
y_encoded = ec_y.fit_transform(y)

# Replace every feature column of X with integer codes, fitting a fresh
# LabelEncoder per column. Only the last fitted encoder stays bound to `le`.
# NOTE(review): scikit-learn documents LabelEncoder as intended for targets;
# OrdinalEncoder is the documented tool for feature columns - consider switching.
for column in X.columns.tolist():
    le = LabelEncoder()
    column_codes = le.fit_transform(X[column])
    X[column] = column_codes

feature_shape = X.shape
print(f"Shape of X: {feature_shape}")
print(f"Encoded labels: {y_encoded}")

# Check the number of instances and features
print(feature_shape)

# Check the labels (this prints the original, un-encoded series)
print(y)



Shape of X: (8124, 22)
Encoded labels: [1 0 0 ... 0 1 0]
(8124, 22)
0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: class, Length: 8124, dtype: object


In [19]:
# Split the dataset into stratified training and testing sets. No test_size
# is given, so train_test_split falls back to its default split proportion.
# NOTE(review): the raw labels `y` are used here rather than `y_encoded`;
# scikit-learn classifiers accept string labels, so this works as written.
SEED = 42  # one seed shared by the split and every seeded estimator below
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=SEED)


def fit_and_report(model, label, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and report its accuracy on the evaluation data.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        label: Human-readable model name used in the printed message.
        X_fit, y_fit: Training features and labels passed to ``fit``.
        X_eval, y_eval: Held-out features and labels passed to ``score``.

    Returns:
        The value returned by ``model.score(X_eval, y_eval)``.
    """
    model.fit(X_fit, y_fit)
    accuracy = model.score(X_eval, y_eval)
    print(f"{label} Accuracy: {accuracy:.4f}")
    return accuracy


# 1. Random Forest
rf = RandomForestClassifier(n_estimators=50, random_state=SEED)
rf_accuracy = fit_and_report(rf, "Random Forest", X_train, y_train, X_test, y_test)

# 2. AdaBoost
ada = AdaBoostClassifier(n_estimators=50, random_state=SEED)
ada_accuracy = fit_and_report(ada, "AdaBoost", X_train, y_train, X_test, y_test)

# 3. Stacking Classifier
# The first-layer estimators feed a second StackingClassifier, which acts as
# the final estimator and itself combines its members via LogisticRegression.
layer_one_estimators = [
    ('rf_1', RandomForestClassifier(n_estimators=10, random_state=SEED)),
    ('knn_1', KNeighborsClassifier(n_neighbors=5))
]
layer_two_estimators = [
    # Seeded like every other tree-based estimator in this cell: an unseeded
    # decision tree may pick different splits between runs.
    ('dt_2', DecisionTreeClassifier(random_state=SEED)),
    ('rf_2', RandomForestClassifier(n_estimators=50, random_state=SEED)),
]
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=LogisticRegression())
clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

# Fit the Stacking Classifier and calculate accuracy
stacking_accuracy = fit_and_report(clf, "Stacking Classifier", X_train, y_train, X_test, y_test)

# Calculate the differences in accuracy (positive means stacking scored higher)
rf_vs_stacking_diff = stacking_accuracy - rf_accuracy
ada_vs_stacking_diff = stacking_accuracy - ada_accuracy

print(f"Difference in accuracy (Stacking - Random Forest): {rf_vs_stacking_diff:.4f}")
print(f"Difference in accuracy (Stacking - AdaBoost): {ada_vs_stacking_diff:.4f}")

Random Forest Accuracy: 1.0000




AdaBoost Accuracy: 1.0000
Stacking Classifier Accuracy: 1.0000
Difference in accuracy (Stacking - Random Forest): 0.0000
Difference in accuracy (Stacking - AdaBoost): 0.0000
