In [1]:
import pandas as pd

df= pd.read_csv('mushroom.csv')

In [2]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,ruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,e,x,f,n,f,n,f,w,n,b,...,y,w,p,,n,o,p,w,v,
1,p,,y,g,t,,f,c,b,k,...,s,n,c,p,w,n,e,,y,g
2,e,b,y,n,t,n,f,c,,n,...,s,p,,p,w,o,p,b,y,w
3,e,x,g,g,t,n,f,w,b,n,...,s,p,,p,w,n,n,,,d
4,e,,f,,t,n,a,w,n,n,...,k,,w,p,w,,l,w,v,d


In [3]:
df.shape

(25986, 23)

In [4]:
df['class'].value_counts()

class
e    14354
p    11632
Name: count, dtype: int64

In [None]:
print("Missing values before Imputation:")
df.isnull().sum()   

Missing values before Imputation:


class                          0
cap-shape                   3473
cap-surface                 3479
cap-color                   3459
ruises                      3472
odor                        3450
gill-attachment             3481
gill-spacing                3399
gill-size                   3492
gill-color                  3568
stalk-shape                 3474
stalk-root                  3528
stalk-surface-above-ring    3508
stalk-surface-below-ring    3423
stalk-color-above-ring      3573
stalk-color-below-ring      3433
veil-type                   3497
veil-color                  3503
ring-number                 3489
ring-type                   3508
spore-print-color           3493
population                  3511
habitat                     3484
dtype: int64

In [6]:
for col in df.columns:
    if df[col].isnull().sum() > 0:
        mode = df[col].mode()[0]
        df[col].fillna(mode, inplace=True)

print("Missing values after Imputation:")
df.isnull().sum()

Missing values after Imputation:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode, inplace=True)


class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
ruises                      0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [7]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,ruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,e,x,f,n,f,n,f,w,n,b,...,y,w,p,p,n,o,p,w,v,d
1,p,x,y,g,t,n,f,c,b,k,...,s,n,c,p,w,n,e,w,y,g
2,e,b,y,n,t,n,f,c,b,n,...,s,p,w,p,w,o,p,b,y,w
3,e,x,g,g,t,n,f,w,b,n,...,s,p,w,p,w,n,n,w,v,d
4,e,x,f,n,t,n,a,w,n,n,...,k,w,w,p,w,o,l,w,v,d


In [8]:
X = df.drop('class', axis=1)
y = df['class']

In [9]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X).toarray()

print(X_encoded.shape)

(25986, 117)


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [12]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Support Vector Machine': SVC(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

In [13]:
results = []

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    results.append((name, acc, prec, rec, f1))

    print(f"/n{name} Results:")
    print(classification_report(y_test, y_pred))

results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])


Training Logistic Regression...
/nLogistic Regression Results:
              precision    recall  f1-score   support

           e       0.74      0.80      0.77      2873
           p       0.73      0.66      0.69      2325

    accuracy                           0.74      5198
   macro avg       0.74      0.73      0.73      5198
weighted avg       0.74      0.74      0.73      5198

Training Decision Tree...
/nDecision Tree Results:
              precision    recall  f1-score   support

           e       0.68      0.67      0.68      2873
           p       0.60      0.62      0.61      2325

    accuracy                           0.65      5198
   macro avg       0.64      0.64      0.64      5198
weighted avg       0.65      0.65      0.65      5198

Training Support Vector Machine...
/nSupport Vector Machine Results:
              precision    recall  f1-score   support

           e       0.74      0.84      0.78      2873
           p       0.76      0.63      0.69      2325


In [14]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.736629,0.735955,0.736629,0.734936
1,Decision Tree,0.646595,0.647489,0.646595,0.646964
2,Support Vector Machine,0.746441,0.747931,0.746441,0.742997
3,Random Forest,0.738746,0.738579,0.738746,0.736401


In [None]:
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out())


In [None]:
y_df = pd.DataFrame(y)  

In [None]:
df = pd.concat([X_encoded_df, y_df], axis=1)    

In [20]:
df.to_csv('mushroom_encoded.csv', index=False)