In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
mushrooms = pd.read_csv("secondary_data.csv", sep = ";")

In [3]:
# Shape of the dataset, response variable and feature names, data type of each column
mushrooms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class                 61069 non-null  object 
 1   cap-diameter          61069 non-null  float64
 2   cap-shape             61069 non-null  object 
 3   cap-surface           46949 non-null  object 
 4   cap-color             61069 non-null  object 
 5   does-bruise-or-bleed  61069 non-null  object 
 6   gill-attachment       51185 non-null  object 
 7   gill-spacing          36006 non-null  object 
 8   gill-color            61069 non-null  object 
 9   stem-height           61069 non-null  float64
 10  stem-width            61069 non-null  float64
 11  stem-root             9531 non-null   object 
 12  stem-surface          22945 non-null  object 
 13  stem-color            61069 non-null  object 
 14  veil-type             3177 non-null   object 
 15  veil-color         

In [4]:
# Summary statistics for the numeric features
mushrooms.describe()

Unnamed: 0,cap-diameter,stem-height,stem-width
count,61069.0,61069.0,61069.0
mean,6.733854,6.581538,12.14941
std,5.264845,3.370017,10.035955
min,0.38,0.0,0.0
25%,3.48,4.64,5.21
50%,5.86,5.95,10.19
75%,8.54,7.74,16.57
max,62.34,33.92,103.91


In [5]:
# Count the number of missing values within each column
mushrooms.isna().sum()

class                       0
cap-diameter                0
cap-shape                   0
cap-surface             14120
cap-color                   0
does-bruise-or-bleed        0
gill-attachment          9884
gill-spacing            25063
gill-color                  0
stem-height                 0
stem-width                  0
stem-root               51538
stem-surface            38124
stem-color                  0
veil-type               57892
veil-color              53656
has-ring                    0
ring-type                2471
spore-print-color       54715
habitat                     0
season                      0
dtype: int64

In [6]:
# Count the number of observations of poisonous and edible mushrooms
mushrooms["class"].value_counts()

class
p    33888
e    27181
Name: count, dtype: int64

In [7]:
# Label encoding the response variable and categorical features
le = LabelEncoder()
mask = mushrooms.isna()
cols_to_encode = mushrooms.columns.drop(["cap-diameter", "stem-width", "stem-height"])

for col in cols_to_encode:
    mushrooms[col] = le.fit_transform(mushrooms[col])

mushrooms = mushrooms.where(~ mask, np.nan)

In [8]:
# Z-transform the numeric features
quantitative_vars = mushrooms[["cap-diameter", "stem-width", "stem-height"]]
mushrooms.drop(["cap-diameter", "stem-width", "stem-height"], axis=1, inplace=True)
sc = StandardScaler()
sc.fit(quantitative_vars)
xscaled=sc.transform(quantitative_vars)

quant_scaled=pd.DataFrame(data=xscaled,columns=mushrooms.columns[[1,9,10]])
mushrooms = pd.concat([quant_scaled, mushrooms], axis = 1)
class_col = mushrooms.pop("class")
mushrooms.insert(0, "class", class_col)
mushrooms

Unnamed: 0,class,cap-shape,stem-surface,stem-color,cap-shape.1,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,...,stem-root,stem-surface.1,stem-color.1,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,1,1.619462,0.492293,3.076705,6,2.0,6,0,2.0,,...,4.0,7.0,11,0.0,4.0,1,2.0,,0,3
1,1,1.873982,0.601900,3.385311,6,2.0,6,0,2.0,,...,4.0,7.0,11,0.0,4.0,1,2.0,,0,2
2,1,1.393432,0.557061,3.328931,6,2.0,6,0,2.0,,...,4.0,7.0,11,0.0,4.0,1,2.0,,0,3
3,1,1.412426,0.381690,2.726555,2,3.0,1,0,2.0,,...,4.0,7.0,11,0.0,4.0,1,5.0,,0,3
4,1,1.501699,0.503254,2.952075,6,3.0,6,0,2.0,,...,4.0,7.0,11,0.0,4.0,1,5.0,,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,1,-1.054903,-0.590822,-0.786809,5,7.0,11,0,3.0,2.0,...,,,12,,,0,1.0,,0,0
61065,1,-1.037808,-0.669539,-1.009362,2,7.0,11,0,3.0,2.0,...,,,12,,,0,1.0,,0,0
61066,1,-1.037808,-0.575875,-0.807581,5,7.0,11,0,3.0,2.0,...,,,12,,,0,1.0,,0,2
61067,1,-1.043506,-0.668543,-0.896602,2,7.0,11,0,3.0,2.0,...,,,12,,,0,1.0,,0,2


In [9]:
mushrooms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class                 61069 non-null  int64  
 1   cap-shape             61069 non-null  float64
 2   stem-surface          61069 non-null  float64
 3   stem-color            61069 non-null  float64
 4   cap-shape             61069 non-null  int64  
 5   cap-surface           46949 non-null  float64
 6   cap-color             61069 non-null  int64  
 7   does-bruise-or-bleed  61069 non-null  int64  
 8   gill-attachment       51185 non-null  float64
 9   gill-spacing          36006 non-null  float64
 10  gill-color            61069 non-null  int64  
 11  stem-root             9531 non-null   float64
 12  stem-surface          22945 non-null  float64
 13  stem-color            61069 non-null  int64  
 14  veil-type             3177 non-null   float64
 15  veil-color         