# Binary Prediction of Poisonous Mushrooms

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
os.chdir('gradients/binary_prediction_of_poisonous_mushrooms')

In [3]:
# pd.set_option('display.max_rows', 20)
# pd.set_option('display.max_columns', 10)
pd.set_option('display.max_colwidth', 180)
pd.set_option('display.precision', 2)
pd.set_option('display.width', 180)
pd.set_option('display.float_format', '{:,.2f}'.format)


## Exploring the dataset

In [4]:
import pandas as pd

train_df = pd.read_csv('data/train.csv')

In [5]:
train_df.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   class                 object 
 2   cap-diameter          float64
 3   cap-shape             object 
 4   cap-surface           object 
 5   cap-color             object 
 6   does-bruise-or-bleed  object 
 7   gill-attachment       object 
 8   gill-spacing          object 
 9   gill-color            object 
 10  stem-height           float64
 11  stem-width            float64
 12  stem-root             object 
 13  stem-surface          object 
 14  stem-color            object 
 15  veil-type             object 
 16  veil-color            object 
 17  has-ring              object 
 18  ring-type             object 
 19  spore-print-color     object 
 20  habitat               object 
 21  season                object 
dtypes: float64(3), int64(1), object(18)
memory

In [7]:
train_df.describe()

Unnamed: 0,id,cap-diameter,stem-height,stem-width
count,3116945.0,3116941.0,3116945.0,3116945.0
mean,1558472.0,6.31,6.35,11.15
std,899784.66,4.66,2.7,8.1
min,0.0,0.03,0.0,0.0
25%,779236.0,3.32,4.67,4.97
50%,1558472.0,5.75,5.88,9.65
75%,2337708.0,8.24,7.41,15.63
max,3116944.0,80.67,88.72,102.9


In [8]:
train_df.columns

Index(['id', 'class', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-height',
       'stem-width', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season'],
      dtype='object')

In [9]:
for column in train_df.columns:
    print(f"{column}: {train_df[column].unique()}\n")

id: [      0       1       2 ... 3116942 3116943 3116944]

class: ['e' 'p']

cap-diameter: [ 8.8   4.51  6.94 ... 38.11 55.63 54.07]

cap-shape: ['f' 'x' 'p' 'b' 'o' 'c' 's' 'd' 'e' 'n' nan 'w' 'k' 'l' '19.29' '5 f' 't'
 'g' 'z' 'a' '2.85' '7 x' 'r' 'u' '3.55' 'is s' 'y' '4.22' '3.6' '21.56'
 'i' '6 x' '24.16' '8' 'm' 'ring-type' '10.13' 'is p' '7.43' 'h' '0.82'
 '10.46' '2.77' '2.94' '12.62' '5.15' '19.04' '4.97' '49.21' 'b f' '9.13'
 '1.66' '3.37' '7.21' '3.25' '11.12' '3 x' '4.3' '7.41' '6.21' '8.29'
 '54.78' '20.25' '3.52' '3.04' '2.63' '3.91' '6.44' '8.3' '7.6' '17.44'
 '4.33' '2.82' '6.53' '19.06']

cap-surface: ['s' 'h' 'y' 'l' 't' 'e' 'g' nan 'd' 'i' 'w' 'k' '15.94' 'f' 'n' 'r' 'o'
 'a' 'u' 'z' '2.7' 'does l' '5.07' 'p' 'b' 'm' 'cap-diameter' '1.43' 'x'
 '7.14' 'c' 'is h' 'does t' '0.85' '6.57' '12.79' '6.45' '4.66' '23.18'
 '3.06' '16.39' '4.21' 'veil-color' '11.78' '8.1' 'has-ring' 'does h'
 '1.42' 'class' 'has h' 'does None' '10.83' 'season' '8.96' '14.04' '5.73'
 'is None' 

In [10]:
train_df.isnull().sum()

id                            0
class                         0
cap-diameter                  4
cap-shape                    40
cap-surface              671023
cap-color                    12
does-bruise-or-bleed          8
gill-attachment          523936
gill-spacing            1258435
gill-color                   57
stem-height                   0
stem-width                    0
stem-root               2757023
stem-surface            1980861
stem-color                   38
veil-type               2957493
veil-color              2740947
has-ring                     24
ring-type                128880
spore-print-color       2849682
habitat                      45
season                        0
dtype: int64

In [11]:
train_df['class'].value_counts()

class
p    1705396
e    1411549
Name: count, dtype: int64

## Cleaning the dataset

In [12]:
# Calculate the percentage of missing values per column
missing_percentage = train_df.isnull().mean() * 100

# Find columns with more than 60% missing values
columns_to_drop = missing_percentage[missing_percentage > 60].index

# Drop these columns from the DataFrame
train_df_cleaned = train_df.drop(columns=columns_to_drop)

# Display the columns dropped and the new DataFrame info
print("Columns dropped due to more than 60% missing values:")
print(columns_to_drop.tolist())

Columns dropped due to more than 60% missing values:
['stem-root', 'stem-surface', 'veil-type', 'veil-color', 'spore-print-color']


In [13]:
train_df_cleaned.isnull().sum()

id                            0
class                         0
cap-diameter                  4
cap-shape                    40
cap-surface              671023
cap-color                    12
does-bruise-or-bleed          8
gill-attachment          523936
gill-spacing            1258435
gill-color                   57
stem-height                   0
stem-width                    0
stem-color                   38
has-ring                     24
ring-type                128880
habitat                      45
season                        0
dtype: int64

In [14]:
train_df_cleaned['cap-diameter'].fillna(train_df_cleaned['cap-surface'].mode()[0], inplace=True)

  train_df_cleaned['cap-diameter'].fillna(train_df_cleaned['cap-surface'].mode()[0], inplace=True)


In [15]:
train_df_cleaned['cap-shape'].fillna(train_df_cleaned['cap-shape'].mode()[0], inplace=True)
train_df_cleaned['cap-surface'].fillna(train_df_cleaned['cap-surface'].mode()[0], inplace=True)
train_df_cleaned['cap-color'].fillna(train_df_cleaned['cap-color'].mode()[0], inplace=True)
train_df_cleaned['does-bruise-or-bleed'].fillna(train_df_cleaned['does-bruise-or-bleed'].mode()[0], inplace=True)
train_df_cleaned['gill-attachment'].fillna(train_df_cleaned['gill-attachment'].mode()[0], inplace=True)
train_df_cleaned['gill-color'].fillna(train_df_cleaned['gill-color'].mode()[0], inplace=True)
train_df_cleaned['stem-color'].fillna(train_df_cleaned['stem-color'].mode()[0], inplace=True)
train_df_cleaned['has-ring'].fillna(train_df_cleaned['has-ring'].mode()[0], inplace=True)
train_df_cleaned['ring-type'].fillna(train_df_cleaned['ring-type'].mode()[0], inplace=True)
train_df_cleaned['habitat'].fillna(train_df_cleaned['habitat'].mode()[0], inplace=True)

In [16]:
train_df_cleaned['gill-spacing_missing'] = train_df_cleaned['gill-spacing'].isnull().astype(int)
train_df_cleaned['gill-spacing'].fillna(train_df_cleaned['gill-spacing'].mode()[0], inplace=True)

In [17]:
train_df_cleaned.isnull().sum()

id                      0
class                   0
cap-diameter            0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-height             0
stem-width              0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
gill-spacing_missing    0
dtype: int64