In [1]:
# libraries
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## EDA

In [2]:
# Read the training dataset from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="dataset_mushrooms.csv")
data.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7921 entries, 0 to 7920
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     7921 non-null   object
 1   cap-shape                 7921 non-null   object
 2   cap-surface               7921 non-null   object
 3   cap-color                 7921 non-null   object
 4   bruises                   7921 non-null   object
 5   odor                      7921 non-null   object
 6   gill-attachment           7921 non-null   object
 7   gill-spacing              7921 non-null   object
 8   gill-size                 7921 non-null   object
 9   gill-color                7921 non-null   object
 10  stalk-shape               7921 non-null   object
 11  stalk-root                7921 non-null   object
 12  stalk-surface-above-ring  7921 non-null   object
 13  stalk-surface-below-ring  7921 non-null   object
 14  stalk-color-above-ring  

In [4]:
# Per-column flag: True when the column holds at least one missing value.
data.isna().any()

class                       False
cap-shape                   False
cap-surface                 False
cap-color                   False
bruises                     False
odor                        False
gill-attachment             False
gill-spacing                False
gill-size                   False
gill-color                  False
stalk-shape                 False
stalk-root                  False
stalk-surface-above-ring    False
stalk-surface-below-ring    False
stalk-color-above-ring      False
stalk-color-below-ring      False
veil-type                   False
veil-color                  False
ring-number                 False
ring-type                   False
spore-print-color           False
population                  False
habitat                     False
dtype: bool

In [5]:
# Missing values may be encoded as the placeholder '?' instead of NaN.
# Turn the placeholder into NaN on a scratch frame so isnull() can see it.
test = data.replace("?", np.nan)  # np.nan: the np.NaN alias was removed in NumPy 2.0
test.isnull().any()

# True -> that column contains '?'; here only 'stalk-root' is flagged

class                       False
cap-shape                   False
cap-surface                 False
cap-color                   False
bruises                     False
odor                        False
gill-attachment             False
gill-spacing                False
gill-size                   False
gill-color                  False
stalk-shape                 False
stalk-root                   True
stalk-surface-above-ring    False
stalk-surface-below-ring    False
stalk-color-above-ring      False
stalk-color-below-ring      False
veil-type                   False
veil-color                  False
ring-number                 False
ring-type                   False
spore-print-color           False
population                  False
habitat                     False
dtype: bool

## Replace data using column description

In [6]:
# Load the lookup table that maps each column's one-letter codes to labels.
data_definitions = pd.read_csv(filepath_or_buffer="columns_definitions.csv")
data_definitions

Unnamed: 0,column_name,defs
0,class,"{""e"": ""edible"", ""p"": ""poisonous""}"
1,cap-shape,"{""b"": ""bell"", ""c"": ""conical"", ""x"": ""convex"", ""..."
2,cap-surface,"{""f"": ""fibrous"", ""g"": ""grooves"", ""y"": ""scaly"",..."
3,cap-color,"{""n"": ""brown"", ""b"": ""buff"", ""c"": ""cinnamon"", ""..."
4,bruises,"{""t"": ""bruises"", ""f"": ""no""}"
5,odor,"{""a"": ""almond"", ""l"": ""anise"", ""c"": ""creosote"",..."
6,gill-attachment,"{""a"": ""attached"", ""d"": ""descending"", ""f"": ""fre..."
7,gill-spacing,"{""c"": ""close"", ""w"": ""crowded"", ""d"": ""distant""}"
8,gill-size,"{""b"": ""broad"", ""n"": ""narrow""}"
9,gill-color,"{""k"": ""black"", ""n"": ""brown"", ""b"": ""buff"", ""h"":..."


In [7]:
# Preview the definitions keyed by column name. set_index returns a new frame
# and the result is not assigned, so data_definitions itself is left unchanged.
data_definitions.set_index('column_name')

Unnamed: 0_level_0,defs
column_name,Unnamed: 1_level_1
class,"{""e"": ""edible"", ""p"": ""poisonous""}"
cap-shape,"{""b"": ""bell"", ""c"": ""conical"", ""x"": ""convex"", ""..."
cap-surface,"{""f"": ""fibrous"", ""g"": ""grooves"", ""y"": ""scaly"",..."
cap-color,"{""n"": ""brown"", ""b"": ""buff"", ""c"": ""cinnamon"", ""..."
bruises,"{""t"": ""bruises"", ""f"": ""no""}"
odor,"{""a"": ""almond"", ""l"": ""anise"", ""c"": ""creosote"",..."
gill-attachment,"{""a"": ""attached"", ""d"": ""descending"", ""f"": ""fre..."
gill-spacing,"{""c"": ""close"", ""w"": ""crowded"", ""d"": ""distant""}"
gill-size,"{""b"": ""broad"", ""n"": ""narrow""}"
gill-color,"{""k"": ""black"", ""n"": ""brown"", ""b"": ""buff"", ""h"":..."


In [8]:
# Parse each 'defs' cell (a dict literal stored as text) into a real dict,
# keeping the row order of data_definitions so position i belongs to row i.
# ast.literal_eval only accepts literals, so unlike eval() it cannot execute
# code that might be embedded in the CSV file.
columns_dictionaries_list = [
    ast.literal_eval(definition) for definition in data_definitions['defs']
]

columns_dictionaries_list[:5]

[{'e': 'edible', 'p': 'poisonous'},
 {'b': 'bell',
  'c': 'conical',
  'x': 'convex',
  'f': 'flat',
  'k': 'knobbed',
  's': 'sunken'},
 {'f': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth'},
 {'n': 'brown',
  'b': 'buff',
  'c': 'cinnamon',
  'g': 'gray',
  'r': 'green',
  'p': 'pink',
  'u': 'purple',
  'e': 'red',
  'w': 'white',
  'y': 'yellow'},
 {'t': 'bruises', 'f': 'no'}]

In [9]:
# Column names of the dataset, in their original order, plus their count.
features_data = data.columns.tolist()
print(features_data)
print("# of features: ", len(features_data))

['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
# of features:  23


In [10]:
# Column names listed in the definitions file, in row order, plus their count.
features_data_definitions = data_definitions['column_name'].tolist()
print(features_data_definitions)
print("# of features: ", len(features_data_definitions))

['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
# of features:  23


In [11]:
# Names that appear in only one of the two lists; an empty set means the
# dataset and the column definition file use exactly the same column names.
set(features_data) ^ set(features_data_definitions)

set()

In [12]:
# NOTE(review): plain assignment binds a second name to the same DataFrame
# object; no copy is made, so in-place changes through data_copy also show up
# in data. Use data.copy() if an independent frame is wanted - confirm intent.
data_copy = data
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [13]:
# Decode the 'class' codes into readable labels using the first definition
# dict. Assign the result back to the column: calling replace(inplace=True)
# on data_copy['class'] is chained assignment, which raises a FutureWarning
# in pandas 2.2 and no longer updates the frame under copy-on-write.
data_copy['class'] = data_copy['class'].replace(columns_dictionaries_list[0])

In [14]:
data_copy.head() # check if replacement was successful

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,poisonous,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,edible,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,edible,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,poisonous,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,edible,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


Now repeat the replacement for every column, using each column's own definition dictionary.

In [15]:
# data_new is another name for `data` (plain assignment does not copy), so
# the decoded columns are written into the original frame; re-running this
# cell does not start again from the raw codes.
data_new = data
# Pair each column with the definition dict at the same position. Assigning
# the result back avoids replace(inplace=True) on a column selection, which
# is chained assignment and stops updating the frame under copy-on-write.
for column_name, column_codes in zip(features_data, columns_dictionaries_list):
    data_new[column_name] = data_new[column_name].replace(column_codes)

In [16]:
# Spot-check five random rows; a fixed random_state keeps the sample the same
# on every run so the displayed rows are reproducible.
data_new.sample(5, random_state=0)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
5372,poisonous,convex,scaly,brown,no,fishy,free,close,narrow,buff,...,silky,pink,pink,partial,white,one,evanescent,white,several,leaves
7871,poisonous,knobbed,scaly,red,no,foul,free,close,narrow,buff,...,silky,white,white,partial,white,one,evanescent,white,several,paths
241,edible,bell,scaly,yellow,bruises,anise,free,close,broad,white,...,smooth,white,white,partial,white,one,pendant,brown,scattered,meadows
398,edible,convex,scaly,brown,bruises,anise,free,close,broad,brown,...,scaly,white,white,partial,white,one,pendant,brown,scattered,paths
3013,edible,flat,fibrous,red,bruises,none,free,close,broad,brown,...,smooth,gray,gray,partial,white,one,pendant,brown,solitary,woods


## Clean data in submission dataset

In [17]:
# Read the submission dataset from disk and preview its first five rows.
data_submission = pd.read_csv(filepath_or_buffer="submission_mushrooms.csv")
data_submission.head(n=5)

Unnamed: 0.1,Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,x,y,y,f,f,f,c,b,h,...,k,n,b,p,w,o,l,h,v,d
1,1,b,s,y,t,l,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,2,x,f,n,t,n,f,c,b,u,...,s,p,w,p,w,o,p,n,y,d
3,3,k,s,n,f,y,f,c,n,b,...,s,w,p,p,w,o,e,w,v,p
4,4,f,s,e,f,f,f,c,n,b,...,k,w,p,p,w,o,e,w,v,d


In [18]:
# Summarise the submission frame: index range, columns, non-null counts, dtypes.
data_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Unnamed: 0                203 non-null    int64 
 1   cap-shape                 203 non-null    object
 2   cap-surface               203 non-null    object
 3   cap-color                 203 non-null    object
 4   bruises                   203 non-null    object
 5   odor                      203 non-null    object
 6   gill-attachment           203 non-null    object
 7   gill-spacing              203 non-null    object
 8   gill-size                 203 non-null    object
 9   gill-color                203 non-null    object
 10  stalk-shape               203 non-null    object
 11  stalk-root                203 non-null    object
 12  stalk-surface-above-ring  203 non-null    object
 13  stalk-surface-below-ring  203 non-null    object
 14  stalk-color-above-ring    

In [19]:
# Drop the leftover 'Unnamed: 0' column. Rebinding the result (rather than
# inplace=True) together with errors='ignore' lets this cell be re-run
# without raising KeyError once the column is already gone.
data_submission = data_submission.drop(columns=['Unnamed: 0'], errors='ignore')
data_submission.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,y,y,f,f,f,c,b,h,e,...,k,n,b,p,w,o,l,h,v,d
1,b,s,y,t,l,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,x,f,n,t,n,f,c,b,u,t,...,s,p,w,p,w,o,p,n,y,d
3,k,s,n,f,y,f,c,n,b,t,...,s,w,p,p,w,o,e,w,v,p
4,f,s,e,f,f,f,c,n,b,t,...,k,w,p,p,w,o,e,w,v,d


In [20]:
# True for a column means every value in it is present (no missing data).
# notnull().any() only showed that a column is not entirely empty.
data_submission.notnull().all()

cap-shape                   True
cap-surface                 True
cap-color                   True
bruises                     True
odor                        True
gill-attachment             True
gill-spacing                True
gill-size                   True
gill-color                  True
stalk-shape                 True
stalk-root                  True
stalk-surface-above-ring    True
stalk-surface-below-ring    True
stalk-color-above-ring      True
stalk-color-below-ring      True
veil-type                   True
veil-color                  True
ring-number                 True
ring-type                   True
spore-print-color           True
population                  True
habitat                     True
dtype: bool

In [21]:
# Missing values are encoded as '?' in this set; convert them to NaN on a
# scratch frame so the usual null checks can find them.
test = data_submission.replace("?", np.nan)  # np.NaN alias was removed in NumPy 2.0
test[:10]

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,y,y,f,f,f,c,b,h,e,...,k,n,b,p,w,o,l,h,v,d
1,b,s,y,t,l,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,x,f,n,t,n,f,c,b,u,t,...,s,p,w,p,w,o,p,n,y,d
3,k,s,n,f,y,f,c,n,b,t,...,s,w,p,p,w,o,e,w,v,p
4,f,s,e,f,f,f,c,n,b,t,...,k,w,p,p,w,o,e,w,v,d
5,f,y,e,t,n,f,c,b,p,t,...,s,g,g,p,w,o,p,k,y,d
6,f,s,g,f,n,f,w,b,n,t,...,f,w,w,p,w,o,e,n,a,g
7,f,y,n,t,l,f,c,b,n,e,...,y,w,w,p,w,o,p,n,s,g
8,x,s,w,f,n,f,w,b,p,t,...,s,w,w,p,w,o,e,n,a,g
9,x,s,n,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,s,g


In [22]:
# Per-column flag: True when the column holds at least one NaN.
test.isna().any()

cap-shape                   False
cap-surface                 False
cap-color                   False
bruises                     False
odor                        False
gill-attachment             False
gill-spacing                False
gill-size                   False
gill-color                  False
stalk-shape                 False
stalk-root                   True
stalk-surface-above-ring    False
stalk-surface-below-ring    False
stalk-color-above-ring      False
stalk-color-below-ring      False
veil-type                   False
veil-color                  False
ring-number                 False
ring-type                   False
spore-print-color           False
population                  False
habitat                     False
dtype: bool

In [23]:
test['stalk-root'].value_counts(dropna=False) 
# show how many NaN there are: 61 of the 203 rows (about 30% of the column)

# the column definitions already note that this column has missing values,
# so first we can change the data definition

b      89
NaN    61
e      28
c      21
r       4
Name: stalk-root, dtype: int64

In [24]:
# datadef_new is another name for data_submission (plain assignment does not
# copy), so the decoded values are written into data_submission itself;
# re-running this cell does not start again from the raw codes.
datadef_new = data_submission

# Skip position 0 in both sequences: it belongs to 'class', which the
# submission set does not have. Assigning the result back avoids
# replace(inplace=True) on a column selection, which is chained assignment
# and stops updating the frame under copy-on-write.
for column_name, column_codes in zip(features_data[1:], columns_dictionaries_list[1:]):
    datadef_new[column_name] = datadef_new[column_name].replace(column_codes)


In [25]:
# Preview the first ten decoded rows of the submission set.
datadef_new.head(10)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,convex,scaly,yellow,no,foul,free,close,broad,chocolate,enlarging,...,silky,brown,buff,partial,white,one,large,chocolate,several,woods
1,bell,smooth,yellow,bruises,anise,free,close,broad,black,enlarging,...,smooth,white,white,partial,white,one,pendant,brown,numerous,grasses
2,convex,fibrous,brown,bruises,none,free,close,broad,purple,tapering,...,smooth,pink,white,partial,white,one,pendant,brown,solitary,woods
3,knobbed,smooth,brown,no,fishy,free,close,narrow,buff,tapering,...,smooth,white,pink,partial,white,one,evanescent,white,several,paths
4,flat,smooth,red,no,foul,free,close,narrow,buff,tapering,...,silky,white,pink,partial,white,one,evanescent,white,several,woods
5,flat,scaly,red,bruises,none,free,close,broad,pink,tapering,...,smooth,gray,gray,partial,white,one,pendant,black,solitary,woods
6,flat,smooth,gray,no,none,free,crowded,broad,brown,tapering,...,fibrous,white,white,partial,white,one,evanescent,brown,abundant,grasses
7,flat,scaly,brown,bruises,anise,free,close,broad,brown,enlarging,...,scaly,white,white,partial,white,one,pendant,brown,scattered,grasses
8,convex,smooth,white,no,none,free,crowded,broad,pink,tapering,...,smooth,white,white,partial,white,one,evanescent,brown,abundant,grasses
9,convex,smooth,brown,no,none,free,crowded,broad,black,tapering,...,smooth,white,white,partial,white,one,evanescent,brown,scattered,grasses
