In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import CategoricalNB
from sklearn import preprocessing

## LOAD DATA

In [2]:
data = pd.read_csv('mushrooms.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [4]:
data.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [5]:
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [6]:
data['class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [7]:
numerical_col = data.select_dtypes(['int64','float64'])
numerical_col

0
1
2
3
4
...
8119
8120
8121
8122
8123


In [8]:
categorical = data.select_dtypes(['object'])
categorical

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [9]:
data.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

## Train test split

In [10]:
label = data['class']
label

0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: class, Length: 8124, dtype: object

In [11]:
type(label)

pandas.core.series.Series

- we need as an array

In [12]:
label = label.values

In [13]:
type(label)

numpy.ndarray

In [14]:
x = data.drop(['class'],axis = 1)
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   cap-shape                 8124 non-null   object
 1   cap-surface               8124 non-null   object
 2   cap-color                 8124 non-null   object
 3   bruises                   8124 non-null   object
 4   odor                      8124 non-null   object
 5   gill-attachment           8124 non-null   object
 6   gill-spacing              8124 non-null   object
 7   gill-size                 8124 non-null   object
 8   gill-color                8124 non-null   object
 9   stalk-shape               8124 non-null   object
 10  stalk-root                8124 non-null   object
 11  stalk-surface-above-ring  8124 non-null   object
 12  stalk-surface-below-ring  8124 non-null   object
 13  stalk-color-above-ring    8124 non-null   object
 14  stalk-color-below-ring  

In [15]:
x = x.values

In [16]:
from sklearn.model_selection import train_test_split

In [32]:
train_test_split?

In [17]:
X_train, X_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)
print(X_train.shape , y_train.shape)
print(X_test.shape , y_test.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


## Label Encoding

In [18]:
 ## we are label encoding each feature of the data setdef label_encoding(d):

def label_encoding(d):
    result = np.empty(d.shape)
    l_encoders = []
    for i in range(d.shape[1]):
        le = preprocessing.LabelEncoder()
        result[:,i] = le.fit_transform(d[:,i])
        l_encoders.append(le)
    return l_encoders, result
        

### label encoding of training data

In [19]:
les , xtrain = label_encoding(X_train)
print(xtrain.shape)

(6499, 22)


In [20]:
xtrain

array([[3., 2., 2., ..., 7., 4., 0.],
       [5., 2., 4., ..., 7., 4., 4.],
       [2., 3., 2., ..., 7., 4., 2.],
       ...,
       [2., 3., 4., ..., 3., 5., 4.],
       [3., 2., 2., ..., 7., 4., 4.],
       [3., 0., 3., ..., 7., 2., 1.]])

In [21]:
print(np.unique(xtrain[:,0]))

[0. 1. 2. 3. 4. 5.]


In [22]:
print(len(les))
for i, le in enumerate(les):
    print(i, le.classes_)

22
0 ['b' 'c' 'f' 'k' 's' 'x']
1 ['f' 'g' 's' 'y']
2 ['b' 'c' 'e' 'g' 'n' 'p' 'r' 'u' 'w' 'y']
3 ['f' 't']
4 ['a' 'c' 'f' 'l' 'm' 'n' 'p' 's' 'y']
5 ['a' 'f']
6 ['c' 'w']
7 ['b' 'n']
8 ['b' 'e' 'g' 'h' 'k' 'n' 'o' 'p' 'r' 'u' 'w' 'y']
9 ['e' 't']
10 ['?' 'b' 'c' 'e' 'r']
11 ['f' 'k' 's' 'y']
12 ['f' 'k' 's' 'y']
13 ['b' 'c' 'e' 'g' 'n' 'o' 'p' 'w' 'y']
14 ['b' 'c' 'e' 'g' 'n' 'o' 'p' 'w' 'y']
15 ['p']
16 ['n' 'o' 'w' 'y']
17 ['n' 'o' 't']
18 ['e' 'f' 'l' 'n' 'p']
19 ['b' 'h' 'k' 'n' 'o' 'r' 'u' 'w' 'y']
20 ['a' 'c' 'n' 's' 'v' 'y']
21 ['d' 'g' 'l' 'm' 'p' 'u' 'w']


In [23]:
## label encoding of X_test
x_test = np.empty(X_test.shape)
for i in range(X_test.shape[1]):
    le = les[i]
    x_test[:,i] = le.transform(X_test[:,i])  

In [24]:
x_test

array([[2., 0., 4., ..., 3., 3., 1.],
       [2., 2., 2., ..., 7., 4., 2.],
       [5., 3., 4., ..., 7., 4., 2.],
       ...,
       [5., 3., 4., ..., 7., 4., 4.],
       [3., 3., 4., ..., 7., 4., 4.],
       [5., 0., 9., ..., 1., 5., 4.]])

In [25]:
np.unique(labels)
map_ = {'e':0, 'p':1}
Y_train = np.array([map_[el] for el in y_train])

NameError: name 'labels' is not defined