Reference: scikit-learn `MultinomialNB` documentation — https://ogrisel.github.io/scikit-learn.org/sklearn-tutorial/modules/generated/sklearn.naive_bayes.MultinomialNB.html

In [9]:
import sklearn
import numpy as np
import pandas as pd
import pickle
from sklearn.naive_bayes import MultinomialNB

In [4]:
column_names = ['fruit', 'long', 'sweet', 'yellow', 'seed', 'Brazil']

# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# open pickle files that come from a trusted source.
with open('../fruit_data.pkl', 'rb') as fin:
    data = pickle.load(fin)

# 'fruit' is used further down as the target; the remaining columns are the features.
df = pd.DataFrame(data, columns=column_names)

In [13]:
# Preview the first rows to confirm the columns were loaded as expected.
df.head()

Unnamed: 0,fruit,long,sweet,yellow,seed,Brazil
0,others,0,1,0,1,0
1,bananas,1,0,1,0,0
2,bananas,1,0,1,0,0
3,bananas,0,1,1,0,1
4,oranges,0,1,0,1,0


In [6]:
# Convert each feature column to a numeric dtype; the 'fruit' column is left untouched.
for feature in ('long', 'sweet', 'yellow', 'seed', 'Brazil'):
    df[feature] = pd.to_numeric(df[feature])

In [19]:
# Features: every column except the 'fruit' label. The explicit .copy() makes X
# an independent frame, so adding columns to X later neither touches df nor
# triggers pandas' SettingWithCopyWarning.
X = df.loc[:, df.columns != 'fruit'].copy()
# Target: the fruit name of each row.
y = df['fruit']

In [20]:
# Multinomial naive Bayes classifier created with its default settings.
clf = MultinomialNB()
# Fit on the numeric feature columns; the fitted estimator is shown as the cell output.
clf.fit(X, y) # <- train

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## What happens when we add a non-numeric categorical variable?

In [22]:
# Display the feature matrix (every column of df except 'fruit').
X

Unnamed: 0,long,sweet,yellow,seed,Brazil
0,0,1,0,1,0
1,1,0,1,0,0
2,1,0,1,0,0
3,0,1,1,0,1
4,0,1,0,1,0
...,...,...,...,...,...
1395,0,1,0,1,0
1396,0,0,0,0,1
1397,1,1,1,0,0
1398,0,1,0,1,0


In [28]:
import random

# Build a synthetic, non-numeric feature: every row gets one of three "type" labels.
N_ORGANIC = 245
N_BIO = 315
n_normal = df.shape[0] - N_ORGANIC - N_BIO  # all remaining rows are 'normal'

arr_type = ['organic'] * N_ORGANIC + ['bio'] * N_BIO + ['normal'] * n_normal

# Shuffle with a dedicated, seeded generator so the assignment is identical on
# every run and the global `random` state is left alone.
random.Random(42).shuffle(arr_type)

X['type'] = arr_type

In [29]:
# Preview X now that the string-valued 'type' column has been added.
X.head()

Unnamed: 0,long,sweet,yellow,seed,Brazil,type
0,0,1,0,1,0,normal
1,1,0,1,0,0,normal
2,1,0,1,0,0,organic
3,0,1,1,0,1,bio
4,0,1,0,1,0,normal


Let's train again ...

In [30]:
# Expected to fail: X now contains the string column 'type', which cannot be
# converted to float.
clf.fit(X, y) # <- train

ValueError: could not convert string to float: 'normal'

UH OH ...
 
scikit-learn needs every feature value to be encoded as a number, so let's encode the new column.

In [34]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer 

In [36]:
# One-hot encode the categorical 'type' column, selected by positional index 5
# (the sixth and last column of X). remainder='passthrough' keeps the other
# columns unchanged; they are placed after the encoded columns in the output.
columnTransformer = ColumnTransformer([('encoder',
                                        OneHotEncoder(),
                                        [5])],
                                      remainder='passthrough')
# np.str was only an alias for the builtin str and was removed in NumPy 1.24,
# so the builtin is used directly. The result is an array of text values such
# as '0.0' / '1.0', not numbers.
X = np.array(columnTransformer.fit_transform(X), dtype=str)

In [39]:
# Inspect the transformed X, which is now a NumPy array rather than a DataFrame.
X

array([['0.0', '1.0', '0.0', ..., '0.0', '1.0', '0.0'],
       ['0.0', '1.0', '0.0', ..., '1.0', '0.0', '0.0'],
       ['0.0', '0.0', '1.0', ..., '1.0', '0.0', '0.0'],
       ...,
       ['0.0', '1.0', '0.0', ..., '1.0', '0.0', '0.0'],
       ['0.0', '1.0', '0.0', ..., '0.0', '1.0', '0.0'],
       ['0.0', '0.0', '1.0', ..., '0.0', '1.0', '0.0']], dtype='<U32')

Man, numpy arrays again ... 🤣

In [38]:
# (rows, columns) of the encoded array.
X.shape

(1400, 8)

Why are there 8 columns now? What do you think happened?

In [43]:
# Expected to fail: X was built with a string dtype, so its values are text
# rather than numbers.
clf.fit(X, y) # <- train

  estimator=estimator)


TypeError: cannot perform reduce with flexible type

What?
Again ... I cannot train my model...

In [46]:
# Cast the string array back to floats before fitting. np.float was only an
# alias for the builtin float and was removed in NumPy 1.24, so use float.
clf.fit(X.astype(float), y) # <- train

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

👍