In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Toy training corpus: every row pairs a fruit name (the label) with a
# whitespace-separated description of that fruit (the document).
texts = np.array([
    ("Apple", "red  sweetness  skin  seeds"),
    ("Strawberry", "red  sweetness  sourness  sweetness"),
    ("Orange", "orange  skin  sourness  sweetness"),
])
df = pd.DataFrame(data=texts, columns=['category', 'text'])

# Documents are the model inputs; fruit names are the targets.
X_train, y_train = df['text'], df['category']
print(X_train, y_train, sep='\n')

0            red  sweetness  skin  seeds
1    red  sweetness  sourness  sweetness
2      orange  skin  sourness  sweetness
Name: text, dtype: object
0         Apple
1    Strawberry
2        Orange
Name: category, dtype: object


In [3]:
# Single unseen document to classify later in the notebook.
X_test = ['skin sweetness sourness']
print(X_test)

# Build the bag-of-words vocabulary from the training documents only.
# The custom token_pattern matches any run of one or more word characters
# as a token.
vectorizer = CountVectorizer(token_pattern='(?u)\\b\\w+\\b')
vectorizer.fit(X_train)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
# The ndarray is converted to a plain list so `vocab` keeps the same type
# (and the same printed form) as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print('Vocabulary size:', len(vocab))
print(vocab)

['skin sweetness sourness']
Vocabulary size: 6
['orange', 'red', 'seeds', 'skin', 'sourness', 'sweetness']


In [4]:
# Encode both corpora as token-count matrices using the vocabulary learned
# from the training documents.
X_train_bow, X_test_bow = (
    vectorizer.transform(docs) for docs in (X_train, X_test)
)

# Show only the sparse-matrix summaries (shape, dtype, stored elements).
print(
    'X_train_bow:',
    repr(X_train_bow),
    'X_test_bow:',
    repr(X_test_bow),
    sep='\n',
)

X_train_bow:
<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>
X_test_bow:
<1x6 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>


In [5]:
# Densify the training count matrix and label it for inspection:
# one row per training label, one column per vocabulary token.
Xbow = pd.DataFrame(
    data=X_train_bow.toarray(),
    columns=vocab,
    index=y_train,
)
display(Xbow)

Unnamed: 0_level_0,orange,red,seeds,skin,sourness,sweetness
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Apple,0,1,1,1,0,1
Strawberry,0,1,0,0,1,2
Orange,1,0,0,1,1,1


In [6]:
# Fit a multinomial Naive Bayes classifier on the training counts
# (alpha is the additive smoothing parameter). `fit` returns the estimator
# itself, so construction and fitting can be chained.
model = MultinomialNB(alpha=1.0).fit(X_train_bow, y_train)

# Class order here is the column order used by `predict_proba`.
print(model.classes_)

# Accuracy on the training set itself (sanity check, not a generalisation estimate).
train_score = model.score(X_train_bow, y_train)
print('Train accuracy:', train_score)

['Apple' 'Orange' 'Strawberry']
Train accuracy: 1.0


In [7]:
# Per-class probabilities for the test document, with columns labelled
# by the classifier's class order.
proba = model.predict_proba(X_test_bow)
results = pd.DataFrame(data=proba, columns=model.classes_)

print('Prediction:')
display(results)

# Predicted label for the test document.
print(model.predict(X_test_bow))

Prediction:


Unnamed: 0,Apple,Orange,Strawberry
0,0.222222,0.444444,0.333333


['Orange']
