In [5]:
# In this example we predict the next word of a user's input from the
# current word, using a Naive Bayes language model
#
# Author: Fabrício Galende Marques de Carvalho


from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.naive_bayes import MultinomialNB
import numpy as np


# Toy training corpus: one short phrase per document.
documents = [
    "pizza para delivery em são josé dos campos",
    "valor da entrega para meu bairro",
    "comida muito saborosa",
    "pizza são josé dos campos",
    "quanto custa um prato de comida aqui",
]

# Build the adjacent word pairs of every document:
# current_word[j] is immediately followed by next_word[j].
current_word = []   # word k
next_word = []      # word k+1

for doc in documents:
    words = doc.split()
    # All words except the last one are "current" words; all words except
    # the first one are their successors. A document with fewer than two
    # words contributes no pairs.
    current_word.extend(words[:-1])
    next_word.extend(words[1:])

print("Current word:", current_word,"\n")
print("Next word:", next_word, "\n")

# Cast lists to arrays.
# OneHotEncoder works on 2-D (n_samples, n_features) input, so the current
# words are reshaped into a single column; LabelEncoder takes the 1-D array.
current_word_arr = np.array(current_word).reshape(-1,1)
next_word_arr = np.array(next_word)

# One-hot encode the features (current words).
# handle_unknown="ignore" makes transform() encode a word that was never seen
# during fit as an all-zero row instead of raising ValueError, so predicting
# for an out-of-vocabulary input no longer crashes; with no active feature the
# classifier's decision then rests on its class priors alone.
encoder_current = OneHotEncoder(handle_unknown="ignore")
X = encoder_current.fit_transform(current_word_arr)

# Encode the labels to predict (next words) as integer class ids.
encoder_next = LabelEncoder()
y = encoder_next.fit_transform(next_word_arr)

# Train the Naive Bayes model on (current word -> next word) pairs.
model = MultinomialNB()
model.fit(X, y)

# Next-word prediction for a single input word.
input_word = "pizza"
# transform() takes 2-D input, hence the nested list: one sample, one feature.
X_input = encoder_current.transform([[input_word]])
# predict() yields one integer class label per input row.
pred_index = model.predict(X_input)
# Map the integer label back to the word it stands for.
predicted_word = encoder_next.inverse_transform(pred_index)
print(f"Predicted word after '{input_word}': {predicted_word[0]}")



Current word: ['pizza', 'para', 'delivery', 'em', 'são', 'josé', 'dos', 'valor', 'da', 'entrega', 'para', 'meu', 'comida', 'muito', 'pizza', 'são', 'josé', 'dos', 'quanto', 'custa', 'um', 'prato', 'de', 'comida'] 

Next word: ['para', 'delivery', 'em', 'são', 'josé', 'dos', 'campos', 'da', 'entrega', 'para', 'meu', 'bairro', 'muito', 'saborosa', 'são', 'josé', 'dos', 'campos', 'custa', 'um', 'prato', 'de', 'comida', 'aqui'] 

Predicted word after 'pizza': para


  self.y_type_ = type_of_target(y, input_name="y")
  ys_types = set(type_of_target(x) for x in ys)
  y_is_multilabel = type_of_target(y).startswith("multilabel")
  y_type = type_of_target(y)


In [4]:
# Let's examine the probability of every candidate next word for the input,
# listed from the most to the least probable.
probs = model.predict_proba(X_input)[0]   # one input row -> keep row 0
sorted_idx = np.argsort(probs)[::-1]      # ascending argsort, then reversed

# Gather words and probabilities in ranked order and print them pairwise.
ranked_pairs = zip(encoder_next.classes_[sorted_idx], probs[sorted_idx])
for word, p in ranked_pairs:
    print(f"{word}: {p:.3f}")


são: 0.139
para: 0.139
josé: 0.070
dos: 0.070
campos: 0.070
saborosa: 0.037
um: 0.037
meu: 0.037
muito: 0.037
entrega: 0.037
prato: 0.037
em: 0.037
delivery: 0.037
da: 0.037
de: 0.037
custa: 0.037
comida: 0.037
bairro: 0.037
aqui: 0.037
aqui: 0.037
bairro: 0.037
campos: 0.070
comida: 0.037
custa: 0.037
da: 0.037
de: 0.037
delivery: 0.037
dos: 0.070
em: 0.037
entrega: 0.037
josé: 0.070
meu: 0.037
muito: 0.037
para: 0.139
prato: 0.037
saborosa: 0.037
são: 0.139
um: 0.037
