# Multinomial Naive Bayes Model

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the sample data set; later cells read its `comment` and `rating` columns.
df = pd.read_csv(filepath_or_buffer='../sample.csv')

The MNB will want integer values, so we'll multiply by a factor of 100 here to get rid of the hundredths place in the ratings:

In [16]:
# Scale ratings by 100 and convert to integers. Round before casting:
# truncating with int() turns float artefacts into off-by-one labels
# (e.g. int(0.29 * 100) == 28, because 0.29 * 100 == 28.999999999999996).
df['rating100factor'] = (df.rating * 100).round().astype(int)

Count word occurrences, vectorize them, and then split the data into training and test sets:

In [17]:
# Bag-of-words features: one column per distinct word token. Case is preserved
# (lowercase=False), so "Good" and "good" are counted as different tokens.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=False,
)
features = vectorizer.fit_transform(df.comment.tolist())

# Keep the count matrix sparse instead of calling .toarray(): both
# train_test_split and MultinomialNB accept scipy sparse matrices, and a dense
# (n_comments x vocabulary) array is almost entirely zeros.
# The fixed random_state keeps the 70/30 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    df.rating100factor.tolist(),
    test_size=0.30,
    random_state=10,
)

Create our first Multinomial Naive Bayes model, feed it the word counts:

In [19]:
# Fit a Multinomial Naive Bayes classifier on the training word counts and
# predict labels for the held-out split. Every distinct rating100factor value
# is treated as its own class label.
mnb_model = MultinomialNB()
mnb_model.fit(X_train, y_train)  # fit() returns the estimator itself; no rebind needed
y_pred_mnb = mnb_model.predict(X_test)

# Fraction of test comments whose predicted label exactly matches the true one.
print(accuracy_score(y_test, y_pred_mnb))

0.2627450980392157
0.2627450980392157


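The first MNB model reaches only ~26% accuracy on the held-out test set (the score printed above is accuracy; precision has not been computed yet).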

In [5]:
# TODO: graph and continue expanding MNB model(s)