In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [2]:
msg = pd.read_csv("naivetext.csv", names=["message", "label"])
print("The dimensions of the dataset", msg.shape)
msg["labelnum"] = msg.label.map({"pos": 1, "neg": 0})
X = msg.message
y = msg.labelnum

The dimensions of the dataset (18, 2)


In [3]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
print(xtest.shape)
print(xtrain.shape)
print(ytest.shape)
print(ytrain.shape)
print("train data")
print(xtrain)

(5,)
(13,)
(5,)
(13,)
train data
2        I feel very good about these beers
4                      What an awesome view
7                    I can't deal with this
12                          I love to dance
1                  This is an amazing place
16           We will have good fun tomorrow
9                       My boss is horrible
0                      I love this sandwich
15           That is a bad locality to stay
14                     What a great holiday
17         I went to my enemy's house today
11    I do not like the taste of this juice
5             I do not like this restaurant
Name: message, dtype: object


In [4]:
# Output of the count vectoriser is a sparse matrix.
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm = count_vect.transform(xtest)
print(count_vect.get_feature_names())
df = pd.DataFrame(xtrain_dtm.toarray(), columns=count_vect.get_feature_names())
print(df)  # Tabular representation.
print(xtrain_dtm)  # Sparse matrix representation.

['about', 'amazing', 'an', 'awesome', 'bad', 'beers', 'boss', 'can', 'dance', 'deal', 'do', 'enemy', 'feel', 'fun', 'good', 'great', 'have', 'holiday', 'horrible', 'house', 'is', 'juice', 'like', 'locality', 'love', 'my', 'not', 'of', 'place', 'restaurant', 'sandwich', 'stay', 'taste', 'that', 'the', 'these', 'this', 'to', 'today', 'tomorrow', 'very', 'view', 'we', 'went', 'what', 'will', 'with']
    about  amazing  an  awesome  bad  beers  boss  can  dance  deal  ...  to  \
0       1        0   0        0    0      1     0    0      0     0  ...   0   
1       0        0   1        1    0      0     0    0      0     0  ...   0   
2       0        0   0        0    0      0     0    1      0     1  ...   0   
3       0        0   0        0    0      0     0    0      1     0  ...   1   
4       0        1   1        0    0      0     0    0      0     0  ...   0   
5       0        0   0        0    0      0     0    0      0     0  ...   0   
6       0        0   0        0    0    

In [5]:
# Training Naive Bayes (NB) classifier on training data.
clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)

In [6]:
# Printing the accuracy metrics.
print("Accuracy metrics")
print("Accuracy of the classifer is", metrics.accuracy_score(ytest, predicted))
print("Confusion matrix")
print(metrics.confusion_matrix(ytest, predicted))
print("Recall and Precison ")
print(metrics.recall_score(ytest, predicted))
print(metrics.precision_score(ytest, predicted))

Accuracy metrics
Accuracy of the classifer is 0.8
Confusion matrix
[[3 0]
 [1 1]]
Recall and Precison 
0.5
1.0


In [7]:
"""docs_new = ['I like this place', 'My boss is not my saviour']
X_new_counts = count_vect.transform(docs_new)
predictednew = clf.predict(X_new_counts)
for doc, category in zip(docs_new, predictednew):
print('%s-&gt;%s' % (doc, msg.labelnum[category]))"""

"docs_new = ['I like this place', 'My boss is not my saviour']\nX_new_counts = count_vect.transform(docs_new)\npredictednew = clf.predict(X_new_counts)\nfor doc, category in zip(docs_new, predictednew):\nprint('%s-&gt;%s' % (doc, msg.labelnum[category]))"