In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [2]:
# read the tab-separated file into pandas using a relative path; the file has
# no header row (header=None), so the two column names are supplied here
# NOTE(review): later cells treat `label` as the text input and `message` as
# the numeric target, so the names look swapped relative to the content —
# confirm the file's column order before renaming
path = 'data/example.tsv'
data = pd.read_table(path, header=None, names=['label', 'message'])

In [3]:
# (rows, columns) of the loaded frame
data.shape

(999, 2)

In [4]:
# one minus the mean of the numeric `message` column; if that column is a 0/1
# class indicator (TODO confirm), this is the share of rows in class 0
1 - data.message.mean()

0.5005005005005005

In [5]:
# feature/target assignment: X is passed to the text vectorizer further down,
# y is the target the classifier is trained and scored against
# NOTE(review): the text lives in `label` and the class in `message`, which
# reads backwards — presumably the names given at load time are swapped
X = data.label
y = data.message

In [6]:
from sklearn.model_selection import train_test_split

# hold out a test partition of X and y (no test_size given, fixed random_state
# so the same rows land in each partition on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# report the size of each partition, one shape per line
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(749,)
(250,)
(749,)
(250,)


In [7]:
# instantiate the vectorizer with its default settings (no arguments passed)
vect = CountVectorizer()

In [19]:
# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
# print the matrix itself: printing `vect` would only show the estimator's
# repr, not the (row, column) count entries this step is meant to show
print(X_train_dtm)

  (0, 335)	1
  (0, 1182)	1
  (0, 1317)	1
  (0, 1461)	1
  (1, 52)	1
  (1, 202)	1
  (1, 395)	1
  (1, 596)	1
  (1, 637)	1
  (1, 700)	1
  (1, 709)	1
  (1, 712)	2
  (1, 760)	1
  (1, 769)	1
  (1, 852)	1
  (1, 913)	1
  (1, 1236)	1
  (1, 1323)	4
  (1, 1333)	1
  (1, 1425)	1
  (1, 1489)	1
  (2, 400)	1
  (2, 848)	1
  (2, 1463)	1
  (2, 1541)	1
  :	:
  (746, 400)	1
  (746, 410)	1
  (746, 467)	1
  (746, 624)	1
  (746, 632)	1
  (746, 679)	1
  (746, 712)	2
  (746, 798)	1
  (746, 913)	1
  (746, 1067)	1
  (746, 1184)	1
  (746, 1323)	1
  (746, 1357)	1
  (746, 1432)	1
  (746, 1488)	1
  (747, 469)	1
  (747, 512)	1
  (747, 868)	1
  (747, 905)	1
  (747, 915)	1
  (747, 1062)	1
  (748, 975)	1
  (748, 1014)	1
  (748, 1314)	1
  (748, 1349)	1


In [9]:
# equivalently: combine fit and transform into a single step
# (this re-fits `vect` on the training data and returns its document-term matrix)
X_train_dtm = vect.fit_transform(X_train)

In [10]:
# examine the document-term matrix (the repr shows its shape, dtype and storage format)
X_train_dtm

<749x1543 sparse matrix of type '<class 'numpy.int64'>'
	with 6812 stored elements in Compressed Sparse Row format>

In [11]:
# transform testing data (using fitted vocabulary) into a document-term matrix;
# only transform() is called here, so the vectorizer is not re-fitted on test data
X_test_dtm = vect.transform(X_test)
X_test_dtm

<250x1543 sparse matrix of type '<class 'numpy.int64'>'
	with 1974 stored elements in Compressed Sparse Row format>

In [12]:
# import and instantiate a Multinomial Naive Bayes model (default hyperparameters)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [13]:
# train the model using X_train_dtm (timing it with an IPython "magic command");
# %time is IPython-only syntax, so this line will not run as plain Python
%time nb.fit(X_train_dtm, y_train)

CPU times: user 5.1 ms, sys: 4.36 ms, total: 9.47 ms
Wall time: 5.46 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
# make class predictions for X_test_dtm (one predicted class per test row)
y_pred_class = nb.predict(X_test_dtm)

In [15]:
from sklearn import metrics

# share of test rows whose predicted class matches the true class
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.792

In [16]:
# display the confusion matrix of true vs. predicted test classes
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[100,  33],
       [ 19,  98]])

In [17]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated);
# keep only column index 1 of the per-class probability matrix — presumably
# the positive class, i.e. the second entry of nb.classes_ (confirm)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]

In [18]:
# area under the ROC curve, scored from the predicted probabilities
# rather than from the hard class predictions
metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)

0.8717306085727139