In [1]:
import pandas as pd

In [2]:
# Load the tab-separated dataset. The file has no header row, so the two
# columns are named explicitly.
column_names = ['label', 'message']
df = pd.read_csv('complaints.tsv', sep='\t', header=None, names=column_names)

In [3]:
# Preview the first five rows to confirm both columns parsed as expected.
df.head(n=5)

Unnamed: 0,label,message
0,complaint,broken tubelight
1,complaint,broken fan
2,complaint,tube light broke
3,complaint,room cleaning
4,complaint,clean the room


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(1418, 2)

In [5]:
# Class balance: number of messages per label.
df['label'].value_counts()

spam         747
complaint    671
Name: label, dtype: int64

In [6]:
# Encode the target as integers: complaint -> 1, spam -> 0.
# Any label missing from this mapping would become NaN.
label_to_num = {'complaint': 1, 'spam': 0}
df['label_num'] = df['label'].map(label_to_num)

In [7]:
# Preview again to check the new numeric label column.
df.head(n=5)

Unnamed: 0,label,message,label_num
0,complaint,broken tubelight,1
1,complaint,broken fan,1
2,complaint,tube light broke,1
3,complaint,room cleaning,1
4,complaint,clean the room,1


In [8]:
# Features are the raw message text; the target is the 0/1 encoded label.
X, y = df['message'], df['label_num']

In [9]:
from sklearn.model_selection import train_test_split
# Hold out a test set using scikit-learn's default split size (25% test when
# neither test_size nor train_size is given). random_state pins the shuffle so
# the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings; turns each message into a
# vector of token counts.
vect = CountVectorizer()

In [11]:
# Learn the vocabulary from the training messages only and build the training
# document-term matrix in one step.
X_train_dtm = vect.fit_transform(X_train)

In [12]:
# Encode the test messages with the vocabulary already fitted on the training
# set (transform only, no refit), so the columns line up with X_train_dtm.
X_test_dtm = vect.transform(X_test)

In [13]:
# Follow the scikit-learn IIFP workflow: Import, Instantiate, Fit, Predict.
# Import and instantiate a Multinomial Naive Bayes classifier; default
# hyperparameters are used.
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [14]:
# Train on the training document-term matrix. fit() returns the estimator,
# so the cell echoes its parameters.
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
# Predicted class (0 = spam, 1 = complaint) for every held-out test message.
y_pred_class = nb.predict(X_test_dtm)

In [16]:
# Accuracy on the held-out test set: the fraction of test messages whose
# predicted class equals the true label.
from sklearn import metrics
test_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print('\nTesting accuracy of Naive Bayes model is', test_accuracy)


Testing accuracy of Naive Bayes model is 0.997183098592


In [17]:
# Rows are true labels and columns are predicted labels, both ordered
# 0 (spam) then 1 (complaint). The bottom-left entry therefore counts
# complaints that were predicted as spam.
metrics.confusion_matrix(y_test, y_pred_class)

array([[193,   0],
       [  1, 161]])

In [18]:
# False negatives: test messages whose true label (1 = complaint) is greater
# than the predicted label (0 = spam).
false_negative_mask = y_test.values > y_pred_class
X_test[false_negative_mask]

65    steps paint spill
Name: message, dtype: object

In [19]:
from sklearn.pipeline import Pipeline
# Bundle the vectorizer and classifier into one estimator so raw text can be
# passed straight to predict(), and train it on the full dataset (X, y), not
# just the training split.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('nb2', MultinomialNB()),
]
text_clf = Pipeline(steps=pipeline_steps).fit(X, y)

In [20]:
# Persist the fitted pipeline with joblib, which handles the numpy arrays
# inside scikit-learn estimators more efficiently than the plain pickler.
#
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so import the standalone `joblib` package (installed as a
# scikit-learn dependency) and fall back to the vendored copy only on very old
# installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# compress=9 is the highest compression level: smallest file, slowest write.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
# The return value (list of written filenames) is not needed, and it is not
# the last expression in the cell, so no assignment is required to hide it.
joblib.dump(text_clf, 'spam_data_pugaar.pkl', compress=9)
print('Done pickling...')

Done pickling...
