In [2]:
import pandas as pd
# Dataset from - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
df = pd.read_table('smsspamcollection/SMSSpamCollection',
                   sep='\t', 
                   header=None, 
                   names=['label', 'sms_message'])
df['label'] = df.label.map({'ham':0, 'spam':1})
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
print(count_vector)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


By default, count_vector apply lowercase to the text and remove the punctuation. Usually if stop_words is set to English, then frequent words are removed. By default is None.


In [4]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], 
                                                    df['label'], 
                                                    random_state=1)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [6]:
# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

Multinomial Naive Bayes is suitable for classification with discrete features. It takes in integer word counts as its input. On the other hand Gaussian Naive Bayes is better suited for continuous data as it assumes that the input data has a Gaussian(normal) distribution.


In [7]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)
predictions = naive_bayes.predict(testing_data)

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def print_res(y_test, predictions):
    print('Accuracy score: %0.2f%%' % (accuracy_score(y_test, predictions) * 100))
    print('Precision score: %0.2f%%' % (precision_score(y_test, predictions) * 100))
    print('Recall score: %0.2f%%' % (recall_score(y_test, predictions) * 100))
    print('F1 score: %0.2f%%' % (f1_score(y_test, predictions) * 100))
print_res(y_test, predictions)

Accuracy score: 98.85%
Precision score: 97.21%
Recall score: 94.05%
F1 score: 95.60%


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
training_data_idf = tfidf_vectorizer.fit_transform(X_train)
testing_data_idf = tfidf_vectorizer.transform(X_test)
naive_bayes_idf = MultinomialNB()
naive_bayes_idf.fit(training_data_idf, y_train)
predictions_idf = naive_bayes.predict(testing_data_idf)
print_res(y_test, predictions_idf)


Accuracy score: 98.85%
Precision score: 100.00%
Recall score: 91.35%
F1 score: 95.48%


In [19]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
training_data_idf = tfidf_vectorizer.fit_transform(X_train)
testing_data_idf = tfidf_vectorizer.transform(X_test)
naive_bayes_idf = MultinomialNB()
naive_bayes_idf.fit(training_data_idf, y_train)
predictions_idf = naive_bayes.predict(testing_data_idf)
print_res(y_test, predictions_idf)

ValueError: dimension mismatch