In [6]:
# three short example messages used to demonstrate how the vectorizer works
simple_train = [
    'call me a cab',
    'call you tonight',
    'please call me .... Please',
]

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
vect= CountVectorizer()

In [13]:
#learn the vocabulary of the training data
vect.fit(simple_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
#examine the fitted vocabulary
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back only on older releases
feature_names_fn = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
# convert to plain str so the displayed list looks the same on every version
[str(name) for name in feature_names_fn()]

['cab', 'call', 'me', 'please', 'tonight', 'you']

In [17]:
#transforming training data into a 'document-term' matrix
simple_train_dtm= vect.transform(simple_train)
simple_train_dtm

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [18]:
# convert sparse matrix to a dense matrix
simple_train_dtm.toarray()

array([[1, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 1],
       [0, 1, 1, 2, 0, 0]])

In [20]:
import pandas as pd
# label each column of the dense document-term matrix with its vocabulary term
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on older releases
feature_names_fn = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
pd.DataFrame(simple_train_dtm.toarray(), columns=feature_names_fn())

Unnamed: 0,cab,call,me,please,tonight,you
0,1,1,1,0,0,0
1,0,1,0,0,1,1
2,0,1,1,2,0,0


In this scheme, features and samples are defined as follows:

Each individual token occurrence frequency (normalized or not) is treated as a feature.
The vector of all the token frequencies for a given document is considered a multivariate sample.

A corpus of documents can thus be represented by a matrix with one row per document and one column per token (e.g. word) occurring in the corpus.

We call vectorization the general process of turning a collection of text documents into numerical feature vectors. This specific strategy (tokenization, counting and normalization) is called the Bag of Words or "Bag of n-grams" representation. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document.

In [21]:
type(simple_train_dtm)

scipy.sparse.csr.csr_matrix

In [23]:
print(simple_train_dtm)

  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (1, 1)	1
  (1, 4)	1
  (1, 5)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2


As most documents will typically use a very small subset of the words used in the corpus, the resulting matrix will have many feature values that are zeros (typically more than 99% of them).

For instance, a collection of 10,000 short text documents (such as emails) will use a vocabulary with a size in the order of 100,000 unique words in total while each document will use 100 to 1000 unique words individually.

In order to be able to store such a matrix in memory but also to speed up operations, implementations will typically use a sparse representation such as the implementations available in the scipy.sparse package.

In [24]:
# 'dont' is not in the vocabulary learned from simple_train, so transform() does not count it
simple_test=['please dont call me']
simple_test_dtm= vect.transform(simple_test)
simple_test_dtm.toarray()

array([[0, 1, 1, 1, 0, 0]])

In [25]:
# show the test document-term matrix with one column per vocabulary term
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on older releases
feature_names_fn = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
pd.DataFrame(simple_test_dtm.toarray(), columns=feature_names_fn())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


Spam or ham classification of text messages

In [26]:
# read the SMS dataset from the tutorial repository; the file has no header row,
# so the two columns are named explicitly
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
sms = pd.read_table(url, header=None, names=['label', 'message'])

In [27]:
sms.shape

(5572, 2)

In [28]:
sms.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [29]:
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [30]:
# encode the text label as a numeric target column: ham -> 0, spam -> 1
sms['label_num'] = sms['label'].map({'ham': 0, 'spam': 1})

In [31]:
sms.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [32]:
# features are the raw message text; the target is the numeric label
x, y = sms['message'], sms['label_num']

In [33]:
# hold out part of the data for evaluation; random_state=1 makes the split repeatable
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

vectorize the dataset

In [34]:
vect=CountVectorizer()

In [35]:
#data vocabulary created by fitting
vect.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [37]:
# document term matrix created by transforming
x_train_dtm= vect.transform(x_train)

In [39]:
x_test_dtm= vect.transform(x_test)

building and evaluating a model

We will use multinomial Naive Bayes:

The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [40]:
from sklearn.naive_bayes import MultinomialNB
nb= MultinomialNB()

In [42]:
%time nb.fit(x_train_dtm, y_train)

CPU times: user 3.44 ms, sys: 1.77 ms, total: 5.21 ms
Wall time: 4.17 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [43]:
y_pred_class=nb.predict(x_test_dtm)

In [44]:
y_pred_class

array([0, 0, 0, ..., 0, 1, 0])

In [45]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.9885139985642498

In [46]:
# rows are the true classes and columns the predicted classes (scikit-learn's confusion_matrix layout)
metrics.confusion_matrix(y_test, y_pred_class)

array([[1203,    5],
       [  11,  174]])

In [48]:
# column 1 of predict_proba is the predicted probability of class 1 (spam, per the label_num mapping)
y_pred_prob=nb.predict_proba(x_test_dtm)[:,1]
y_pred_prob

array([2.87744864e-03, 1.83488846e-05, 2.07301295e-03, ...,
       1.09026171e-06, 1.00000000e+00, 3.98279868e-09])

In [49]:
metrics.roc_auc_score(y_test,y_pred_prob)

0.9866431000536962

Compare the multinomial Naive Bayes model with logistic regression

In [51]:
from sklearn.linear_model import LogisticRegression

In [52]:
logreg= LogisticRegression()

In [53]:
%time logreg.fit(x_train_dtm, y_train)

CPU times: user 75.7 ms, sys: 5.17 ms, total: 80.9 ms
Wall time: 29.9 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [54]:
y_pred_class= logreg.predict(x_test_dtm)

In [55]:
y_pred_prob= logreg.predict_proba(x_test_dtm)[:,1]

In [56]:
metrics.accuracy_score(y_test,y_pred_class)

0.9877961234745154

In [57]:
metrics.roc_auc_score(y_test, y_pred_prob)

0.9936817612314301

Fine tuning the vectorizer

In [63]:
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

stop_words: string {'english'}, list, or None (default)
If 'english', a built-in stop word list for English is used.
If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
If None, no stop words will be used.

In [64]:
# remove English stop words using the built-in list
vect=CountVectorizer(stop_words='english')

In [65]:

# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))

In [66]:
# ignore terms that appear in more than 50% of the documents
vect = CountVectorizer(max_df=0.5)

In [67]:
# only keep terms that appear in at least 2 documents
# NOTE: every cell in this section rebinds vect, so when the cells are run in order
# this min_df=2 vectorizer is the one that gets fitted next
vect = CountVectorizer(min_df=2)

In [69]:
# learn the vocabulary and build the training document-term matrix in a single pass
x_train_dtm = vect.fit_transform(x_train)
# the test set is only transformed, so it is encoded with the training vocabulary
x_test_dtm = vect.transform(x_test)

In [70]:
# refit logistic regression on the document-term matrices currently bound to x_train_dtm / x_test_dtm
%time logreg.fit(x_train_dtm, y_train)
y_pred_class= logreg.predict(x_test_dtm)
# keep only the second predict_proba column for the AUC computation
y_pred_prob= logreg.predict_proba(x_test_dtm)[:,1]
metrics.roc_auc_score(y_test, y_pred_prob)

CPU times: user 18.3 ms, sys: 1.95 ms, total: 20.2 ms
Wall time: 18.8 ms


0.9937220332915697