In [1]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk import stem
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn import preprocessing

Загружаем данные

In [2]:
path = 'smsspamcollection/SMSSpamCollection'
messages = pandas.read_csv(path, sep='\t',
                           names=["label", "message"])

Делаем лейблы бинарными числами, просто так удобнее

In [3]:
messages['bin_label'] = 0

In [4]:
messages.loc[messages['label'] == 'spam', 'bin_label'] = 1

In [5]:
messages.head()

Unnamed: 0,label,message,bin_label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


Мы видим, что датасет не сбалансирован -- неспама сильно больше

In [6]:
print(messages.groupby('bin_label').describe())

                 label                                            message
bin_label                                                                
0         count   4825                                               4825
          unique     1                                               4516
          top      ham                             Sorry, I'll call later
          freq    4825                                                 30
1         count    747                                                747
          unique     1                                                653
          top     spam  Please call our customer service representativ...
          freq     747                                                  4


Проверим, как будет работать dummy:

In [7]:
msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['bin_label'], test_size=0.2)

In [8]:
label_dummy = [0 for _ in range(len(label_test))]

In [9]:
accuracy_score(label_test, label_dummy)

0.85291479820627802

In [10]:
f1_score(label_test, label_dummy)

  'precision', 'predicted', average, warn_for)


0.0

Как будто бы неплохая точность, но на самом деле это просто оттого что у нас в тесте спама меньше чем неспама.

Делаем так, чтобы было попровну спама и неспама

In [11]:
spam = messages[messages['bin_label'] == 1]

In [12]:
len_of_spam = len(spam)

In [13]:
ham = messages[messages['bin_label'] == 0].sample(len_of_spam)

In [14]:
msg_train_spam, msg_test_spam, label_train_spam, label_test_spam = train_test_split(spam['message'], spam['bin_label'], test_size=0.2)

In [15]:
msg_train_ham, msg_test_ham, label_train_ham, label_test_ham = train_test_split(ham['message'], ham['bin_label'], test_size=0.2)

In [16]:
msg_train = pandas.concat([msg_train_ham, msg_train_spam])
msg_test = pandas.concat([msg_test_ham, msg_test_spam])
label_train = pandas.concat([label_train_ham, label_train_spam])
label_test = pandas.concat([label_test_ham, label_test_spam])

сначала делаем простой токенайзер

In [17]:
def tokenize(text):
    text = text.lower()
    all_tokens = word_tokenize(text)
    return all_tokens

и простой классификатор

In [18]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=tokenize)),
    ('classifier', MultinomialNB()),
])

In [19]:
cv_results = cross_val_score(pipeline,
                             msg_train,
                             label_train,
                             cv=10,
                             scoring='accuracy',
                             )

In [20]:
print(cv_results.mean(), cv_results.std())

0.954774011299 0.0150324958953


In [21]:
pipeline.fit(msg_train, label_train)

Pipeline(steps=[('bow', CountVectorizer(analyzer=<function tokenize at 0x7fd773c34e18>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [22]:
label_pred = pipeline.predict(msg_test)

In [23]:
accuracy_score(label_test, label_pred)

0.96666666666666667

In [24]:
f1_score(label_test, label_pred)

0.96666666666666667

Попробуем в токенизации брать только токены где есть буквы

In [25]:
def tokenize(text):
    text = text.lower()
    all_tokens = word_tokenize(text)
    return [token for token in all_tokens if any(char.isalpha() for char in token)]

In [26]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=tokenize)),
    ('classifier', MultinomialNB()),
])
pipeline.fit(msg_train, label_train)
label_pred = pipeline.predict(msg_test)
accuracy_score(label_test, label_pred), f1_score(label_test, label_pred)

(0.95666666666666667, 0.95681063122923593)

Теперь добавим стемминг

In [27]:
snowball = stem.snowball.EnglishStemmer()
def tokenize(text):
    text = text.lower()
    all_tokens = word_tokenize(text)
    only_with_letters = [token for token in all_tokens if any(char.isalpha() for char in token)]
    stems = [snowball.stem(token) for token in only_with_letters]
    return stems

In [28]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=tokenize)),
    ('classifier', MultinomialNB()),
])
pipeline.fit(msg_train, label_train)
label_pred = pipeline.predict(msg_test)
accuracy_score(label_test, label_pred), f1_score(label_test, label_pred)

(0.96333333333333337, 0.96321070234113715)

Добавим стоп слова

In [29]:
en_stopwords = stopwords.words('english')
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=tokenize,
                            stop_words=en_stopwords)),
    ('classifier', MultinomialNB()),
])
pipeline.fit(msg_train, label_train)
label_pred = pipeline.predict(msg_test)
accuracy_score(label_test, label_pred), f1_score(label_test, label_pred)

(0.96333333333333337, 0.96321070234113715)

Добавим tf-idf

In [30]:
en_stopwords = stopwords.words('english')
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=tokenize,
                            stop_words=en_stopwords)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])
pipeline.fit(msg_train, label_train)
label_pred = pipeline.predict(msg_test)
accuracy_score(label_test, label_pred), f1_score(label_test, label_pred)

(0.95999999999999996, 0.96026490066225156)

А тут я выяснил, что меняя max_df почти ничего не меняется. А вот изменяя min_df можно менять точность -- чем выше берем min_df, тем хуже становится точность

In [32]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=tokenize,
                            stop_words=en_stopwords,
                            max_df=0.9,
                            min_df=0.05)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])
pipeline.fit(msg_train, label_train)
label_pred = pipeline.predict(msg_test)
accuracy_score(label_test, label_pred), f1_score(label_test, label_pred)

(0.89666666666666661, 0.8996763754045306)