In [39]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

2

In [13]:
# Load the tab-separated SMS corpus. Column names are supplied explicitly, so
# the first line of the file is read as data rather than as a header.
# NOTE(review): pandas' default quote handling may merge rows whose text
# contains unbalanced double quotes -- confirm the row count against the file.
data = pd.read_csv(
    './smsspamcollection/SMSSpamCollection.txt',
    sep='\t',
    names=['label', 'msg'],
)
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


3

In [24]:
# Raw message texts are the inputs; labels are binarised so that 'spam' is the
# positive class (1) and every other label maps to 0.
X = data['msg'].values
y = (data['label'] == 'spam').astype('int64').values

4

In [26]:
# Bag-of-words encoding with default settings. The fitted vectorizer is kept in
# `vect` so that new messages can later be encoded with the same vocabulary.
vect = CountVectorizer()
X_transformed = vect.fit_transform(X)

5

In [27]:
# Baseline: F1 of a default logistic regression on the count features,
# averaged over cv=10 cross-validation folds.
np.mean(
    cross_val_score(
        LogisticRegression(),
        X_transformed,
        y,
        scoring='f1',
        cv=10,
    )
)

0.9326402983610631

6

In [29]:
# Hand-written messages used to spot-check the classifier on unseen text.
X_test = [
    "FreeMsg:	Txt:	CALL	to	No:	86888	&	claim	your	reward	of	3	hours	talk	time	to	use	from	your	phone	now!	Subscribe6GB",
    "FreeMsg:	Txt:	claim	your	reward	of	3	hours	talk	time",
    "Have	you	visited	the	last	lecture	on	physics?",
    "Have	you	visited	the	last	lecture	on	physics?	Just	buy	this	book	and	you	will	have	all	materials!	Only	99$",
    "Only	99$",
]

# Train on the full labelled set, then encode the new messages with the
# already-fitted vectorizer so they share the training vocabulary.
clf = LogisticRegression().fit(X_transformed, y)
X_test_transformed = vect.transform(X_test)

# Single-argument print(...) is valid under both Python 2 and Python 3,
# unlike the Python-2-only print statement.
print(' '.join(str(label) for label in clf.predict(X_test_transformed)))

1 1 0 0 0


7

In [30]:
# Mean cross-validated F1 of logistic regression for three n-gram settings:
# bigrams only, trigrams only, and unigrams through trigrams. One loop replaces
# three copy-pasted calls; the order of `results` is unchanged.
results = []
for ngram_range in [(2, 2), (3, 3), (1, 3)]:
    features = CountVectorizer(ngram_range=ngram_range).fit_transform(X)
    scores = cross_val_score(LogisticRegression(), features, y, scoring='f1', cv=10)
    results.append(scores.mean())

# Single-argument print(...) is valid under both Python 2 and Python 3.
print(' '.join('{0:.2f}'.format(score) for score in results))

0.82 0.73 0.93


8

In [31]:
# Mean cross-validated F1 of multinomial naive Bayes for three n-gram settings:
# bigrams only, trigrams only, and unigrams through trigrams. One loop replaces
# three copy-pasted calls; the order of `results` is unchanged.
results = []
for ngram_range in [(2, 2), (3, 3), (1, 3)]:
    features = CountVectorizer(ngram_range=ngram_range).fit_transform(X)
    scores = cross_val_score(MultinomialNB(), features, y, scoring='f1', cv=10)
    results.append(scores.mean())

# Single-argument print(...) is valid under both Python 2 and Python 3.
print(' '.join('{0:.2f}'.format(score) for score in results))

0.65 0.38 0.89


9

In [32]:
# Same default logistic regression, but on TF-IDF weighted features instead of
# raw counts; the result is the mean F1 over cv=10 folds.
np.mean(
    cross_val_score(
        LogisticRegression(),
        TfidfVectorizer().fit_transform(X),
        y,
        scoring='f1',
        cv=10,
    )
)

0.85285995541724557

10

In [41]:
%%time
# TF-IDF features feeding a logistic regression. Wrapping both steps in a
# pipeline lets the grid search tune vectorizer and classifier settings together.
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Keys use the '<step>__<parameter>' form to address a pipeline step's parameter.
params_grid = [{
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
    'vect__binary': [True, False],
    'vect__min_df': [0.01, 0.03, 0.05],
    'vect__max_df': [0.1, 0.15, 0.2],
    'clf__penalty': ['l1', 'l2'],
}]

# Exhaustive search over the grid, scored by F1 with cv=10 folds.
grid_search = GridSearchCV(pipeline, params_grid, scoring='f1', cv=10)
grid_search.fit(X, y)

# Single-argument print(...) is valid under both Python 2 and Python 3.
print(grid_search.best_score_)
print(grid_search.best_params_)

0.913447831757
{'vect__ngram_range': (1, 1), 'clf__penalty': 'l1', 'vect__binary': True, 'vect__min_df': 0.01, 'vect__max_df': 0.2}
CPU times: user 17min 23s, sys: 24.6 s, total: 17min 48s
Wall time: 18min 27s


In [44]:
%%time

# Raw-count features feeding a logistic regression, tuned as a single pipeline.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])

# Keys use the '<step>__<parameter>' form to address a pipeline step's parameter.
params_grid = [{
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vect__binary': [True, False],
    'vect__max_df': [0.1, 0.2],
    'clf__penalty': ['l1', 'l2'],
}]

# Exhaustive search over the grid, scored by F1 with cv=10 folds.
grid_search = GridSearchCV(pipeline, params_grid, scoring='f1', cv=10)
grid_search.fit(X, y)

# Single-argument print(...) is valid under both Python 2 and Python 3.
print(grid_search.best_score_)
print(grid_search.best_params_)

0.935834740277
{'vect__ngram_range': (1, 2), 'clf__penalty': 'l2', 'vect__binary': False, 'vect__max_df': 0.2}
CPU times: user 2min 49s, sys: 3.07 s, total: 2min 52s
Wall time: 2min 18s


11

1) Линейные и байесовские модели неплохо справляются с классификацией текстов

2) TF-IDF не всегда лучше bag-of-words

3) Байесовский классификатор работает лучше при бо́льших статистиках

4) Не стоит бояться добавлять n-граммы больших размеров