In [45]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [33]:
# 1, 2, 3

def get_series_ids(x):
    values = np.unique(x)
    values2nums = dict(zip(values,range(len(values))))
    return x.replace(values2nums)

data = pd.read_csv("./SMSSpamCollection", sep='\t', header=None, names=('type', 'message'))
data['booltype'] = get_series_ids(data['type'])

data.head()

Unnamed: 0,type,message,booltype
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [43]:
# 4

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data.message)

# 5

cls = LogisticRegression()
result = cross_val_score(cls, X, data.booltype, scoring="f1", cv=10)
print("Average score = {}, std = {}".format(np.mean(result), np.std(result)))

Average score = 0.9326402983610631, std = 0.019563821457512873


In [36]:
# 6


test_messages = vectorizer.transform(
                ["FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",            
                "FreeMsg: Txt: claim your reward of 3 hours talk time",
                "Have you visited the last lecture on physics?",
                "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
                "Only 99$"
                ])

cls.fit(X, data.booltype)
cls.predict(test_messages)

array([1, 1, 0, 0, 0])

In [30]:
# 7

ngram_parameters = ((2, 2), (3, 3), (1, 3))
for ngramm_parameter in ngram_parameters:
    vectorizer = CountVectorizer(ngram_range=ngramm_parameter)
    X = vectorizer.fit_transform(data.message)
    cls = LogisticRegression()
    result = cross_val_score(cls, X, data.booltype, scoring="f1", cv=10)
    print("{:.2}".format(np.mean(result)), end=' ')

0.82 0.73 0.93 

In [39]:
# 8

ngram_parameters = ((2, 2), (3, 3), (1, 3))
for ngramm_parameter in ngram_parameters:
    vectorizer = CountVectorizer(ngram_range=ngramm_parameter)
    X = vectorizer.fit_transform(data.message)
    cls = MultinomialNB()
    result = cross_val_score(cls, X, data.booltype, scoring="f1", cv=10)
    print("{:.2}".format(np.mean(result)), end=' ')

0.65 0.38 0.89 

In [42]:
# 9

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data.message)

cls = LogisticRegression()
result = cross_val_score(cls, X, data.booltype, scoring="f1", cv=10)
print("Average score = {}, std = {}".format(np.mean(result), np.std(result)))

Average score = 0.8528599554172456, std = 0.023836421522097122


In [51]:
# 10

ngramm_parameters = [(1, 1), (1, 2), (1, 3), (2, 2), (3, 3), (2, 3)]

for ngramm_parameter in ngramm_parameters:

    print("ngramm = {}:\n".format(ngramm_parameter))
    
    vectorizer = CountVectorizer(ngram_range=ngramm_parameter)
    X = vectorizer.fit_transform(data.message)

    tuned_parameters = [{"C":[0.01, 0.1, 0.5, 1, 5, 10, 100, 200, 500, 1000, 10000, 15000, 20000, 100000],
                        "penalty":["l1", "l2"],
                        "class_weight":["balanced", None]}]

    clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=10, scoring="f1")
    clf.fit(X, data.booltype)

    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("{} (+/-{}) for {}".format(mean, std * 2, params))
        
    print("\n\n")

ngramm = (1, 1):

0.6795479148701923 (+/-0.1010548213270063) for {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l1'}
0.9030349566448546 (+/-0.02621385937695146) for {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l2'}
0.18482244854281868 (+/-0.12098200276467706) for {'C': 0.01, 'class_weight': None, 'penalty': 'l1'}
0.7836881367443778 (+/-0.10121237514203307) for {'C': 0.01, 'class_weight': None, 'penalty': 'l2'}
0.868247011008705 (+/-0.04655922460507054) for {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1'}
0.92788983623875 (+/-0.02877367741280242) for {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2'}
0.8348155692909052 (+/-0.07541402022425826) for {'C': 0.1, 'class_weight': None, 'penalty': 'l1'}
0.9042140591851484 (+/-0.045047088691114304) for {'C': 0.1, 'class_weight': None, 'penalty': 'l2'}
0.9122407762189787 (+/-0.03847639808073524) for {'C': 0.5, 'class_weight': 'balanced', 'penalty': 'l1'}
0.9443520624741927 (+/-0.03447965624291134) for {'C': 0.5, 'class_we

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.16423241127786756 (+/-0.0638534973785381) for {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l1'}
0.8444718611492508 (+/-0.0597169531272143) for {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l2'}
0.0 (+/-0.0) for {'C': 0.01, 'class_weight': None, 'penalty': 'l1'}
0.04177583898953907 (+/-0.03369633688348938) for {'C': 0.01, 'class_weight': None, 'penalty': 'l2'}
0.7598780533693077 (+/-0.08027055159074782) for {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1'}
0.8618670047597817 (+/-0.045564003515070235) for {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2'}
0.44595278840280433 (+/-0.1471672397572264) for {'C': 0.1, 'class_weight': None, 'penalty': 'l1'}
0.7023882667189687 (+/-0.10232427623136896) for {'C': 0.1, 'class_weight': None, 'penalty': 'l2'}
0.8620200628452448 (+/-0.06533909328271975) for {'C': 0.5, 'class_weight': 'balanced', 'penalty': 'l1'}
0.867008730494857 (+/-0.04710018152523668) for {'C': 0.5, 'class_weight': 'balanced', 'penalty': 'l2'}
0.784744258

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

0.0 (+/-0.0) for {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l1'}
0.7432166351007441 (+/-0.03981533975073418) for {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l2'}
0.0 (+/-0.0) for {'C': 0.01, 'class_weight': None, 'penalty': 'l1'}
0.0 (+/-0.0) for {'C': 0.01, 'class_weight': None, 'penalty': 'l2'}
0.57179323447559 (+/-0.09786965967860109) for {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1'}
0.7659865115413744 (+/-0.04180706607641902) for {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2'}
0.09401783416519585 (+/-0.05433722123426959) for {'C': 0.1, 'class_weight': None, 'penalty': 'l1'}
0.4496557612014635 (+/-0.05967529160244656) for {'C': 0.1, 'class_weight': None, 'penalty': 'l2'}
0.7666274327681598 (+/-0.05985255643894461) for {'C': 0.5, 'class_weight': 'balanced', 'penalty': 'l1'}
0.768943242479041 (+/-0.04731834882578984) for {'C': 0.5, 'class_weight': 'balanced', 'penalty': 'l2'}
0.5503597681015249 (+/-0.11864032840072714) for {'C': 0.5, 'class_weight': 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.16423241127786756 (+/-0.0638534973785381) for {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l1'}
0.8301849003528887 (+/-0.047528467493686284) for {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l2'}
0.0 (+/-0.0) for {'C': 0.01, 'class_weight': None, 'penalty': 'l1'}
0.12713367330242437 (+/-0.07001196354661796) for {'C': 0.01, 'class_weight': None, 'penalty': 'l2'}
0.7598780533693077 (+/-0.08027055159074782) for {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1'}
0.8365934646459611 (+/-0.045885291506983256) for {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2'}
0.44595278840280433 (+/-0.1471672397572264) for {'C': 0.1, 'class_weight': None, 'penalty': 'l1'}
0.7163314931348259 (+/-0.0828728158487465) for {'C': 0.1, 'class_weight': None, 'penalty': 'l2'}
0.8620989704193749 (+/-0.06919683326842865) for {'C': 0.5, 'class_weight': 'balanced', 'penalty': 'l1'}
0.8399737422257336 (+/-0.04317275291980917) for {'C': 0.5, 'class_weight': 'balanced', 'penalty': 'l2'}
0.7886309

Наилучший результат для логрегрессии дает модель 
0.9513184736243894 (+/-0.032912094284879304) for {'C': 100, 'class_weight': 'balanced', 'penalty': 'l2'}
при использовании только униграмм.

Видно, что большинство моделей дает примерно одинаковый результат.

Выводы: нужно пробовать разные способы получать признаки и методы обучения. На других датасетах результаты могут отличаться.