# Text analysis and spam filtering

In [42]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [209]:
def save_txt(a, fname):
    """Write an answer to the file ``<fname>.txt``.

    Parameters
    ----------
    a : iterable or scalar
        If ``a`` is iterable, every element is written as ``str(elem)``
        followed by one space (so the output ends with a trailing space).
        If ``a`` is not iterable, ``str(a)`` is written unchanged.
    fname : str
        File name without extension; ``'.txt'`` is appended.

    Raises
    ------
    Any exception raised while stringifying an element or writing the file
    is propagated instead of being silently swallowed.
    """
    try:
        elems = iter(a)
    except TypeError:
        # Only "not iterable" triggers the scalar fallback; a bare except
        # would also hide real failures (and KeyboardInterrupt).
        content = str(a)
    else:
        content = ''.join(str(elem) + ' ' for elem in elems)

    # Build the content first so a failure above never leaves a
    # half-written answer file behind.
    with open('{}.txt'.format(fname), 'w') as f:
        f.write(content)

In [210]:
def get_pipeline(vectorizer, clf):
    """Chain a vectorizer and a classifier into a single ``Pipeline``.

    Parameters
    ----------
    vectorizer : object placed in the ``'vectorizer'`` step.
    clf : object placed in the ``'classifier'`` step.

    Returns
    -------
    A ``Pipeline`` whose steps are ``'vectorizer'`` followed by
    ``'classifier'``.
    """
    named_steps = [('vectorizer', vectorizer), ('classifier', clf)]
    return Pipeline(steps=named_steps)

### Read and prepare data

In [211]:
# Load the tab-separated SMS corpus. With header=None the columns are named
# 0 and 1: column 0 is compared against 'spam' below, column 1 is used as the
# message text.
sms_table = pd.read_table('SMSSpamCollection.txt', header=None)

# Binary targets: 1 for 'spam', 0 for anything else. Built as a real list
# rather than map(), because `labels` is reused by every later cell and on
# Python 3 map() returns an iterator that is exhausted after one pass.
labels = [1 if label == 'spam' else 0 for label in sms_table[0]]

# Raw message strings, in file order.
texts = list(sms_table[1])

In [212]:
from sklearn.feature_extraction.text import CountVectorizer 
# Fit a CountVectorizer on all messages and encode them in one step.
# Note the vocabulary is learned from the full corpus here, i.e. before any
# cross-validation split is made on `texts_features`.
vectorizer = CountVectorizer()
texts_features = vectorizer.fit_transform(texts)

In [213]:
# Display the matrix summary (shape, dtype, number of stored elements).
texts_features

<5572x8713 sparse matrix of type '<type 'numpy.int64'>'
	with 74169 stored elements in Compressed Sparse Row format>

### Make a classification

In [214]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Seed NumPy's global RNG before evaluating the model.
np.random.seed(2)

# 10-fold cross-validated F1 of logistic regression on the precomputed
# count features.
clf = LogisticRegression()
score = cross_val_score(clf, texts_features, labels, scoring='f1', cv=10)

In [216]:
# Mean F1 over the folds, rounded to one decimal. Computed once and reused
# for both the displayed value and the answer file.
mean_f1 = np.round(score.mean(), decimals=1)
# Parenthesised single-argument print behaves the same as the Python 2
# statement and is also valid on Python 3.
print(mean_f1)
save_txt(mean_f1, 'a1')

0.9


Repeat the evaluation with the vectorizer and classifier combined into a single pipeline

In [217]:
from sklearn.pipeline import Pipeline
# Same 10-fold F1 evaluation, but on the raw texts: the vectorizer is part of
# the estimator handed to cross_val_score instead of being applied up front.
baseline_pipeline = get_pipeline(CountVectorizer(), LogisticRegression())
score = cross_val_score(baseline_pipeline, X=texts, y=labels,
                        cv=10, scoring='f1')
# Parenthesised single-argument print is valid on both Python 2 and 3.
print(np.round(score.mean(), decimals=1))

0.9


### Make a prediction

In [218]:
# Hand-written messages to classify with the fitted model (one per line).
test = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$",
]

In [224]:
# Fit the classifier on the full feature matrix, then label the hand-written
# messages. They are encoded with the already-fitted `vectorizer`
# (transform, not fit_transform), so they share the training vocabulary.
clf.fit(texts_features, labels)
test_features = vectorizer.transform(test)
predicts = clf.predict(test_features)
# Parenthesised single-argument print is valid on both Python 2 and 3.
print(predicts)
save_txt(predicts, 'a2')

[1 1 0 0 0]


### Use ngrams

In [222]:
# Values passed as CountVectorizer(ngram_range=...) in the cells below.
ranges = [
    (2, 2),
    (3, 3),
    (1, 3),
]

In [226]:
# Mean 10-fold F1 of logistic regression for each n-gram range, rounded to
# two decimals. The vectorizer is inside the pipeline given to
# cross_val_score, which receives the raw texts.
ngram_results = []
for cur_range in ranges:
    ngram_pipeline = get_pipeline(
        CountVectorizer(ngram_range=cur_range),
        LogisticRegression(),
    )
    score = cross_val_score(ngram_pipeline, X=texts, y=labels,
                            cv=10, scoring='f1')
    mean_f1 = score.mean()
    ngram_results.append(np.round(mean_f1, decimals=2))

In [228]:
# Show the per-range scores and store them as answer 3.
# Parenthesised single-argument print is valid on both Python 2 and 3.
print(ngram_results)
save_txt(ngram_results, 'a3')

[0.81999999999999995, 0.72999999999999998, 0.93000000000000005]


### Use naive Bayes and ngrams

In [232]:
from sklearn.naive_bayes import MultinomialNB
# Mean 10-fold F1 of multinomial naive Bayes for each n-gram range, rounded
# to two decimals.
ngram_results = []
for cur_range in ranges:
    # No pipeline here: the features are built from the whole corpus up
    # front, so the vocabulary is fitted before cross-validation splits.
    ngram_vectorizer = CountVectorizer(ngram_range=cur_range)
    # NOTE: this rebinds the notebook-level `texts_features` and `clf`
    # that earlier cells also use.
    texts_features = ngram_vectorizer.fit_transform(texts)
    clf = MultinomialNB()
    score = cross_val_score(clf, X=texts_features, y=labels,
                            cv=10, scoring='f1')
    mean_f1 = score.mean()
    ngram_results.append(np.round(mean_f1, decimals=2))

In [233]:
# Show the per-range naive Bayes scores and store them as answer 4.
# Parenthesised single-argument print is valid on both Python 2 and 3.
print(ngram_results)
save_txt(ngram_results, 'a4')

[0.65000000000000002, 0.38, 0.89000000000000001]


### LogReg again and TF-IDF

In [243]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [244]:
# Mean 10-fold F1 of logistic regression on raw token counts.
count_pipeline = get_pipeline(CountVectorizer(), LogisticRegression())
count_fold_scores = cross_val_score(count_pipeline, X=texts, y=labels,
                                    cv=10, scoring='f1')
count_vect_score = count_fold_scores.mean()

In [246]:
# Mean 10-fold F1 of logistic regression on TF-IDF features.
tfidf_pipeline = get_pipeline(TfidfVectorizer(), LogisticRegression())
tfidf_fold_scores = cross_val_score(tfidf_pipeline, X=texts, y=labels,
                                    cv=10, scoring='f1')
tfidf_score = tfidf_fold_scores.mean()

In [248]:
# Print both scores, rounded to three decimals, on one line separated by a
# space. Joining the str() of each value matches what the Python 2 print
# statement produced, and stays valid on Python 3 (where a parenthesised
# two-argument print on Python 2 would show a tuple instead).
rounded_scores = (np.around(count_vect_score, 3), np.around(tfidf_score, 3))
print(' '.join(str(value) for value in rounded_scores))

0.933 0.879


In [250]:
# NOTE(review): presumably -1 records that TF-IDF scored lower than raw
# counts above (0.879 vs 0.933) — confirm the expected answer encoding.
save_txt(-1, 'a5')