<center><u><H1>Detector de Spam</H1></u></center>

In [1]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
def preprocessing(text):
    words = word_tokenize(text)
    tokens = [w for w in words if w.lower() not in string.punctuation]
    stopw = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopw]
    tokens = [word for word in tokens if len(word)>=3]    
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text 

In [2]:
sms001 = ['spam', "Had your mobile 11 months or more? U R entitled to update to the latest smartphones for Free! Call today The Smartphone Update Co FREE on 08008000800"]
sms002 = ['ham', "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, ok? I've cried enough today."]

In [4]:
sms001_p = preprocessing(sms001[1])
sms001_p

'Had mobile months entitled update latest smartphones Free Call today The Smartphone Update FREE 08008000800'

In [5]:
sms002_p = preprocessing(sms002[1])
sms002_p

"gon home soon n't want talk stuff anymore tonight 've cried enough today"

In [6]:
df = pd.read_csv('../data/spam.csv', encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
df = df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
df = df.rename(columns = {'v1':'Class','v2':'Text'})
df.head(10)

Unnamed: 0,Class,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [8]:
df['Text_p'] = df['Text'].apply(preprocessing)
df.head(10)

Unnamed: 0,Class,Text,Text_p
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy.. Available bugis great wor...
1,ham,Ok lar... Joking wif u oni...,lar ... Joking wif oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry wkly comp win Cup final tkts 21st M...
3,ham,U dun say so early hor... U c already then say...,dun say early hor ... already say ...
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah n't think goes usf lives around though
5,spam,FreeMsg Hey there darling it's been 3 week's n...,FreeMsg Hey darling week word back like fun st...
6,ham,Even my brother is not like to speak with me. ...,Even brother like speak They treat like aids p...
7,ham,As per your request 'Melle Melle (Oru Minnamin...,per request 'Melle Melle Oru Minnaminunginte N...
8,spam,WINNER!! As a valued network customer you have...,WINNER valued network customer selected receiv...
9,spam,Had your mobile 11 months or more? U R entitle...,Had mobile months entitled Update latest colou...


## Convert a collection of text documents to a Tf-idf matrix

In [9]:
X = df['Text_p']
y = df['Class']

In [10]:
X.shape

(5572,)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state= 2019)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vectorizer = TfidfVectorizer(min_df=0.0025, max_df=0.1, ngram_range=(1, 2),
                         stop_words='english', norm ='l2')

In [14]:
X_std = vectorizer.fit_transform(X)
X_train_v = vectorizer.transform(X_train)
X_test_v = vectorizer.transform(X_test)

In [15]:
print(vectorizer.vocabulary_)

{'crazy': 127, 'available': 50, 'great': 220, 'world': 597, 'got': 218, 'wat': 568, 'lar': 276, 'wif': 582, 'free': 190, 'entry': 171, 'wkly': 589, 'win': 585, 'final': 182, 'text': 503, 'receive': 427, 'question': 417, 'txt': 541, 'rate': 420, 'apply': 44, 'free entry': 191, 'dun': 162, 'say': 444, 'early': 165, 'think': 511, 'goes': 209, 'freemsg': 192, 'hey': 243, 'week': 573, 'word': 593, 'like': 292, 'fun': 201, 'xxx': 604, 'send': 456, '50': 23, 'brother': 77, 'speak': 483, 'treat': 534, 'set': 462, 'friends': 195, 'winner': 586, 'network': 352, 'customer': 130, 'selected': 454, 'prize': 409, 'claim': 103, 'code': 107, 'valid': 552, 'hours': 253, 'mobile': 337, 'update': 547, 'latest': 279, 'colour': 111, 'camera': 86, 'gon': 212, 'home': 248, 'soon': 479, 'want': 564, 'talk': 499, 'stuff': 491, 'tonight': 529, 've': 554, 'today': 522, 'cash': 92, '100': 4, '000': 0, 'pounds': 404, 'cost': 124, '150p': 10, 'day': 138, '16': 13, 'reply': 429, 'info': 263, 'urgent': 548, 'www': 602

In [30]:
len(vectorizer.vocabulary_)

612

In [16]:
print(X_std.shape)
print(X_train_v.shape)
print(X_test_v.shape)

(5572, 612)
(3900, 612)
(1672, 612)


In [17]:
weights = np.asarray(X_train_v.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': vectorizer.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
297,ll,0.017225
292,like,0.01701
113,come,0.016933
272,know,0.016252
519,time,0.01584
218,got,0.015685
248,home,0.014387
214,good,0.014292
564,want,0.013994
211,going,0.013639


## Naive Bayes Classifier

In [18]:
nb_model = MultinomialNB().fit(X_train_v, y_train)

In [19]:
y_pred_nb = nb_model.predict(X_test_v)

In [20]:
print(accuracy_score(y_test, y_pred_nb))

0.9808612440191388


In [21]:
print(confusion_matrix(y_test, y_pred_nb))

[[1446    6]
 [  26  194]]


In [22]:
scores = cross_val_score(nb_model, X_std, y, cv=10)
print(f"Cross Val Scores:{scores}")

Cross Val Scores:[0.98566308 0.97849462 0.97311828 0.97849462 0.96774194 0.97307002
 0.97845601 0.98381295 0.97482014 0.97302158]


In [23]:
print(f"Cross Val Scores Mean:{scores.mean()} / Cross Val Scores Std:{scores.std()}")

Cross Val Scores Mean:0.9766693253377838 / Cross Val Scores Std:0.00513579057166201


## Support Vector Machines

In [24]:
cls_svm = SVC(gamma='scale', kernel='rbf', C=10).fit(X_train_v, y_train)

In [25]:
y_pred_svm = cls_svm.predict(X_test_v)

In [26]:
print(accuracy_score(y_test, y_pred_svm))

0.9832535885167464


In [27]:
print(confusion_matrix(y_test, y_pred_svm))

[[1449    3]
 [  25  195]]


In [28]:
scores = cross_val_score(cls_svm, X_std, y, cv=10)
print(f"Cross Val Scores:{scores}")

Cross Val Scores:[0.9874552  0.97491039 0.97849462 0.98566308 0.96953405 0.97666068
 0.97845601 0.97661871 0.98201439 0.97841727]


In [29]:
print(f"Cross Val Scores Mean:{scores.mean()} / Cross Val Scores Std:{scores.std()}")

Cross Val Scores Mean:0.9788224403971348 / Cross Val Scores Std:0.004929718538649726


## References:

https://www.nltk.org/data.html

http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

http://www.nltk.org/book/ch06.html

http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html