# Spam Text Classification

# *Classificador de Spam*

In [81]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
import unicodedata
import re
import spacy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.model_selection import GroupKFold , RandomizedSearchCV , KFold , cross_validate, cross_val_score, train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

from scipy.stats import randint
from spacy.lang.en.stop_words import STOP_WORDS as stopwords


### Load Spam Dataset
### *Carregando o Banco de Dados*



In [82]:
df = pd.read_csv('spam.csv', encoding='latin-1')
del df['Unnamed: 2']
del df['Unnamed: 3']
del df['Unnamed: 4']
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [83]:
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [84]:
df['v2'].value_counts()

Sorry, I'll call later                                                                                                                                                 30
I cant pick the phone right now. Pls send a message                                                                                                                    12
Ok...                                                                                                                                                                  10
7 wonders in My WORLD 7th You 6th Ur style 5th Ur smile 4th Ur Personality 3rd Ur Nature 2nd Ur SMS and 1st \Ur Lovely Friendship\"... good morning dear"               4
Say this slowly.? GOD,I LOVE YOU &amp; I NEED YOU,CLEAN MY HEART WITH YOUR BLOOD.Send this to Ten special people &amp; u c miracle tomorrow, do it,pls,pls do it...     4
                                                                                                                                                      

### Balance Dataset
### *Balanceando o Banco de Dados*


In [85]:
ham = df[df['v1']=='ham']
ham.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...


In [86]:
spam = df[df['v1']=='spam']
spam.head()

Unnamed: 0,v1,v2
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."


In [87]:
ham.shape, spam.shape

((4825, 2), (747, 2))

In [88]:
ham = ham.sample(spam.shape[0])

In [89]:
ham.shape, spam.shape

((747, 2), (747, 2))

In [90]:
data = ham.append(spam, ignore_index=True)

In [91]:
data.sample(5)

Unnamed: 0,v1,v2
1466,spam,Camera - You are awarded a SiPix Digital Camer...
573,ham,"Lets use it next week, princess :)"
93,ham,That is wondar full flim.
728,ham,Anything lor... U decide...
431,ham,Im just wondering what your doing right now?


In [92]:
data.shape

(1494, 2)

In [93]:
data['v1'].value_counts()

ham     747
spam    747
Name: v1, dtype: int64

### Pré Processamento NLP

In [94]:
#functions for process
def cont_to_exp(x):
    if type(x) is str:
        for key in contractions:
            value = contractions[key]
            x = x.replace(key, value)
        return x
    else:
        return x


def make_to_base(x):
    x = str(x)
    x_list = []
    doc = nlp(x)
    
    for token in doc:
        lemma = token.lemma_
        if lemma == '-PRON-' or lemma == 'be':
            lemma = token.text

        x_list.append(lemma)
    return ' '.join(x_list)


def remove_accented_chars(x):
    x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return x


nlp = spacy.load('en_core_web_sm')



In [95]:
#Descri_ process

df['num_palavras'] = df['v2'].apply(lambda x: len(str(x).split()))

#contracoes

contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how does",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    # "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    " u ": " you ",
    " ur ": " your ",
    " n ": " and ",
    "won't": "would not",
    'dis': 'this',
    'bak': 'back',
    'brng': 'bring'
}

# -----

df['v2'] = df['v2'].apply(lambda x: cont_to_exp(x))
df.dtypes
#removendo chracter special
df['v2'] = df['v2'].apply(lambda x: re.sub(r'[^\w ]+', "", x))

#deixando tudo em lower case
df['v2'] = df['v2'].apply(lambda x: str(x).lower())

#removendo espacos multiplos
df['v2'] = df['v2'].apply(lambda x: ' '.join(x.split())) 
#esse joint irá juntar as palavras com espaco simples

#removendo os acentos
df['v2'] = df['v2'].apply(lambda x: remove_accented_chars(x))

#removendo as stopwords
df['v2'] = df['v2'].apply(lambda x: ' '.join([t for t in x.split() if t not in stopwords]))

#transformando na raiz da palavra
df['v2'] = df['v2'].apply(lambda x: make_to_base(x))

### Definindo as Variáveis e os Modelos

In [107]:
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(df['v2'])

y = df['v1']

def imprime_score(scores):
  media = scores.mean() * 100
  desvio = scores.std() * 100
  print("Accuracy médio %.2f" % media)
  print("Intervalo [%.2f, %.2f]" % (media - 2 * desvio, media + 2 * desvio))


### Modelo Dummy


In [98]:
SEED = 301
np.random.seed(SEED)

modelo = DummyClassifier()
results = cross_validate(modelo, x, y, cv = 10, return_train_score=False)
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print("Accuracy com dummy stratified, 10 = [%.2f, %.2f]" % ((media - 2 * desvio_padrao)*100, (media + 2 * desvio_padrao) * 100))

Accuracy com dummy stratified, 10 = [86.43, 86.75]


### Decision Tree

In [105]:
SEED=301
np.random.seed(SEED)

espaco_de_parametros = {
    "max_depth" : [20, 30, 40 ],
    "min_samples_split" : [8, 16, 32],
    "min_samples_leaf" : [2, 4 ],
    "criterion" : ["gini", "entropy"]
}

busca = RandomizedSearchCV(DecisionTreeClassifier(),
                    espaco_de_parametros,
                    n_iter = 8,
                    cv = KFold(n_splits = 5, shuffle=True),
                    random_state = SEED)
busca.fit(x, y)
resultados = pd.DataFrame(busca.cv_results_)
resultados.head()

resultados_ordenados_pela_media = resultados.sort_values("mean_test_score", ascending=False)
for indice, linha in resultados_ordenados_pela_media.iterrows():
  print("%.3f +-(%.3f) %s" % (linha.mean_test_score, linha.std_test_score*2, linha.params))


0.961 +-(0.011) {'min_samples_split': 16, 'min_samples_leaf': 2, 'max_depth': 30, 'criterion': 'gini'}
0.961 +-(0.011) {'min_samples_split': 16, 'min_samples_leaf': 2, 'max_depth': 20, 'criterion': 'gini'}
0.960 +-(0.013) {'min_samples_split': 8, 'min_samples_leaf': 2, 'max_depth': 40, 'criterion': 'entropy'}
0.960 +-(0.008) {'min_samples_split': 16, 'min_samples_leaf': 2, 'max_depth': 40, 'criterion': 'entropy'}
0.959 +-(0.009) {'min_samples_split': 32, 'min_samples_leaf': 4, 'max_depth': 40, 'criterion': 'gini'}
0.955 +-(0.010) {'min_samples_split': 8, 'min_samples_leaf': 4, 'max_depth': 20, 'criterion': 'entropy'}
0.955 +-(0.011) {'min_samples_split': 8, 'min_samples_leaf': 4, 'max_depth': 30, 'criterion': 'entropy'}
0.955 +-(0.010) {'min_samples_split': 32, 'min_samples_leaf': 4, 'max_depth': 40, 'criterion': 'entropy'}


In [108]:
scores = cross_val_score(busca, x, y, cv = KFold(n_splits=5, shuffle=True))
imprime_score(scores)

Accuracy médio 95.93
Intervalo [95.17, 96.68]


In [109]:
melhor = busca.best_estimator_
print(melhor)

DecisionTreeClassifier(max_depth=30, min_samples_leaf=2, min_samples_split=16)


### Random Forest

In [110]:
SEED=301
np.random.seed(SEED)

espaco_de_parametros = {
    "n_estimators" : [10, 50],
    "max_depth" : [20,30,40],
    "min_samples_split" : [8, 16, 32],
    "min_samples_leaf" : [2, 4, 8],
    "bootstrap" : [True, False],
    "criterion" : ["gini", "entropy"]
}

busca = RandomizedSearchCV(RandomForestClassifier(),
                    espaco_de_parametros,
                    n_iter = 8,
                    cv = KFold(n_splits = 5, shuffle=True))
busca.fit(x, y)

resultados = pd.DataFrame(busca.cv_results_)
resultados.head()

resultados_ordenados_pela_media = resultados.sort_values("mean_test_score", ascending=False)
for indice, linha in resultados_ordenados_pela_media[:5].iterrows():
  print("%.3f +-(%.3f) %s" % (linha.mean_test_score, linha.std_test_score*2, linha.params))


0.959 +-(0.012) {'n_estimators': 10, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_depth': 40, 'criterion': 'entropy', 'bootstrap': False}
0.950 +-(0.011) {'n_estimators': 50, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_depth': 30, 'criterion': 'gini', 'bootstrap': False}
0.949 +-(0.009) {'n_estimators': 50, 'min_samples_split': 32, 'min_samples_leaf': 4, 'max_depth': 30, 'criterion': 'gini', 'bootstrap': True}
0.931 +-(0.011) {'n_estimators': 10, 'min_samples_split': 8, 'min_samples_leaf': 8, 'max_depth': 40, 'criterion': 'entropy', 'bootstrap': True}
0.930 +-(0.011) {'n_estimators': 50, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_depth': 20, 'criterion': 'entropy', 'bootstrap': True}


In [111]:
scores = cross_val_score(busca, x, y, cv = KFold(n_splits=5, shuffle=True))
imprime_score(scores)

Accuracy médio 95.62
Intervalo [93.75, 97.49]


In [112]:
melhor = busca.best_estimator_
print(melhor)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=40,
                       min_samples_leaf=2, min_samples_split=8,
                       n_estimators=10)


### Maneira Mais simples, mas encontrando otimos resultados

In [119]:
df.head()

Unnamed: 0,v1,v2,num_palavras
0,ham,jurong point crazy available bugis great world...,20
1,ham,ok lar joke wif oni,6
2,spam,free entry 2 wkly comp win fa cup final tkts 2...,28
3,ham,u dun early hor u c,11
4,ham,nah think go usf life,13


In [122]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['v2'])
X = X.toarray()
X_train, X_test, y_train, y_test = train_test_split(X, df['v1'], test_size = 0.2, random_state = 301, stratify = df['v1'])

In [123]:
X_train.shape, X_test.shape

((4457, 8290), (1115, 8290))

### Random Forest

In [124]:
clf = RandomForestClassifier(n_estimators=100, n_jobs= -1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [125]:
confusion_matrix(y_test, y_pred)

array([[965,   1],
       [ 26, 123]])

In [126]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       966
        spam       0.99      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.98      0.98      0.97      1115



### SVM 

In [127]:
clf = SVC(C = 1000, gamma = 'auto')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [128]:
confusion_matrix(y_test, y_pred)

array([[965,   1],
       [ 42, 107]])

In [129]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       0.99      0.72      0.83       149

    accuracy                           0.96      1115
   macro avg       0.97      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



### Testando o Modelo

In [59]:
clf

SVC(C=1000, gamma='auto')

In [60]:
def predict(x):
    x = tfidf.transform([x])
    x = x.toarray()
    pred = clf.predict(x)
    return pred

In [61]:
predict('Hey, how are you, man? Hope to see you soon next year.')

array(['ham'], dtype=object)

In [62]:
predict('you have got free tickets.')

array(['spam'], dtype=object)