In [22]:
import pandas as pd
import spacy
import numpy as np
import re

In [6]:
strSpamSource = 'es_spam.csv'
spamDB = pd.read_csv(strSpamSource, sep=',', names=['label', 'message'])
spamDB = spamDB.iloc[1:]

In [7]:
spamDB.head()

Unnamed: 0,label,message
1,ham,"Ir hasta el punto de jurong, loco .. Disponibl..."
2,ham,lar bien ... Bromas WIF u oni ...
3,spam,Entrada libre en una imagen de obsequio 2 wkly...
4,ham,T Dun decir hor tan temprano ... t r ya contin...
5,ham,"Nah no creo que vaya a USF, que vive por aquí,..."


In [9]:
spamDB.iloc[2, :].message

'Entrada libre en una imagen de obsequio 2 wkly para ganar la Copa FA tkts finales 21o de mayo de 2005. El texto FA a 87121 para recibir la pregunta de entrada (tasa txt std) T y C se aplican 08452810075over18 de'

In [11]:
nlp = spacy.load("es_core_news_sm")

In [12]:
spamDB.shape

(5572, 2)

In [13]:
spamDB.label.value_counts()/len(spamDB)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

In [14]:
ham = spamDB[spamDB['label'] == 'ham']
spam = spamDB[spamDB['label'] == 'spam']
ham.shape, spam.shape

((4825, 2), (747, 2))

In [16]:
ham = ham.sample(2*spam.shape[0])
ham.shape, spam.shape

((1494, 2), (747, 2))

In [18]:
dataset = ham.append(spam, ignore_index=True)
dataset.shape

(2241, 2)

In [20]:
dataset.head()

Unnamed: 0,label,message
0,ham,Por lo que significa que todavía piensan de teju
1,ham,Mamá no va Robinson ya.
2,ham,Resolver d Caso: Un hombre fue encontrado ases...
3,ham,"Espere que todavía no es tan clara, que no est..."
4,ham,Boo estoy en mi camino a mi mamá. Ella está ha...


# Normalización de Texto

In [21]:
corpus = dataset.message
corpus

0        Por lo que significa que todavía piensan de teju
1                                 Mamá no va Robinson ya.
2       Resolver d Caso: Un hombre fue encontrado ases...
3       Espere que todavía no es tan clara, que no est...
4       Boo estoy en mi camino a mi mamá. Ella está ha...
                              ...                        
2236    Quiero sexo explícito en 30 segundos? Anillo 0...
2237    PREGUNTADO SI 3Mobile 0870 canales de conversa...
2238    Tenía su contrato móvil 11 mnths? Última Motor...
2239    RECORDATORIO DE O2: Para obtener 2,50 libras e...
2240    Esta es la segunda vez que hemos intentado 2 d...
Name: message, Length: 2241, dtype: object

In [24]:
def Normalizacion(corpus):
    newCorpus = []
    for doc in corpus:
        newCorpus.append(re.sub(r'[^a-zA-Z0-9\s{1}áéíóúüñÁÉÍÓÚ]', '', doc).lower().strip().rstrip('\n').rstrip('\r\n'))
    return newCorpus

In [None]:
corpus = Normalizacion(corpus)
#corpus

# Tokenización

In [26]:
def tokenizacion(corpus):
    newCorpus = []
    for doc in corpus:
        newCorpus.append(nlp(doc))
    return newCorpus

In [27]:
corpus = tokenizacion(corpus)

# Revomer Stopwords

In [28]:
def removeStops(corpus):
    newCorpus = []
    for doc in corpus:
        s = ""
        for token in doc:
            if(token.is_stop == False):
                s = s + token.text + " "
        newCorpus.append(s.strip())
    return newCorpus   

In [None]:
corpus = removeStops(corpus)
#corpus

# Stemming y Lemmatización

In [30]:
def stemmingLemmating(corpus):
    documents = tokenizacion(corpus)
    newCorpus = []
    for doc in documents:
        s = ""
        for token in doc:
            s = s + token.lemma_ + " "
        newCorpus.append(s.strip())
    return newCorpus

In [None]:
corpus = stemmingLemmating(corpus)
#corpus

# Construcción de Matriz TF.

In [33]:
def listToString(s):
    strX = ""
    for palabra in s:
        strX = strX + palabra + " "
    return strX

In [None]:
strCorpus = listToString(corpus)
#strCorpus

In [35]:
strCorpus = strCorpus.strip().split(' ')

In [None]:
strCorpus

In [37]:
strCorpus = set(strCorpus)

In [38]:
len(strCorpus)

5887

In [39]:
corpusCols = list(strCorpus)
corpusRows = range(0, len(corpus))

In [40]:
def generateEmptyTF(cols, rows):
    outDf = pd.DataFrame(index=rows, columns=cols)
    outDf = outDf.fillna(0)
    return outDf

In [42]:
tfCorpus = generateEmptyTF(corpusCols, corpusRows)
tfCorpus

Unnamed: 0,Unnamed: 1,genuino,accommodationvouchers,6zf,welp,descarado,greatbhaji,mudo,goma,32323,...,amnow,fifa,gastroenteritis,eh,kisi,rojo,representante,desordenado,salgo,yaxxx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2239,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
def calculoTF(corpus, df):
    corpus = tokenizacion(corpus)
    for index, doc in enumerate(corpus):
        docLen = len(doc)
        for word in doc:
            try:
                colIndex = list(df.columns).index(word.text)
                df.iloc[index, colIndex] =  df.iloc[index, colIndex] + 1
            except:
                pass
        df.iloc[index, :] = df.iloc[index, :] / docLen
    return df   

In [44]:
tfMatrix = calculoTF(corpus, tfCorpus)
tfMatrix

Unnamed: 0,Unnamed: 1,genuino,accommodationvouchers,6zf,welp,descarado,greatbhaji,mudo,goma,32323,...,amnow,fifa,gastroenteritis,eh,kisi,rojo,representante,desordenado,salgo,yaxxx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Calculo de IDF

In [45]:
def calculoIDF(df):
    N = df.shape[0]
    valX = (N/df.astype(bool).sum(axis=0))
    idfValues = pd.Series(np.log(valX))
    return idfValues    

In [49]:
corpusIDF = calculoIDF(tfMatrix)
corpusIDF

                         5.635236
genuino                  5.316782
accommodationvouchers    5.517453
6zf                      5.517453
welp                     5.517453
                           ...   
rojo                     5.316782
representante            4.824306
desordenado              5.517453
salgo                    5.517453
yaxxx                    5.517453
Length: 5887, dtype: float64

# Calculo de Matriz TF-IDF

In [50]:
tfidfMatrix = tfMatrix.mul(corpusIDF, axis=1)
tfidfMatrix = tfidfMatrix.fillna(0)
tfidfMatrix

Unnamed: 0,Unnamed: 1,genuino,accommodationvouchers,6zf,welp,descarado,greatbhaji,mudo,goma,32323,...,amnow,fifa,gastroenteritis,eh,kisi,rojo,representante,desordenado,salgo,yaxxx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Entrenamiento

In [62]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidfMatrix, dataset['label'], test_size=0.3, random_state=2021, shuffle=True)

In [63]:
X_train.shape

(1568, 5887)

In [64]:
X_test.shape

(673, 5887)

In [65]:
def encode(x):
    if(x == "ham"):
        return 0
    else:
        return 1

In [66]:
y_train = list(map(encode, y_train))
y_train[0:10]

[0, 1, 1, 0, 0, 0, 1, 1, 0, 1]

# Random Forest

In [67]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=15, random_state=0)
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=15, random_state=0)

In [68]:
y_preds_rfc = rfc.predict(X_test)
y_preds_rfc

array([1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [69]:
from sklearn.metrics import classification_report

In [71]:
y_test = list(map(encode, y_test))

In [72]:
print(classification_report(y_test, y_preds_rfc))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92       443
           1       1.00      0.65      0.79       230

    accuracy                           0.88       673
   macro avg       0.92      0.82      0.85       673
weighted avg       0.90      0.88      0.87       673



In [73]:
from sklearn.svm import SVC

svmc = SVC(kernel='linear')
svmc.fit(X_train, y_train)

SVC(kernel='linear')

In [74]:
y_preds_svm = svmc.predict(X_test)
y_preds_svm

array([1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,

In [75]:
print(classification_report(y_test, y_preds_svm))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       443
           1       0.91      0.94      0.93       230

    accuracy                           0.95       673
   macro avg       0.94      0.95      0.94       673
weighted avg       0.95      0.95      0.95       673

