In [1]:
import pandas as pd
import spacy 
import re
import spacy.cli

In [2]:
spamDB = pd.read_csv('es_spam.csv')
spamDB.head()

Unnamed: 0,etiqueta,mensaje
0,ham,"Ir hasta el punto de jurong, loco .. Disponibl..."
1,ham,lar bien ... Bromas WIF u oni ...
2,spam,Entrada libre en una imagen de obsequio 2 wkly...
3,ham,T Dun decir hor tan temprano ... t r ya contin...
4,ham,"Nah no creo que vaya a USF, que vive por aquí,..."


In [3]:
spamDB.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   etiqueta  5572 non-null   object
 1   mensaje   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
spacy.cli.download('es_core_news_sm')

Collecting es-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.4.0/es_core_news_sm-3.4.0-py3-none-any.whl (12.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.9/12.9 MB 1.9 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


You should consider upgrading via the '/home/dennis/.cache/pypoetry/virtualenvs/pythondata-ElkPyzMM-py3.10/bin/python -m pip install --upgrade pip' command.


In [5]:
nlp = spacy.load('es_core_news_sm')

In [9]:
spamDB['etiqueta'].value_counts()

ham     4825
spam     747
Name: etiqueta, dtype: int64

In [10]:
ham = spamDB[spamDB['etiqueta'] == 'ham']
spam = spamDB[spamDB['etiqueta'] == 'spam']

In [11]:
ham_sampled = ham.sample(2 * spam.shape[0], random_state=2022)
ham_sampled.shape

(1494, 2)

In [17]:
dataset = pd.concat([spam, ham_sampled], ignore_index=True)

## 1 Normalización de texto

In [19]:
def normalize_text(corpus):
    newCorpus = []
    for doc in corpus:
        newCorpus.append(re.sub('[^a-zA-Z0-9\s{1}áéíóúüñÁÉÍÓÚÑ]', '', doc)\
            .lower().strip().rstrip('\n').rstrip('\r\n'))
    return newCorpus

In [21]:
corpus = dataset['mensaje']
corpus_1 = normalize_text(corpus)
corpus_1

['entrada libre en una imagen de obsequio 2 wkly para ganar la copa fa tkts finales 21o de mayo de 2005 el texto fa a 87121 para recibir la pregunta de entrada tasa txt std t y c se aplican 08452810075over18 de',
 'freemsg hola querida que ha sido 3 semanas de ahora y ninguna palabra de vuelta me gustaría un poco de diversión que compensa todavía tb bien xxx std chgs para enviar   150 a vn',
 'ganador como cliente de red valioso que ha sido seleccionado para receivea   900 recompensa premio para reclamar llamada 09061701461 código de reclamo kl341 sólo es válido 12 horas',
 'tenido sus teléfonos 11 meses o más t r derecho a actualizar a la última móviles de color con la cámara de forma gratuita el móvil llame actualización gratuita co en 08002986030',
 'seis oportunidades para ganar dinero en efectivo de 100 a 20000 libras txt csh11 y enviar a 87575 costo 150p  día 6 días 16  tsandcs aplican responder hl 4 info',
 'urgente has ganado un 1 semana membresía gratis en nuestra   100000 jac

## 2 Tokenizacion

In [22]:
def tokenization(corpus):
    newCorpus = []
    for doc in corpus:
        newCorpus.append(nlp(doc))
    return newCorpus

corpus_2 = tokenization(corpus_1)

In [26]:
corpus_2[0] 

entrada libre en una imagen de obsequio 2 wkly para ganar la copa fa tkts finales 21o de mayo de 2005 el texto fa a 87121 para recibir la pregunta de entrada tasa txt std t y c se aplican 08452810075over18 de

## 3. Remover Stopwords

In [31]:
def removeStops(corpus):
    newCorpus = []
    for doc in corpus:
        newCorpus.append(' '.join([token.text for token in doc if not token.is_stop]))
    return newCorpus

corpus_3 = removeStops(corpus_2)

In [34]:
print(corpus_2[1])
print(corpus_3[1])

freemsg hola querida que ha sido 3 semanas de ahora y ninguna palabra de vuelta me gustaría un poco de diversión que compensa todavía tb bien xxx std chgs para enviar   150 a vn
freemsg hola querida 3 semanas palabra vuelta gustaría diversión compensa tb xxx std chgs enviar    150 vn


## 4. Stemming y Lemmatization

In [36]:
def stemming_lemmatization(corpus):
    docs = tokenization(corpus)
    newCorpus = []
    for doc in docs:
        newCorpus.append(' '.join([token.lemma_ for token in doc]))
    return newCorpus

corpus_4 = stemming_lemmatization(corpus_3)

In [39]:
corpus_4[1]

'freemsg holar querido 3 semana palabrir vuelta gustar diversión compensar tb xxx std chg enviar     150 vn'

In [41]:
def corpus2string(corpus):
    newCorpus = ' '.join(corpus)
    return newCorpus

corpus_5 = corpus2string(corpus_4)

In [43]:
corpus_5[:1000]

'entrada libre imagen obsequio 2 wkly ganar copa fa tkts final 21o mayo 2005 texto fa 87121 recibir pregunta entrado tasa txt std t c aplicar 08452810075over18 freemsg holar querido 3 semana palabrir vuelta gustar diversión compensar tb xxx std chg enviar     150 vn ganador cliente red valioso seleccionado receivea     900 recompensa premio reclamar llamado 09061701461 código reclamar kl341 válido 12 hora teléfonos 11 mes t r derecho actualizar móvil color cámaro forma gratuito móvil llame actualización gratuito co 08002986030 oportunidad ganar dinero efectivo 100 20000 libras txt csh11 enviar 87575 costo 150p    6 16    tsandcs aplicar responder hl 4 info urgente haber ganar 1 semana membresía gratis     100000 jackpot txt palabro pretensión 81010 t    c wwwdbuknet lccltd pobox 4403ldnw1a7rw18 xxxmobilemovieclub tarjetir crédito hacer clic enlace wap mensaje texto hacer clic    http    wap xxxmobilemovieclubcomnqjkgighjjgcbl inglaterra v macedonio    miss dont meta    equipo noticia t

In [46]:
corpus_6 = set(corpus_5.split(' '))
len(corpus_6)

5633

In [47]:
ncol = len(corpus_6)
nrow = len(corpus_4)

nrow, ncol

(2241, 5633)

In [49]:
def generateEmptyMatrix(cols, rows):
    outDf = pd.DataFrame(0, index=range(rows), columns=cols)
    return outDf

nrow = len(corpus_4)
tfMatrix = generateEmptyMatrix(list(corpus_6), nrow)
tfMatrix

Unnamed: 0,Unnamed: 1,portado,ll,brote,wwwtxt82228com,constantemente,4evo,sorprendido,salto,2006,invitación,87575,medicina,131004,4d,consiga,wwwgetzedcouk,laboratorio,wesley,desperté,copiar,pg,74355,conacted,mobno,er,mk45,cuestir,reto,4403ldnw1a7rw18,havnt,cud,encontrarno,ejercicio,864233,precipitar,barco,shinco,chikkuk,glorioso,...,730,07734396839,69200,conducir,suscripciones,gt,vegetarián,4txt,complete,gracias,fiebrir,neville,permanecer,cámar,iíd,7ish,09061790126,pude,hyde,portátil,guoyang,wanna,knock,lanzar,bajo,morefrmmob,applebees,2lands,010803,cps,westshore,4882,smg,rodger,comedia,amnow,08081560665,88877,úharry,rosa
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2237,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2239,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [50]:
def calculoTf(corpus, df):
    corpus = tokenization(corpus)
    for index, doc in enumerate(corpus):
        docSize = len(doc)
        for token in doc:
            try:
                colIndex = list(df.columns).index(token.text)
                df.iloc[index, colIndex] = df.iloc[index, colIndex] + 1
            except:
                pass
        df.iloc[index, :] = df.iloc[index, :] / docSize
    return df

tfMatrix1 = calculoTf(corpus_4, tfMatrix)

In [69]:
tfMatrix.to_csv('tfMatrix.csv')

## IDF

In [51]:
import numpy as np

In [52]:
def calculoIDF(df):
    N = df.shape[0]
    valX = (N/(df.astype(bool).sum(axis=0)))
    idfValues = pd.Series(np.log(valX))
    return idfValues

idfVector = calculoIDF(tfMatrix1)

In [58]:
tfIdMatrix = tfMatrix1.multiply(idfVector, axis=1)
tfIdMatrix.fillna(0, inplace=True)

tfIdMatrix[tfIdMatrix['saludo'] > 0]

Unnamed: 0,Unnamed: 1,portado,ll,brote,wwwtxt82228com,constantemente,4evo,sorprendido,salto,2006,invitación,87575,medicina,131004,4d,consiga,wwwgetzedcouk,laboratorio,wesley,desperté,copiar,pg,74355,conacted,mobno,er,mk45,cuestir,reto,4403ldnw1a7rw18,havnt,cud,encontrarno,ejercicio,864233,precipitar,barco,shinco,chikkuk,glorioso,...,730,07734396839,69200,conducir,suscripciones,gt,vegetarián,4txt,complete,gracias,fiebrir,neville,permanecer,cámar,iíd,7ish,09061790126,pude,hyde,portátil,guoyang,wanna,knock,lanzar,bajo,morefrmmob,applebees,2lands,010803,cps,westshore,4882,smg,rodger,comedia,amnow,08081560665,88877,úharry,rosa
468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Machine Learning

In [66]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    tfIdMatrix, dataset['etiqueta'], test_size=0.3, random_state=2022, shuffle=True, stratify=dataset['etiqueta'])

In [60]:
def encode(y):
    if(y == 'ham'):
        return 0
    return 1

In [64]:
y_train = list(map(encode, y_train))

### Random Forest

In [67]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=15, n_estimators=100, random_state=2022)
rfc.fit(X_train, y_train)

print(rfc.score(X_test, y_test))

0.8945022288261516


In [68]:
from sklearn.metrics import classification_report

print(classification_report(y_test, rfc.predict(X_test)))


              precision    recall  f1-score   support

         ham       0.86      1.00      0.93       449
        spam       1.00      0.68      0.81       224

    accuracy                           0.89       673
   macro avg       0.93      0.84      0.87       673
weighted avg       0.91      0.89      0.89       673

