# NLP (Natural Language Processing)

In [35]:
import nltk

# Fetch only the NLTK resources this notebook downloads elsewhere
# (tokenizer models, stop-word lists, WordNet). A bare nltk.download()
# opens NLTK's interactive downloader, which waits for user input and
# therefore blocks a non-interactive "Restart & Run All".
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — add it to this tuple if word_tokenize raises LookupError.
for recurso in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(recurso, quiet=True)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [36]:
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available locally, then keep the Spanish
# list as a set so membership checks do not scan a list.
nltk.download('stopwords')
STOPWORDS = {palabra for palabra in stopwords.words('spanish')}

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/usuario/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Tokenización

* Ejemplo sin tokenizar: La casa es azul y es una casa
* Ejemplo tokenizado: ['La', 'casa', 'es', 'azul', 'y', 'es', 'una', 'casa']

In [38]:
from nltk.tokenize import word_tokenize

# Download the 'punkt' resource before any tokenization happens.
nltk.download('punkt')

# Sample sentence used by the tokenization cell that follows.
text = "El perro corre rápido y es un perro"

[nltk_data] Downloading package punkt to /home/usuario/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [39]:

# Split the sample sentence into word tokens. word_tokenize defaults to
# language='english'; the sentence is Spanish, so select the Spanish
# Punkt model explicitly.
tokens = word_tokenize(text, language='spanish')


Normalización

In [42]:
import re

texto = "El PERRO corre RÁPIDO!!!"

# Normalize in two steps: lowercase the sentence, then delete every character
# that is neither a word character nor whitespace (here, the "!!!").
patron_puntuacion = re.compile(r'[^\w\s]')
texto_en_minusculas = texto.lower()
texto_normalizado = patron_puntuacion.sub('', texto_en_minusculas)
print(texto_normalizado)

el perro corre rápido


Eliminar Stop Words

In [43]:
from nltk.corpus import stopwords
nltk.download('stopwords')

texto = "El perro corre rápido"

# Spanish stop words as a set for fast membership tests.
stop_words = set(stopwords.words('spanish'))

# Lowercase before tokenizing so tokens can be compared against the
# stop-word set, and use the Spanish Punkt model: word_tokenize defaults to
# language='english', which does not match this text.
tokens = word_tokenize(texto.lower(), language='spanish')

# Keep only the tokens that are not stop words.
tokens_filtrados = [word for word in tokens if word not in stop_words]
print(tokens_filtrados)

['perro', 'corre', 'rápido']


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/usuario/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Lematización

In [44]:
# Download the small Spanish spaCy pipeline (es_core_news_sm).
# NOTE(review): `!python` runs whichever interpreter is first on PATH —
# confirm it is the same environment as the notebook kernel.
!python -m spacy download es_core_news_sm

Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


In [45]:
import spacy

# Load the small Spanish pipeline and run it over the sample sentence.
nlp = spacy.load('es_core_news_sm')
texto = "El perro está corriendo rápido"
doc = nlp(texto)

# Read the lemma spaCy assigned to each token of the parsed document.
tokens_lemantizados = [tok.lemma_ for tok in doc]
print(tokens_lemantizados)

['el', 'perro', 'estar', 'correr', 'rápido']


Stemming

In [46]:
from nltk.stem import SnowballStemmer

# Snowball stemmer configured for Spanish.
stemmer = SnowballStemmer('spanish')
texto = "El perro está corriendo rápido"

# Lowercase, then tokenize with the Spanish Punkt model (word_tokenize
# defaults to language='english', which does not match this text).
tokens = word_tokenize(texto.lower(), language='spanish')

# Apply stemming: reduce each token to its stem.
tokens_stemmed = [stemmer.stem(token) for token in tokens]
print(tokens_stemmed)

['el', 'perr', 'esta', 'corr', 'rap']


## Representación / Encoding

Bolsa de Palabras

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

# Two toy documents that differ in a single word.
textos = ["el perro corre rápido", "el gato corre rápido"]

# Learn the vocabulary and count each term per document.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(textos)

# Show the learned vocabulary, then the dense document-term matrix.
vocabulario = vectorizer.get_feature_names_out()
print(vocabulario)
matriz_conteos = X.toarray()
print(matriz_conteos)

['corre' 'el' 'gato' 'perro' 'rápido']
[[1 1 0 1 1]
 [1 1 1 0 1]]


TF-IDF (Term Frequency-Inverse Document Frequency)

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same two toy documents as in the bag-of-words example.
textos = ["el perro corre rápido", "el gato corre rápido"]

# Learn the vocabulary and compute TF-IDF weights per document.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(textos)

# Show the learned vocabulary, then the dense TF-IDF matrix.
vocabulario_tfidf = vectorizer.get_feature_names_out()
print(vocabulario_tfidf)
matriz_tfidf = X.toarray()
print(matriz_tfidf)

['corre' 'el' 'gato' 'perro' 'rápido']
[[0.44832087 0.44832087 0.         0.63009934 0.44832087]
 [0.44832087 0.44832087 0.63009934 0.         0.44832087]]


# The Dataset

In [49]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score


# Read the SMS corpus (decoded as latin-1) and discard the three unnamed
# columns in the same step.
data = pd.read_csv('spam.csv', encoding='latin-1').drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
)

# Give the two remaining columns descriptive names.
data.columns = ['label', 'sms_message']

data

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [50]:
def limpiar_texto(text: str) -> str:
    """Return ``text`` lowercased with punctuation removed.

    Every character that is neither a word character (``\\w``, which includes
    digits and the underscore) nor whitespace is deleted; nothing is inserted
    in its place, so words joined only by punctuation are fused together.

    Args:
        text: The raw message to clean.

    Returns:
        The cleaned, lowercase string.
    """
    en_minusculas = text.lower()
    return re.sub(r'[^\w\s]', '', en_minusculas)

# Clean every raw message and store the result in a new column.
mensajes_limpios = data['sms_message'].apply(limpiar_texto)
data['message_clean'] = mensajes_limpios
data

Unnamed: 0,label,sms_message,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?,will ì_ b going to esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity was in mood for that soany other suggest...
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...


In [51]:
# Tokenization: turn each cleaned message into a list of word tokens.
listas_de_tokens = data['message_clean'].apply(word_tokenize)
data['tokens'] = listas_de_tokens
data

Unnamed: 0,label,sms_message,message_clean,tokens
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,"[this, is, the, 2nd, time, we, have, tried, 2,..."
5568,ham,Will Ì_ b going to esplanade fr home?,will ì_ b going to esplanade fr home,"[will, ì_, b, going, to, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...",pity was in mood for that soany other suggest...,"[pity, was, in, mood, for, that, soany, other,..."
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...,"[the, guy, did, some, bitching, but, i, acted,..."


In [56]:
# Remove English stop words from every token list (overwrites 'tokens').
# NOTE(review): punctuation was stripped during cleaning, so contractions
# arrive here without their apostrophe (e.g. "dont" is still present in the
# recorded output) — confirm whether those should be filtered as well.
stop_words = set(stopwords.words('english'))
data['tokens'] = data['tokens'].apply(
    lambda lista: [tok for tok in lista if tok not in stop_words]
)
data

Unnamed: 0,label,sms_message,message_clean,tokens,tokens_lemmatized
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, go, usf, life, around, though]"
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,"[2nd, time, tried, 2, contact, u, u, å750, pou...","[2nd, time, tried, 2, contact, u, u, å750, pou..."
5568,ham,Will Ì_ b going to esplanade fr home?,will ì_ b going to esplanade fr home,"[ì_, b, going, esplanade, fr, home]","[ì_, b, going, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...",pity was in mood for that soany other suggest...,"[pity, mood, soany, suggestions]","[pity, mood, soany, suggestion]"
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...,"[guy, bitching, acted, like, id, interested, b...","[guy, bitching, acted, like, id, interested, b..."


In [57]:
import nltk
# Download the WordNet corpus; being the cell's last expression, the call's
# return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/usuario/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [59]:
# Lematización
lemmatizer = WordNetLemmatizer()
data['tokens_lemmatized'] = data['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
data

Unnamed: 0,label,sms_message,message_clean,tokens,tokens_lemmatized
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, go, usf, life, around, though]"
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,"[2nd, time, tried, 2, contact, u, u, å750, pou...","[2nd, time, tried, 2, contact, u, u, å750, pou..."
5568,ham,Will Ì_ b going to esplanade fr home?,will ì_ b going to esplanade fr home,"[ì_, b, going, esplanade, fr, home]","[ì_, b, going, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...",pity was in mood for that soany other suggest...,"[pity, mood, soany, suggestions]","[pity, mood, soany, suggestion]"
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...,"[guy, bitching, acted, like, id, interested, b...","[guy, bitching, acted, like, id, interested, b..."


In [60]:
# Join each list of lemmatized tokens back into one space-separated string.
data['processed_message'] = data['tokens_lemmatized'].apply(' '.join)
data

Unnamed: 0,label,sms_message,message_clean,tokens,tokens_lemmatized,processed_message
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin...",free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, go, usf, life, around, though]",nah dont think go usf life around though
...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,"[2nd, time, tried, 2, contact, u, u, å750, pou...","[2nd, time, tried, 2, contact, u, u, å750, pou...",2nd time tried 2 contact u u å750 pound prize ...
5568,ham,Will Ì_ b going to esplanade fr home?,will ì_ b going to esplanade fr home,"[ì_, b, going, esplanade, fr, home]","[ì_, b, going, esplanade, fr, home]",ì_ b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity was in mood for that soany other suggest...,"[pity, mood, soany, suggestions]","[pity, mood, soany, suggestion]",pity mood soany suggestion
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...,"[guy, bitching, acted, like, id, interested, b...","[guy, bitching, acted, like, id, interested, b...",guy bitching acted like id interested buying s...


In [61]:
# Inspect the processed data (the whole frame, intermediate columns included).
# To look only at the label and the final text instead:
#print(data[['label', 'processed_message']])
data

Unnamed: 0,label,sms_message,message_clean,tokens,tokens_lemmatized,processed_message
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin...",free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, go, usf, life, around, though]",nah dont think go usf life around though
...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,"[2nd, time, tried, 2, contact, u, u, å750, pou...","[2nd, time, tried, 2, contact, u, u, å750, pou...",2nd time tried 2 contact u u å750 pound prize ...
5568,ham,Will Ì_ b going to esplanade fr home?,will ì_ b going to esplanade fr home,"[ì_, b, going, esplanade, fr, home]","[ì_, b, going, esplanade, fr, home]",ì_ b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity was in mood for that soany other suggest...,"[pity, mood, soany, suggestions]","[pity, mood, soany, suggestion]",pity mood soany suggestion
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...,"[guy, bitching, acted, like, id, interested, b...","[guy, bitching, acted, like, id, interested, b...",guy bitching acted like id interested buying s...


# Modelo de clasificación de spam

In [66]:
# Bag of words: build the vectorizer, then fit it on the processed messages
# and encode them in the same step.
vectorizador = CountVectorizer()
mensajes_procesados = data['processed_message']
X = vectorizador.fit_transform(mensajes_procesados)


In [70]:
from sklearn.model_selection import train_test_split

# Split features and labels into train and test sets (20% held out, seed 42).
# NOTE(review): X was produced by fitting the vectorizer on the full corpus
# before this split, so its vocabulary also covers the test messages —
# confirm whether that is acceptable for this evaluation.
etiquetas = data['label']
x_train, x_test, y_train, y_test = train_test_split(
    X, etiquetas, test_size=0.2, random_state=42
)

# Create and train the model.
modelo = MultinomialNB()
modelo.fit(x_train, y_train)

# Predict the held-out messages.
y_pred = modelo.predict(x_test)

# Evaluate: per-class report followed by overall accuracy.
informe = classification_report(y_test, y_pred)
print(informe)
exactitud = accuracy_score(y_test, y_pred)
print("Accuracy:", exactitud)

              precision    recall  f1-score   support

         ham       0.99      0.98      0.98       965
        spam       0.90      0.91      0.90       150

    accuracy                           0.97      1115
   macro avg       0.94      0.95      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Accuracy: 0.9739910313901345


In [74]:
#cross-validation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB

# 5-fold cross-validation of a fresh Multinomial Naive Bayes model on the
# current feature matrix X and the message labels.
cv_mnb = cross_val_score(
    estimator=MultinomialNB(),
    X=X,
    y=data['label'],
    cv=5,
)
print("Cross-validation scores: ", cv_mnb)

Cross-validation scores:  [0.97488789 0.97757848 0.97486535 0.97307002 0.97127469]


In [75]:
#TF-IDF
vectorizador2 = TfidfVectorizer(stop_words='english')
X = vectorizador2.fit_transform(data['processed_message'])

In [76]:
# Hold out 20% of X and the labels for evaluation, with a fixed seed (42).
# NOTE(review): X is whatever was assigned last; with in-order execution that
# is the TF-IDF matrix from the previous cell.
x_entrenamiento, x_prueba, y_entrenamiento, y_prueba = train_test_split(X, data['label'], test_size=0.2, random_state=42)

In [77]:
# Second Naive Bayes model, trained on the split from the previous cell.
# The fit call stays last so the cell displays the fitted estimator.
modelo2 = MultinomialNB()
modelo2.fit(X=x_entrenamiento, y=y_entrenamiento)

In [78]:
# Predict labels for the held-out messages with the second model.
y_prediccion = modelo2.predict(x_prueba)

In [79]:
# Per-class precision/recall/F1 for the second model, then overall accuracy.
informe2 = classification_report(y_prueba, y_prediccion)
print(informe2)
exactitud2 = accuracy_score(y_prueba, y_prediccion)
print("accuracy_score: ", exactitud2)

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.73      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

accuracy_score:  0.9632286995515695


In [82]:
#Regresion logistica
vectorizador3 = CountVectorizer(stop_words='english', max_features=1000)

x = vectorizador3.fit_transform(data['processed_message'])


In [91]:
# Hold out 20% of the feature matrix `x` and the labels, with a fixed seed (42).
x_train3, x_test3, y_train3, y_test3 = train_test_split(x, data['label'], test_size=0.2, random_state=42)

In [92]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Train one seeded logistic-regression classifier on the training split.
# Previously a seeded estimator (`logreg`) was created but never fitted,
# while the model actually trained was a second instance without the seed.
# Both names now refer to the same estimator, so `logreg` stays defined.
modelo3 = LogisticRegression(random_state=16)
logreg = modelo3

# The fit call stays last so the cell still displays the fitted estimator.
modelo3.fit(x_train3, y_train3)



In [93]:
# Predict the held-out messages; the accuracy is the last expression, so the
# cell displays it.
y_pred3 = modelo3.predict(x_test3)
accuracy_score(y_true=y_test3, y_pred=y_pred3)

0.9757847533632287

In [94]:
# Per-class precision/recall/F1 for the logistic-regression model, then
# overall accuracy.
informe3 = classification_report(y_test3, y_pred3)
print(informe3)
exactitud3 = accuracy_score(y_test3, y_pred3)
print("accuracy_score: ", exactitud3)

              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       0.99      0.83      0.90       150

    accuracy                           0.98      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.98      0.98      0.97      1115

accuracy_score:  0.9757847533632287
