<a href="https://colab.research.google.com/github/gzsomm/Lenguaje-natural/blob/main/Lenguaje_natural.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Cargar el dataset desde URL
url = 'https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv'
df = pd.read_csv(url)

# Ver los primeros registros
print(df.head())


                                                 url  is_spam
0  https://briefingday.us8.list-manage.com/unsubs...     True
1                             https://www.hvper.com/     True
2                 https://briefingday.com/m/v4n3i4f3     True
3   https://briefingday.com/n/20200618/m#commentform    False
4                        https://briefingday.com/fan     True


In [4]:
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Instanciar lematizador y stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Función para limpiar y procesar URLs
def preprocess_url(url):
    # Separar por signos y eliminar caracteres no alfabéticos
    tokens = re.split(r'\W+', url.lower())
    # Lematizar y filtrar stopwords
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token and token not in stop_words]
    return ' '.join(tokens)

# Aplicar el procesamiento
df['processed_url'] = df['url'].apply(preprocess_url)

# Dividir en X (entradas) y y (etiquetas)
X = df['processed_url']
y = df['is_spam']  # 0 = no spam, 1 = spam

# Dividir en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convertir a vectores TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Crear y entrenar modelo SVM
svm = SVC()
svm.fit(X_train_tfidf, y_train)

# Predicción y evaluación
y_pred = svm.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       False       0.96      0.97      0.97       455
        True       0.91      0.88      0.89       145

    accuracy                           0.95       600
   macro avg       0.93      0.92      0.93       600
weighted avg       0.95      0.95      0.95       600



In [6]:
from sklearn.model_selection import GridSearchCV

# Definir grilla de hiperparámetros
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Grid search con validación cruzada
grid = GridSearchCV(SVC(), param_grid, cv=5, verbose=1, n_jobs=-1)
grid.fit(X_train_tfidf, y_train)

# Mejor modelo
best_model = grid.best_estimator_
print("Mejores hiperparámetros:", grid.best_params_)

# Evaluar mejor modelo
y_pred_opt = best_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_opt))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Mejores hiperparámetros: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
              precision    recall  f1-score   support

       False       0.98      0.95      0.97       455
        True       0.87      0.93      0.90       145

    accuracy                           0.95       600
   macro avg       0.92      0.94      0.93       600
weighted avg       0.95      0.95      0.95       600



In [7]:
import joblib
import os

# Crear carpeta de modelos si no existe
os.makedirs("models", exist_ok=True)

# Guardar modelo y vectorizador
joblib.dump(best_model, "models/svm_spam_detector.pkl")
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")


['models/tfidf_vectorizer.pkl']