# Sistema de detección de enlaces spam

## Paso 1: Carga del conjunto de datos desde internet

In [None]:
import pandas as pd

# Download the URL/spam-label dataset directly from its raw GitHub location
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first rows, then list the column names (one print, same output)
print(df.head(), df.columns, sep="\n")

                                                 url  is_spam
0  https://briefingday.us8.list-manage.com/unsubs...     True
1                             https://www.hvper.com/     True
2                 https://briefingday.com/m/v4n3i4f3     True
3   https://briefingday.com/n/20200618/m#commentform    False
4                        https://briefingday.com/fan     True
Index(['url', 'is_spam'], dtype='object')


## Paso 2: Preprocesa los enlaces

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Fetch every NLTK package this notebook relies on, in the same order as before
for recurso in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(recurso)

# Shared WordNet lemmatizer and English stopword set
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}


# Crea una función para segmentar las URLs en partes según sus signos de puntuación, elimina las stopwords, lematiza, etcétera.

def procesar_url(url):
    """Split a URL into lowercase, stopword-free, lemmatized tokens.

    Parameters
    ----------
    url : str
        Raw URL text.

    Returns
    -------
    list of str
        The surviving tokens in their original order. The list is empty
        when every part is either empty or an English stopword.

    Notes
    -----
    Relies on the module-level ``stop_words`` set and ``lemmatizer``
    instance created just above this function, so they are built once
    instead of once per URL when the function is used with ``.apply``.
    """
    # 1. Lowercase, then split on runs of non-word characters (., /, -, :, ...).
    #    "_" is a word character for \W, so underscores do not split tokens.
    partes = re.split(r'\W+', url.lower())

    # 2. Drop the empty strings re.split yields at the edges, plus stopwords.
    partes_filtradas = [p for p in partes if p and p not in stop_words]

    # 3. Lemmatize each remaining token with the shared lemmatizer.
    return [lemmatizer.lemmatize(p) for p in partes_filtradas]

# Tokenize every URL with procesar_url (the function defined above) and join
# the tokens with spaces: procesar_url returns a list, while the TF-IDF
# vectorizer used later expects one text string per document.
df['processed'] = df['url'].apply(lambda enlace: ' '.join(procesar_url(enlace)))

# Features (X) and labels (y)
X = df['processed']
y = df['is_spam']

# Hold out 20% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Paso 3: Construye un SVM

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Text-classification pipeline: TF-IDF features feeding an SVC with defaults
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC()),
])

# Fit on the training split, then predict the held-out split
predictions = model.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split
print(classification_report(y_true=y_test, y_pred=predictions))


              precision    recall  f1-score   support

       False       0.96      0.97      0.97       455
        True       0.91      0.88      0.89       145

    accuracy                           0.95       600
   macro avg       0.93      0.92      0.93       600
weighted avg       0.95      0.95      0.95       600



## Paso 4: Optimiza el modelo anterior

In [14]:
from sklearn.model_selection import GridSearchCV

# Search space; the "svm__" prefix routes each value to the pipeline's 'svm' step
params = dict(
    svm__C=[0.1, 1, 10],
    svm__kernel=['linear', 'rbf'],
)

# Grid search over the pipeline with 5-fold cross-validation
grid = GridSearchCV(estimator=model, param_grid=params, cv=5)
grid.fit(X_train, y_train)

# Keep the best pipeline found by the search
best_model = grid.best_estimator_
print("Mejores parámetros:", grid.best_params_)

# Score the tuned model on the held-out split
predictions = grid.predict(X_test)
print(classification_report(y_test, predictions))


Mejores parámetros: {'svm__C': 1, 'svm__kernel': 'rbf'}
              precision    recall  f1-score   support

       False       0.96      0.97      0.97       455
        True       0.91      0.88      0.89       145

    accuracy                           0.95       600
   macro avg       0.93      0.92      0.93       600
weighted avg       0.95      0.95      0.95       600



## Paso 5: Guarda el modelo

In [15]:
import joblib

# Persist the tuned pipeline to disk with joblib
joblib.dump(value=best_model, filename='modelo_spam.pkl')
print("Modelo guardado como modelo_spam.pkl")


Modelo guardado como modelo_spam.pkl
