# URL spam detection: TF-IDF features + SVM classifier

In [6]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# Pull the labelled URL dataset straight from the tutorial repository.
# The preview below shows two columns: the raw `url` and the boolean `is_spam`.
url = 'https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv'
data = pd.read_csv(url)

# Sanity-check the load by printing the first five rows
print(data.head(n=5))


                                                 url  is_spam
0  https://briefingday.us8.list-manage.com/unsubs...     True
1                             https://www.hvper.com/     True
2                 https://briefingday.com/m/v4n3i4f3     True
3   https://briefingday.com/n/20200618/m#commentform    False
4                        https://briefingday.com/fan     True


In [5]:
# Check the column names of the dataset
# NOTE(review): the recorded output of this cell lists 'processed_url', a column
# that is only created by a later cell, and its execution count (In [5]) is lower
# than the cell before it (In [6]). The output is therefore stale, presumably from
# an out-of-order run on a kernel where the column already existed. On a fresh
# "Restart & Run All" this should print only 'url' and 'is_spam'.
print(data.columns)


Index(['url', 'is_spam', 'processed_url'], dtype='object')


In [7]:
# Function to preprocess URLs
def preprocess_url(url):
    """Turn a raw URL into a space-separated string of word tokens.

    The ``http://`` / ``https://`` scheme and any ``www.`` label are removed,
    then every run of non-word characters is collapsed into a single space so
    that a word-level tokenizer can split the result.

    Parameters
    ----------
    url : str
        Raw URL. Any non-string value (e.g. ``None`` or the float ``NaN`` that
        pandas uses for missing cells) is treated as an empty URL.

    Returns
    -------
    str
        The token string. It may start or end with a single space when the
        URL started or ended with punctuation (e.g. a trailing ``/``).
    """
    if not isinstance(url, str):
        # re.sub raises TypeError on non-strings; treat a missing value as an
        # empty document instead of aborting the whole .apply().
        return ''
    # Case-insensitive so an upper-case scheme ("HTTP://") is stripped as well.
    url = re.sub(r'https?://', '', url, flags=re.IGNORECASE)
    # \b restricts the match to a real "www." label; without it a host such as
    # "awww.example.com" would be corrupted into "aexample.com".
    url = re.sub(r'\bwww\.', '', url, flags=re.IGNORECASE)
    # Collapse separators (dots, slashes, query punctuation, ...) into spaces.
    url = re.sub(r'\W+', ' ', url)
    return url

# Derive the cleaned-text feature column from the raw URLs
data['processed_url'] = data['url'].apply(preprocess_url)

# Features are the cleaned URL strings; the label is the `is_spam` flag
X, y = data['processed_url'], data['is_spam']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [8]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Two-stage text classifier: TF-IDF features feeding an SVM, both with default
# settings. The step names ('tfidf', 'svm') are what `<step>__<param>` keys refer to.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('svm', SVC()),
    ]
)

# Fit on the training split and score the held-out URLs (fit() returns the pipeline)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

       False       0.96      0.98      0.97       455
        True       0.92      0.88      0.90       145

    accuracy                           0.95       600
   macro avg       0.94      0.93      0.93       600
weighted avg       0.95      0.95      0.95       600



In [9]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters, addressed as <pipeline step>__<parameter>.
# 2 n-gram ranges x 3 values of C x 2 kernels = 12 combinations.
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf'],
}

# Exhaustive search over the grid, 5-fold cross-validated and ranked by accuracy
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

# Run the search on the training split only; the test split stays untouched
grid_search.fit(X_train, y_train)

# Winning combination and its mean cross-validated accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Score the held-out URLs with the best configuration found by the search
y_pred_optimized = grid_search.predict(X_test)
print(classification_report(y_test, y_pred_optimized))


Best Parameters: {'svm__C': 1, 'svm__kernel': 'rbf', 'tfidf__ngram_range': (1, 1)}
Best Score: 0.9404001391788448
              precision    recall  f1-score   support

       False       0.96      0.98      0.97       455
        True       0.92      0.88      0.90       145

    accuracy                           0.95       600
   macro avg       0.94      0.93      0.93       600
weighted avg       0.95      0.95      0.95       600



In [10]:
import joblib

# Persist the best pipeline found by the grid search.
# The pipeline was fit on the 'processed_url' column, so anything passed to the
# reloaded model must first go through preprocess_url; that function is not
# part of the saved object.
joblib.dump(value=grid_search.best_estimator_, filename='spam_detector_model.pkl')


['spam_detector_model.pkl']