In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
dataset = pd.read_csv("/content/spam_ham_dataset.csv")

# Data Preprocessing: Focus on text and labels
data = dataset[['text', 'label_num']]
data['text'] = data['text'].str.lower()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label_num'], test_size=0.2, random_state=42)

# Vectorization: TF-IDF with n-grams (bigrams and trigrams)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 3))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Hyperparameter tuning for Logistic Regression
param_grid_lr = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}
grid_search_lr = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train_tfidf, y_train)

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=3, scoring='accuracy')
grid_search_rf.fit(X_train_tfidf, y_train)

# Evaluate both tuned models on the test set
best_lr = grid_search_lr.best_estimator_
best_rf = grid_search_rf.best_estimator_

# Predictions
y_pred_lr = best_lr.predict(X_test_tfidf)
y_pred_rf = best_rf.predict(X_test_tfidf)

# Calculate accuracy
accuracy_lr = accuracy_score(y_test, y_pred_lr)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Print the best models' accuracy and classification reports
print(f'Logistic Regression Accuracy: {accuracy_lr}')
print(classification_report(y_test, y_pred_lr, target_names=['ham', 'spam']))

print(f'Random Forest Accuracy: {accuracy_rf}')
print(classification_report(y_test, y_pred_rf, target_names=['ham', 'spam']))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].str.lower()


Logistic Regression Accuracy: 0.9893719806763285
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       742
        spam       0.98      0.98      0.98       293

    accuracy                           0.99      1035
   macro avg       0.99      0.99      0.99      1035
weighted avg       0.99      0.99      0.99      1035

Random Forest Accuracy: 0.9777777777777777
              precision    recall  f1-score   support

         ham       0.99      0.98      0.98       742
        spam       0.94      0.98      0.96       293

    accuracy                           0.98      1035
   macro avg       0.97      0.98      0.97      1035
weighted avg       0.98      0.98      0.98      1035



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle


In [3]:
# Load the dataset
dataset = pd.read_csv('/content/spam_ham_dataset.csv')

# Focus on text and labels
data = dataset[['text', 'label_num']]

# Text Preprocessing: Convert text to lowercase
data['text'] = data['text'].str.lower()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label_num'], test_size=0.2, random_state=42)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].str.lower()


In [4]:
# Vectorization: TF-IDF with n-grams (bigrams and trigrams)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 3))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [5]:
# Hyperparameter tuning for Logistic Regression
param_grid_lr = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}
grid_search_lr = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train_tfidf, y_train)

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=3, scoring='accuracy')
grid_search_rf.fit(X_train_tfidf, y_train)

# Get the best models from grid search
best_lr = grid_search_lr.best_estimator_
best_rf = grid_search_rf.best_estimator_


In [6]:
# Predictions
y_pred_lr = best_lr.predict(X_test_tfidf)
y_pred_rf = best_rf.predict(X_test_tfidf)

# Calculate accuracy
accuracy_lr = accuracy_score(y_test, y_pred_lr)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Print the results
print(f'Logistic Regression Accuracy: {accuracy_lr}')
print(classification_report(y_test, y_pred_lr, target_names=['ham', 'spam']))

print(f'Random Forest Accuracy: {accuracy_rf}')
print(classification_report(y_test, y_pred_rf, target_names=['ham', 'spam']))


Logistic Regression Accuracy: 0.9893719806763285
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       742
        spam       0.98      0.98      0.98       293

    accuracy                           0.99      1035
   macro avg       0.99      0.99      0.99      1035
weighted avg       0.99      0.99      0.99      1035

Random Forest Accuracy: 0.9777777777777777
              precision    recall  f1-score   support

         ham       0.99      0.98      0.98       742
        spam       0.94      0.98      0.96       293

    accuracy                           0.98      1035
   macro avg       0.97      0.98      0.97      1035
weighted avg       0.98      0.98      0.98      1035



In [7]:
# Save the best model
with open('spam_classifier_model.pkl', 'wb') as model_file:
    pickle.dump(best_lr, model_file)  # or best_rf if Random Forest performed better

# Save the vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)
