## sms spam detector

### setup

In [None]:
# install the third-party packages imported by the cells below
# (pandas, scikit-learn for the `sklearn` imports, and nltk)
%pip install pandas
%pip install scikit-learn
%pip install nltk

### data preparation

In [None]:
import pandas as pd

# read both CSV files with the same latin-1 decoding;
# the first path becomes train_df, the second test_df

train_df, test_df = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in ('./datasets/sms_train.csv', './datasets/sms_test.csv')
)

In [None]:
# integer codes for the two message classes

label_mapping = {
    'Non-Spam': 0,
    'Spam': 1
}


# keep only the message body and its label, under shorter column names

train_df = train_df[['Message_body', 'Label']].rename(
    columns={'Message_body': 'text', 'Label': 'label'}
)
test_df = test_df[['Message_body', 'Label']].rename(
    columns={'Message_body': 'text', 'Label': 'label'}
)


# replace the string labels with their integer codes
# (a label missing from label_mapping becomes NaN under Series.map)

train_df['label'] = train_df['label'].map(label_mapping)
test_df['label'] = test_df['label'].map(label_mapping)

### text preprocessing

In [None]:
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data used by the preprocessing below:
#   punkt / punkt_tab               -> word_tokenize
#   stopwords                       -> stopwords.words('english')
#   averaged_perceptron_tagger_eng  -> nltk.pos_tag
#   wordnet                         -> WordNetLemmatizer
# NLTK releases that ship the '*_eng' tagger resource load the tokenizer
# model from 'punkt_tab' rather than 'punkt', so both are requested; the
# legacy 'punkt' download is kept for older installations.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger_eng',
    'wordnet',
):
    nltk.download(nltk_resource)

# preprocess text

def get_wordnet_pos(treebank_tag):
    """Map a POS tag to the WordNet part-of-speech constant for lemmatization.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``R`` -> adverb.  Every other tag, including the ``N``
    tags, falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # 'N...' tags and anything unrecognised are treated as nouns
    return wordnet.NOUN

# Built once when this cell runs (after the nltk downloads above) instead of
# on every call: preprocess_text is applied row by row, and reloading the
# stop-word list / rebuilding the lemmatizer per message is wasted work.
_NON_LETTERS = re.compile(r'[^a-zA-Z\s]')
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise one message into a space-joined string of lemmas.

    Steps, in order: lowercase, strip every character that is not a letter
    or whitespace, tokenize, drop English stop words, POS-tag the remaining
    tokens and lemmatize each one with the WordNet POS derived from its tag.

    Args:
        text: the raw message. Anything that is not a ``str`` (for example
            the NaN pandas produces for an empty CSV cell) is treated as an
            empty message instead of raising on ``.lower()``.

    Returns:
        The lemmatized tokens joined by single spaces; ``''`` when nothing
        survives the filtering or the input is not a string.
    """
    if not isinstance(text, str):
        return ''

    # lowercase, then remove special characters and numbers
    text = _NON_LETTERS.sub('', text.lower())

    # tokenization + stop words filtering
    tokens = [word for word in word_tokenize(text) if word not in _STOP_WORDS]

    # pos tagging (needed so the lemmatizer knows verb vs noun etc.)
    pos_tags = nltk.pos_tag(tokens)

    # lemmatization
    return ' '.join(
        _LEMMATIZER.lemmatize(word, pos=get_wordnet_pos(pos))
        for word, pos in pos_tags
    )

# add the cleaned-up text as a new 'preprocessed' column on both frames
for frame in (train_df, test_df):
    frame['preprocessed'] = frame['text'].apply(preprocess_text)

### splitting dataset

In [None]:
from sklearn.model_selection import train_test_split

# pool the two source files into one frame with a fresh 0..n-1 index
merge_df = pd.concat((train_df, test_df), ignore_index=True)

# re-split 80/20 with a fixed seed, keeping the label ratio the same in
# both parts (stratify)
all_texts = merge_df['preprocessed']
all_labels = merge_df['label']

X_train, X_test, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    stratify=all_labels,
    test_size=0.2,
    random_state=42,
)

### vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF features, vocabulary capped at 5000 terms
vectorizer = TfidfVectorizer(max_features=5000)

# the vocabulary is learned from the training texts only; the test texts
# are merely projected onto it (left-to-right evaluation fits first)
X_train_vectorized, X_test_vectorized = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [None]:
# show feature-matrix and label shapes for each split
for split_label, split_features, split_targets in (
    ("train:", X_train_vectorized, y_train),
    ("test:", X_test_vectorized, y_test),
):
    print(split_label, split_features.shape, split_targets.shape)

### model training and evaluation

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# results of every evaluated model, keyed by model name
performances = {}


def train_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):
    """Fit ``model``, score it on the test data, then record and print the result.

    The fitted model, its accuracy, the classification report and the
    confusion matrix are stored in the module-level ``performances`` dict
    under ``model_name`` and echoed to stdout. Nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    summary = classification_report(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)

    performances[model_name] = dict(
        model=model,
        accuracy=accuracy,
        report=summary,
        confusion_matrix=matrix,
    )

    # one print per line of the report, followed by a separator rule
    output_lines = (
        f"Model: {model_name}",
        f"Accuracy: {accuracy:.4f}",
        "Classification Report:",
        summary,
        "Confusion Matrix:",
        matrix,
        "\n" + "="*50 + "\n",
    )
    for output_line in output_lines:
        print(output_line)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Baseline candidates, all trained on the same TF-IDF features.
# The estimators that use randomness are seeded with the same value as the
# train/test split (42) so the comparison printed below is reproducible
# from run to run.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Linear SVM': LinearSVC(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# fit, evaluate and record each candidate in `performances`
for model_name, model in models.items():
    train_evaluate_model(model_name, model, X_train_vectorized, y_train, X_test_vectorized, y_test)

# best: linear svm, random forest

### hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# for linear svm: 5-fold cross-validated accuracy over the regularisation
# strength C and the iteration cap
param_grid_svc = {
    'C': [0.1, 1, 10, 100],
    'max_iter': [1000, 2000, 3000]
}

grid_search_svc = GridSearchCV(
    # seeded like the train/test split so the selected (and later pickled)
    # estimator is the same on every run
    LinearSVC(random_state=42),
    param_grid_svc,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1  # use all available cores
)

grid_search_svc.fit(X_train_vectorized, y_train)

# for random forest: 5-fold cross-validated accuracy over forest size,
# tree depth and the minimum samples needed to split a node
param_grid_rfc = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search_rfc = GridSearchCV(
    # an unseeded forest gives different CV scores (and possibly different
    # best params) on every run; seed it like the train/test split
    RandomForestClassifier(random_state=42),
    param_grid_rfc,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1  # use all available cores
)

grid_search_rfc.fit(X_train_vectorized, y_train)

In [None]:
# report the winning parameters and CV score of each search,
# with a separator rule between the two sections
for section_index, (section_title, search) in enumerate((
    ("linear svm", grid_search_svc),
    ("random forest", grid_search_rfc),
)):
    if section_index:
        print("\n" + "="*50 + "\n")
    print(section_title)
    print("best params:", search.best_params_)
    print("best score:", search.best_score_)

# best model: linear svm

In [None]:
# Final check of the tuned linear SVM on the held-out test split.
# best_estimator_ is the estimator the grid search exposes for its winning
# parameter combination.
best_model = grid_search_svc.best_estimator_
y_pred = best_model.predict(X_test_vectorized)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# label spelling fixed (was printed as "acuuracy:")
print("accuracy:", accuracy)
print("classification report:")
print(report)
print("confusion matrix:")
print(cm)

### model persistence

In [None]:
import pickle
import os

# make sure the output directory exists (no error if it already does)
os.makedirs('model', exist_ok=True)

# pickle the tuned classifier and the fitted vectorizer side by side;
# both are needed to score new messages later
for target_path, artifact in (
    ('model/best_model.pkl', best_model),
    ('model/vectorizer.pkl', vectorizer),
):
    with open(target_path, 'wb') as f:
        pickle.dump(artifact, f)