## SMS spam detector

### setup

In [17]:
# Install the third-party packages this notebook imports.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel;
# pip may ask for a kernel restart before freshly installed packages are usable.
%pip install pandas
%pip install scikit-learn
%pip install nltk

Note: you may need to restart the kernel to use updated packages.
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl (11.1 MB)
Using cached scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl (22.4 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.6.1 scipy-1.15.2 threadpoolctl-3.6.0
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### data preparation

In [12]:
import pandas as pd

# load datasets
# Both splits are read with the same latin-1 decoding; the generator is consumed
# left to right, so the train file is read before the test file.
train_df, test_df = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in ('./datasets/sms_train.csv', './datasets/sms_test.csv')
)

In [13]:
# map labels to integers

label_mapping = {
    'Non-Spam': 0,
    'Spam': 1
}


def prepare_sms_frame(df, mapping=label_mapping):
    """Return a two-column frame (`text`, `label`) ready for preprocessing.

    Steps:
      1. keep only the 'Message_body' and 'Label' columns,
      2. rename them to 'text' and 'label',
      3. replace each label string with its integer code from `mapping`.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing 'Message_body' and 'Label' columns.
    mapping : dict
        Label string -> integer code. Defaults to `label_mapping`.

    Returns
    -------
    pandas.DataFrame
        New frame; the input frame is not modified.

    Raises
    ------
    KeyError
        If 'Message_body' or 'Label' is missing from `df`.
    ValueError
        If any label is absent from `mapping`. `Series.map` would otherwise
        turn such labels into NaN without any warning.
    """
    prepared = (
        df[['Message_body', 'Label']]
        .rename(columns={'Message_body': 'text', 'Label': 'label'})
    )

    encoded = prepared['label'].map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(prepared.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(
            f"{int(unmapped.sum())} row(s) have labels missing from the mapping: {unknown}"
        )

    # assign() builds a new frame instead of writing into a slice of the input
    return prepared.assign(label=encoded)


# drop unnecessary columns, rename columns, and encode labels for both splits

train_df = prepare_sms_frame(train_df)
test_df = prepare_sms_frame(test_df)

### text preprocessing

In [37]:
import re
from functools import lru_cache

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by the preprocessing code below.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on NLTK >= 3.9
# (the same release line that introduced the 'averaged_perceptron_tagger_eng'
# name); 'punkt' is kept so older NLTK releases keep working.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger_eng',
    'wordnet',
):
    nltk.download(resource)

# preprocess text

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the WordNet POS constant used for lemmatization.

    Tags starting with 'J' map to adjective, 'V' to verb and 'R' to adverb.
    Every other tag (including those starting with 'N') maps to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # 'N...' tags and anything unrecognised are treated as nouns
    return wordnet.NOUN

@lru_cache(maxsize=1)
def _english_stop_words():
    """Load the NLTK English stop-word list once and return it as a frozenset."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Build the WordNetLemmatizer once and reuse it for every message."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Normalise one message into a space-joined string of lemmas.

    Pipeline: lowercase -> strip everything except ASCII letters and
    whitespace -> tokenize -> drop English stop words -> POS-tag ->
    lemmatize each token with its POS.

    Parameters
    ----------
    text : str
        Raw message. Any non-string value (e.g. None, or the float NaN that
        pandas uses for a missing cell) is treated as an empty message.

    Returns
    -------
    str
        Lemmatized tokens joined by single spaces; '' when nothing remains.
    """
    # missing cells arrive as NaN/None and have no .lower()
    if not isinstance(text, str):
        return ''

    # convert to lowercase, then remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # tokenization
    tokens = word_tokenize(text)

    # stop words filtering (the set is cached, not re-read for every message)
    stop_words = _english_stop_words()
    tokens = [word for word in tokens if word not in stop_words]

    # pos tagging
    pos_tags = nltk.pos_tag(tokens)

    # lemmatization
    lemmatizer = _shared_lemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(pos))
        for word, pos in pos_tags
    )

# Add a 'preprocessed' column to each split, train first and then test.
for frame in (train_df, test_df):
    frame['preprocessed'] = frame['text'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /Users/cyrus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/cyrus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/cyrus/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/cyrus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### splitting dataset

In [38]:
from sklearn.model_selection import train_test_split

# Pool the two provided splits into one frame with a fresh 0..n-1 index,
# then carve out a new 80/20 split that keeps the label ratio in both parts.
merge_df = pd.concat([train_df, test_df], ignore_index=True)

split_options = dict(
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
    stratify=merge_df['label'],
)
X_train, X_test, y_train, y_test = train_test_split(
    merge_df['preprocessed'], merge_df['label'], **split_options
)

### vectorization

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF features capped at 5000 terms. The vectorizer is fitted on the
# training messages only; the test messages are transformed with that fit.
vectorizer = TfidfVectorizer(max_features=5000)

X_train_vectorized, X_test_vectorized = (
    vectorizer.fit_transform(X_train),  # evaluated first: fits, then transforms
    vectorizer.transform(X_test),
)
