In [6]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
# from textblob import TextBlob
import re


In [7]:
# Tunable inputs for the sampling step.
DATA_PATH = "spam_text.csv"
SAMPLES_PER_CATEGORY = 500
RANDOM_SEED = 42

df = pd.read_csv(DATA_PATH)

# Draw a class-balanced sample. The per-class size is capped at the rarest
# class because groupby(...).sample(n=...) raises ValueError when a group has
# fewer rows than requested; when every class has at least
# SAMPLES_PER_CATEGORY rows the result is the same as sampling that many.
category_sizes = df["Category"].value_counts()
n_per_category = (
    min(SAMPLES_PER_CATEGORY, int(category_sizes.min()))
    if len(category_sizes) > 0
    else SAMPLES_PER_CATEGORY
)
df_sample = df.groupby("Category").sample(n=n_per_category, random_state=RANDOM_SEED)

In [8]:
# Words counted as a spam signal; compared against the lemmatized tokens.
_SUSPICIOUS_WORDS = frozenset(['free', 'win', 'urgent', 'click', 'buy', 'money', 'offer'])

# Lazily-filled cache of NLTK objects so they are built once, not once per message.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the shared tokenizer, English stop-word set and lemmatizer, building them on first use."""
    if not _NLP_RESOURCES:
        # Build everything first so a failed NLTK lookup never leaves a half-filled cache.
        resources = {
            'tokenizer': TreebankWordTokenizer(),
            'stop_words': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
        }
        _NLP_RESOURCES.update(resources)
    return _NLP_RESOURCES


def process_text(text):
    """Extract simple numeric features from one message.

    The message is lower-cased and tokenized with the Treebank tokenizer;
    tokens that are not purely alphanumeric or that are English stop words are
    dropped, and the remaining tokens are lemmatized with WordNet.

    Parameters
    ----------
    text : str or missing value
        The raw message. ``None``/``NaN``/``pd.NA`` are treated as an empty
        message instead of raising; any other non-string is converted with
        ``str()``.

    Returns
    -------
    dict
        Keys, in this order:
        ``text_length`` (characters in the lower-cased message),
        ``num_words`` (tokens left after filtering),
        ``avg_word_length`` (mean token length, ``0`` when there are no tokens),
        ``num_suspicious_words`` (tokens found in the suspicious-word list),
        ``num_exclamations`` (``'!'`` characters in the message).
    """
    # Missing cells come back from read_csv as NaN, which has no .lower().
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    # Normalize text
    text = text.lower()

    tokens = []
    if text:  # an empty message has no tokens, so skip the NLTK machinery
        nlp = _get_nlp_resources()
        stop_words = nlp['stop_words']
        lemmatize = nlp['lemmatizer'].lemmatize
        # Tokenize, drop punctuation-bearing tokens and stop words, then lemmatize.
        tokens = [
            lemmatize(word)
            for word in nlp['tokenizer'].tokenize(text)
            if word.isalnum() and word not in stop_words
        ]

    feature = {}
    feature['text_length'] = len(text)
    feature['num_words'] = len(tokens)
    feature['avg_word_length'] = sum(len(word) for word in tokens) / len(tokens) if len(tokens) > 0 else 0

    # Suspicious words count
    feature['num_suspicious_words'] = sum(1 for word in tokens if word in _SUSPICIOUS_WORDS)

    # Exclamation marks are counted on the full text because tokenization strips them.
    feature['num_exclamations'] = text.count('!')

    return feature

In [9]:
# Turn each message into a feature dict, then expand the dicts into columns
# on a fresh 0..n-1 index.
features_df = pd.json_normalize(
    df_sample['Message'].apply(process_text)
).reset_index(drop=True)

# Swap the raw text for the extracted features: drop the message column,
# give the sample the same sequential index, and join the two side by side.
df_sample = pd.concat(
    [
        df_sample.drop(columns=['Message']).reset_index(drop=True),
        features_df,
    ],
    axis=1,
)

# Persist the feature table without the index column.
df_sample.to_csv("sample_spam_text.csv", index=False)

# Preview the first rows of the result.
df_sample.head()

Unnamed: 0,Category,text_length,num_words,avg_word_length,num_suspicious_words,num_exclamations
0,ham,84,11,3.363636,0,0
1,ham,103,7,4.142857,0,0
2,ham,32,4,4.0,0,0
3,ham,84,11,3.818182,0,0
4,ham,110,20,3.3,0,0
