In [None]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Download the NLTK stop-word corpus; stopwords.words('english') in
# process_text reads it.
nltk.download('stopwords')


In [None]:
# Load the raw messages and work on an independent copy of the first 1000 rows.
# The .copy() matters: head() alone hands back a slice of df, and adding a
# column to that slice later triggers pandas' SettingWithCopyWarning.
df = pd.read_csv("spam_text.csv")
df_sample = df.head(1000).copy()


In [None]:
# Built once: maps every ASCII punctuation character to "delete".
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled lazily on first use so the NLTK stop-word set and the stemmer are
# created once instead of once per message.
_TEXT_DEFAULTS = {}


def process_text(text, stop_words=None, stemmer=None):
    """Normalize one message into a space-separated string of stemmed tokens.

    Steps: lowercase, strip ASCII punctuation, split on whitespace, drop stop
    words, stem each remaining token.

    Parameters
    ----------
    text : str
        The raw message. None and float NaN (how pandas represents a missing
        cell) yield an empty string; any other non-string is converted with
        str() first.
    stop_words : container of str, optional
        Words to drop. Defaults to NLTK's English stop-word list.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to a shared ``PorterStemmer`` instance.

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' if nothing is left).
    """
    # Missing values would otherwise crash on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    if stop_words is None:
        if 'stop_words' not in _TEXT_DEFAULTS:
            _TEXT_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
        stop_words = _TEXT_DEFAULTS['stop_words']
    if stemmer is None:
        if 'stemmer' not in _TEXT_DEFAULTS:
            _TEXT_DEFAULTS['stemmer'] = PorterStemmer()
        stemmer = _TEXT_DEFAULTS['stemmer']

    # Lowercase, then remove punctuation in a single pass.
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Tokenize on whitespace, drop stop words, reduce the rest to their stems.
    tokens = [stemmer.stem(word) for word in text.split() if word not in stop_words]

    # Return tokens joined by spaces
    return ' '.join(tokens)

# Apply to the dataset. assign() builds a new frame with the extra column
# instead of writing into the existing one, so it never modifies a frame that
# may be a view of df, and re-running this cell gives the same result.
df_sample = df_sample.assign(processed_text=df_sample['Message'].apply(process_text))

In [None]:
# Vectorization: convert the cleaned text to a TF-IDF feature matrix.
# The matrix is kept under its own name as well as X, because X is rebound to
# the text column in the next cell and the features would otherwise be lost.
# NOTE(review): the vectorizer is fitted on every row before any train/test
# split; if X_tfidf is used for evaluation, fit on the training rows only.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df_sample['processed_text'])
X = X_tfidf

# Label encoding: convert the Category labels to numeric values, again kept
# under a dedicated name because y is rebound in the next cell.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df_sample['Category'])
y = y_encoded

# Save the processed data to a new CSV file
df_sample.to_csv("sample_spam_text.csv", index=False)

# Last expression of the cell: rendered as a rich table by the notebook.
df_sample.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): X and y are rebound here to the processed text and the original
# Category column, replacing the TF-IDF matrix and encoded labels computed in
# the previous cell — confirm that downstream cells expect raw text.
X, y = df_sample['processed_text'], df_sample['Category']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)