In [None]:
import pandas as pd
import re
import spacy

# Load spaCy English model
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

# 1️⃣ Load Dataset
# The cleaning cell further down reads the raw text from the 'Message' column,
# so the CSV must provide a column with exactly that name.
DATA_PATH = "/content/spam.csv"
df = pd.read_csv(DATA_PATH, encoding="latin-1")










In [None]:
# Preview the first rows of the freshly loaded frame (head() shows 5 rows by default).
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# 2️⃣ Define Text Cleaning Function
def clean_text(text):
    """Normalize a raw message into lowercase, single-space-separated words.

    Steps, in order: lowercase, strip URLs, strip HTML-like tags, drop digit
    runs, replace punctuation (everything except word characters, whitespace
    and apostrophes), then collapse whitespace.

    URLs, tags and punctuation are replaced by a space rather than deleted so
    that neighbours written without spaces ("ok...then", "<b>hi</b>there")
    stay separate words instead of fusing into one token.

    Args:
        text: The raw message. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``""`` for a missing value.
    """
    # Missing cells would otherwise be stringified to "none"/"nan" and leak
    # into the corpus as if they were real words. `text != text` is the
    # NaN self-inequality check.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # `http\S+` already covers "https...", so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", " ", text)              # remove URLs
    text = re.sub(r"<.*?>", " ", text)                       # remove HTML tags
    text = re.sub(r"\d+", "", text)                          # remove numbers
    text = re.sub(r"[^\w\s']", " ", text)                    # punctuation -> space (apostrophes kept)
    return re.sub(r"\s+", " ", text).strip()                 # collapse extra spaces

In [None]:
# 3️⃣ Apply cleaning
# Run clean_text on every raw message and keep the result beside the original.
df['cleaned'] = df['Message'].map(clean_text)


In [None]:
# Preview again to compare the new 'cleaned' column with the raw 'Message' text.
df.head()

Unnamed: 0,Category,Message,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don't think he goes to usf he lives arou...


In [None]:
# 4️⃣ Process each cleaned text with spaCy
def spacy_preprocess(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and collect three token views.

    Args:
        text: The string to process.

    Returns:
        A dict with three lists:
        ``"tokens"``     - the text of every token in the doc;
        ``"filtered"``   - the text of tokens that are alphabetic and not stop words;
        ``"lemmatized"`` - the lemmas of those same filtered tokens.
    """
    parsed = nlp(text)

    all_tokens = []
    kept_texts = []
    kept_lemmas = []
    for tok in parsed:
        all_tokens.append(tok.text)
        # Stop words and non-alphabetic tokens only appear in the full token list.
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_texts.append(tok.text)
        kept_lemmas.append(tok.lemma_)

    return {
        "tokens": all_tokens,
        "filtered": kept_texts,
        "lemmatized": kept_lemmas,
    }

In [None]:
# Apply spaCy preprocessing
# Each entry of 'processed' is the dict returned by spacy_preprocess for that row.
df['processed'] = df['cleaned'].map(spacy_preprocess)

# 5️⃣ Split the processed dict into separate columns
for field in ('tokens', 'filtered', 'lemmatized'):
    # `key=field` binds the current name so each lambda reads its own dict entry.
    df[field] = df['processed'].map(lambda record, key=field: record[key])

# Drop intermediate column
df = df.drop(columns='processed')

# 6️⃣ Show sample output
df.head(3)

Unnamed: 0,Category,Message,cleaned,tokens,filtered,lemmatized
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...","[jurong, point, crazy, available, bugis, n, gr...","[jurong, point, crazy, available, bugis, n, gr..."
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...,"[free, entry, in, a, wkly, comp, to, win, fa, ...","[free, entry, wkly, comp, win, fa, cup, final,...","[free, entry, wkly, comp, win, fa, cup, final,..."
