In [11]:
import pandas as pd
import string
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adriendavidson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adriendavidson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adriendavidson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Convert label to numeric

In [4]:
df = df = pd.read_csv("/Users/adriendavidson/Documents/Davidson/sms-spam-nlp/data/spam.csv", usecols = [0, 1], encoding = 'latin-1')  
df['label'] = df['v1'].map({'ham': 0,
                           'spam': 1})

### Clean the text

In [8]:
def clean_text(text):
    text = text.lower()
    text = ''.join([c for c in text if c not in string.punctuation])
    text = ''.join([c for c in text if not c.isdigit()])
    text = text.strip()
    return text

df['clean_text'] = df['v2'].apply(clean_text)
df[['clean_text']].head()

Unnamed: 0,clean_text
0,go until jurong point crazy available only in ...
1,ok lar joking wif u oni
2,free entry in a wkly comp to win fa cup final...
3,u dun say so early hor u c already then say
4,nah i dont think he goes to usf he lives aroun...


### Tokenization

In [13]:
df['tokens'] = df['clean_text'].apply(word_tokenize)
df[['tokens']].head()

Unnamed: 0,tokens
0,"[go, until, jurong, point, crazy, available, o..."
1,"[ok, lar, joking, wif, u, oni]"
2,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,"[u, dun, say, so, early, hor, u, c, already, t..."
4,"[nah, i, dont, think, he, goes, to, usf, he, l..."


### Remove stopwords

In [16]:
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].apply(lambda tokens: [w for w in tokens if w not in stop_words])
df[['tokens']].head()

Unnamed: 0,tokens
0,"[go, jurong, point, crazy, available, bugis, n..."
1,"[ok, lar, joking, wif, u, oni]"
2,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,"[u, dun, say, early, hor, u, c, already, say]"
4,"[nah, dont, think, goes, usf, lives, around, t..."


### Lemmatization

In [19]:
lemmatizer = WordNetLemmatizer()

df['tokens'] = df['tokens'].apply(lambda tokens: [lemmatizer.lemmatize(w) for w in tokens])
df[['tokens']].head()

Unnamed: 0,tokens
0,"[go, jurong, point, crazy, available, bugis, n..."
1,"[ok, lar, joking, wif, u, oni]"
2,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,"[u, dun, say, early, hor, u, c, already, say]"
4,"[nah, dont, think, go, usf, life, around, though]"


### Rejoin tokens for modeling

In [22]:
df['processed_text'] = df['tokens'].apply(lambda tokens: ' '.join(tokens))
df[['processed_text']].head()

Unnamed: 0,processed_text
0,go jurong point crazy available bugis n great ...
1,ok lar joking wif u oni
2,free entry wkly comp win fa cup final tkts st ...
3,u dun say early hor u c already say
4,nah dont think go usf life around though


### Save preprocessed data

In [25]:
df[['label', 'processed_text']].to_csv('/Users/adriendavidson/Documents/Davidson/sms-spam-nlp/data/sms_preprocessed.csv', index = False)