# **Text Preprocessing with spam data:**

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import string, re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.regexp import WordPunctTokenizer, WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk

# Fetch the NLTK data packages used by this notebook. Each call is cheap when
# the package is already cached locally (see the "already up-to-date" log).
# NOTE(review): `pos_tag` is imported above but no POS-tagger model is
# downloaded here -- confirm whether tagging is used later in the notebook.
# NOTE(review): newer NLTK releases reportedly need 'punkt_tab' for
# word_tokenize -- verify against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [43]:
# Read only the two columns used below: v1 (label) and v2 (message text).
# The file is decoded as ISO-8859-1 rather than pandas' default UTF-8.
data = pd.read_csv(
    'spam.csv',
    usecols=['v1', 'v2'],
    encoding="ISO-8859-1",
)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [44]:
# Class balance of the dependent variable (column v1): number of rows per label.

data['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

# **1. Convert to Lower Case:**

In [45]:
def convert_lower(data, col):
  """Lower-case the text in ``data[col]``, modifying ``data`` in place.

  A new column named ``col + '_lower'`` receives the lower-cased text and the
  original ``col`` column is then removed. Nothing is returned.

  Args:
    data: pandas DataFrame to modify in place.
    col: name of the text column to lower-case.

  Raises:
    KeyError: if ``col`` is not a column of ``data`` (for example when the
      function is run a second time on the same frame, after ``col`` has
      already been dropped).
  """
  # The vectorised .str accessor propagates missing values instead of raising
  # the AttributeError that a per-element x.lower() hits on NaN/None.
  data[col + '_lower'] = data[col].str.lower()
  # drop(..., inplace=True) returns None, so its result is deliberately not
  # assigned to anything.
  data.drop(columns=[col], inplace=True)

# `data` is modified in place: column v2 is replaced by v2_lower. Re-running
# this cell on the same frame raises KeyError because 'v2' no longer exists.
convert_lower(data, 'v2')
data.head()

Unnamed: 0,v1,v2_lower
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


# **2. Remove Punctuation:**

In [46]:
def remove_punctuation(data, col):
  """Strip ASCII punctuation from ``data[col]``, modifying ``data`` in place.

  Every character in ``string.punctuation`` is deleted (not replaced by a
  space), so "don't" becomes "dont" and "a,b" becomes "ab". The result is
  stored in the column ``'clean_msg'`` and the source column is removed.
  Nothing is returned.

  Args:
    data: pandas DataFrame to modify in place.
    col: name of the text column to clean.

  Raises:
    KeyError: if ``col`` is not a column of ``data``.
  """
  translator = str.maketrans('', '', string.punctuation)
  # .str.translate leaves missing values untouched instead of raising on them.
  data['clean_msg'] = data[col].str.translate(translator)
  # When the source column is itself 'clean_msg' it has just been overwritten
  # with the result; dropping it would throw the result away.
  if col != 'clean_msg':
    # drop(..., inplace=True) returns None, so the result is not assigned.
    data.drop(columns=[col], inplace=True)

# `data` is modified in place: v2_lower is replaced by clean_msg. Re-running
# this cell on the same frame raises KeyError because 'v2_lower' is gone.
remove_punctuation(data, 'v2_lower')
data.head()

Unnamed: 0,v1,clean_msg
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


# **3. Tokenization:**

In [47]:
def convert_tokens(data, col):
  """Tokenize the text in ``data[col]``, modifying ``data`` in place.

  Each value of ``col`` is passed to ``word_tokenize`` and the returned token
  list is stored in the column ``'tokens'``. The source column is then
  removed. Nothing is returned.

  Args:
    data: pandas DataFrame to modify in place.
    col: name of the text column to tokenize.

  Raises:
    KeyError: if ``col`` is not a column of ``data``.
  """
  # word_tokenize is passed directly; a lambda wrapper adds nothing.
  data['tokens'] = data[col].apply(word_tokenize)
  # When the source column is itself 'tokens' it now holds the result, so it
  # must be kept.
  if col != 'tokens':
    # drop(..., inplace=True) returns None, so the result is not assigned.
    data.drop(columns=[col], inplace=True)

# `data` is modified in place: clean_msg is replaced by tokens. Re-running
# this cell on the same frame raises KeyError because 'clean_msg' is gone.
convert_tokens(data, 'clean_msg')
data.head()

Unnamed: 0,v1,tokens
0,ham,"[go, until, jurong, point, crazy, available, o..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"[nah, i, dont, think, he, goes, to, usf, he, l..."


# **4. Remove Stop Words:**

In [48]:
def remove_stopwords(data, col):
  """Filter English stop words out of the token lists in ``data[col]``.

  ``data`` is modified in place: the filtered token lists are stored in the
  column ``'clean_msg'`` and the source column is removed. Nothing is
  returned. Matching is an exact, case-sensitive membership test against
  ``stopwords.words('english')``.

  Args:
    data: pandas DataFrame to modify in place.
    col: name of the column holding one list of tokens per row.

  Raises:
    KeyError: if ``col`` is not a column of ``data``.
  """
  # A set gives constant-time membership checks inside the comprehension.
  stop_words = set(stopwords.words('english'))
  data['clean_msg'] = data[col].apply(
      lambda tokens: [tok for tok in tokens if tok not in stop_words])
  # When the source column is itself 'clean_msg' it now holds the filtered
  # result; dropping it would discard that result.
  if col != 'clean_msg':
    data.drop(columns=[col], inplace=True)

# `data` is modified in place: tokens is replaced by clean_msg. Re-running
# this cell on the same frame raises KeyError because 'tokens' is gone.
remove_stopwords(data, 'tokens')
data.head()

Unnamed: 0,v1,clean_msg
0,ham,"[go, jurong, point, crazy, available, bugis, n..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"[nah, dont, think, goes, usf, lives, around, t..."


# **5. Apply Stemming:**

In [49]:
def Stemming(data, col):
  """Stem every token in ``data[col]`` using a ``PorterStemmer``.

  ``data`` is modified in place: the stemmed token lists are stored in the
  column ``'steam_data'`` (spelling kept as-is because the lemmatization cell
  in this notebook refers to that name) and the source column is removed.
  Nothing is returned.

  Args:
    data: pandas DataFrame to modify in place.
    col: name of the column holding one list of tokens per row.

  Raises:
    KeyError: if ``col`` is not a column of ``data``.
  """
  porter = PorterStemmer()
  data['steam_data'] = data[col].apply(
      lambda tokens: [porter.stem(tok) for tok in tokens])
  # When the source column is itself 'steam_data' it now holds the result, so
  # it must be kept.
  if col != 'steam_data':
    # drop(..., inplace=True) returns None, so the result is not assigned.
    data.drop(columns=[col], inplace=True)

# `data` is modified in place: clean_msg is replaced by steam_data. Re-running
# this cell on the same frame raises KeyError because 'clean_msg' is gone.
Stemming(data, 'clean_msg')
data.head()

Unnamed: 0,v1,steam_data
0,ham,"[go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,"[ok, lar, joke, wif, u, oni]"
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,"[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"[nah, dont, think, goe, usf, live, around, tho..."


# **6. Apply Lemmatization:**

In [50]:
def lemmatization(data, col):
  """Lemmatize every token in ``data[col]`` with a ``WordNetLemmatizer``.

  Each token is lemmatized with ``pos='v'``, i.e. every token is treated as a
  verb regardless of its actual part of speech. ``data`` is modified in
  place: the result is stored in the column ``'lemmatize_data'`` and the
  source column is removed. Nothing is returned.

  Args:
    data: pandas DataFrame to modify in place.
    col: name of the column holding one list of tokens per row.

  Raises:
    KeyError: if ``col`` is not a column of ``data``.
  """
  lemma = WordNetLemmatizer()
  data['lemmatize_data'] = data[col].apply(
      lambda tokens: [lemma.lemmatize(tok, pos='v') for tok in tokens])
  # When the source column is itself 'lemmatize_data' it now holds the
  # result; dropping it would discard that result.
  if col != 'lemmatize_data':
    data.drop(columns=[col], inplace=True)

# `data` is modified in place: steam_data is replaced by lemmatize_data.
# Re-running this cell on the same frame raises KeyError ('steam_data' is gone).
# NOTE(review): the input here is the already-stemmed column, and the rows
# displayed below are identical to the stemming output -- confirm whether the
# lemmatizer was meant to run on the un-stemmed tokens instead.
lemmatization(data, 'steam_data')
data.head()

Unnamed: 0,v1,lemmatize_data
0,ham,"[go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,"[ok, lar, joke, wif, u, oni]"
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,"[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"[nah, dont, think, goe, usf, live, around, tho..."
