## Importing Required Libraries and Dataset

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import string
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', 100)

In [2]:
df = pd.read_csv("spam.csv", encoding = 'latin-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives around here though",,,


## Data Preprocessing

In [4]:
# Dropping the unwanted columns 

df = df.drop(labels = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)

In [5]:
# Renaming columns
df.columns = ['label', 'text']

# Encoding the labels column using where function. Spam = 1 and Ham = 0
df['label'] = np.where(df['label'] == 'spam', 1, 0)

In [6]:
stopwords = nltk.corpus.stopwords.words('english')

In [7]:
# Defining a function to clean the messages column

def clean_text(text):
    text = "".join([word for word in text if word not in string.punctuation]) # Removing Punctuations
    tokens = re.split('\W+', text) # Splitting String
    text = [word for word in tokens if word not in stopwords] # Removing Stopwords
    return text

In [8]:
# Applying the transformations and adding it as a new column in the dataframe

df['clean_text'] = df['text'].apply(lambda k: clean_text(k))

In [9]:
df.head()

Unnamed: 0,label,text,clean_text
0,0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[Go, jurong, point, crazy, Available, bugis, n, great, world, la, e, buffet, Cine, got, amore, wat]"
1,0,Ok lar... Joking wif u oni...,"[Ok, lar, Joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv..."
3,0,U dun say so early hor... U c already then say...,"[U, dun, say, early, hor, U, c, already, say]"
4,0,"Nah I don't think he goes to usf, he lives around here though","[Nah, I, dont, think, goes, usf, lives, around, though]"


## Splitting Data into Train and Test

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size=0.20)

In [11]:
X_train

4319                 [Hey, mr, I, going, sea, view, couple, gays, I, mean, games, Give, bell, ya, finish, ]
1324    [Can, call, plz, Your, number, shows, coveragd, area, I, urgnt, call, vasai, amp, reach, 4o, clo...
1487                                                                             [I, told, number, gautham]
334     [Valentines, Day, Special, Win, å, 1000, quiz, take, partner, trip, lifetime, Send, GO, 83600, 1...
2841                                                                                     [aathiwhere, dear]
                                                       ...                                                 
2008                                                                      [See, forwarding, message, proof]
2472    [Final, Chance, Claim, ur, å, 150, worth, discount, vouchers, today, Text, YES, 85023, SavaMob, ...
3053                                                                             [What, happened, yo, date]
2279                        

In [12]:
y_train

4319    0
1324    0
1487    0
334     1
2841    0
       ..
2008    0
2472    1
3053    0
2279    0
5363    0
Name: label, Length: 4457, dtype: int32

### Exporting the train and test files for further Processing and Building the Model

In [13]:
X_train.to_csv('X_train.csv', index = False, header=True)
X_test.to_csv('X_test.csv', index = False, header=True)
y_train.to_csv('y_train.csv', index = False, header=True)
y_test.to_csv('y_test.csv', index = False, header=True)