In [10]:
#Here we will go through a step by step process for creating a SMS Spam ML model
#First we need to be able to read the data
import pandas as pd

data = pd.read_csv('SMSSpamCollection', sep = '\t', header=None, names = ['label', 'message'])
#this dataset is a tab-seperated file with no header, so we specify the seperator and provide column names
print(data.head()) #Gives the first 5 rows of the dataset
print("\nLabel counts:")
print(data['label'].value_counts()) #Counts the number of spam and ham messages

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Label counts:
label
ham     4825
spam     747
Name: count, dtype: int64


In [11]:
#Lets now preprocess the data i.e we need to clean and prepare the text data for modeling
import string
from nltk.corpus import stopwords #stopwords are common words that do not add much meaning to the text. Example: "the", "is", "in", etc.
from nltk.stem import PorterStemmer #Porter Stemmer is a common stemming algorithm that reduces words to their root form. Example: running -> run

import nltk
nltk.download('stopwords')

def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation]) #Remove punctuation. why do we add '' at the start?
    #Because we want to join the characters back together into a string.
    words = text.split() #Split the text into words

    #Remove stopwords and stem the words
    stop_words = set(stopwords.words('english')) #Get the set of stopwords in English
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words if word not in stop_words] #Stem the words and remove stop words

    return ' '.join(words) #Join the words back together into a string
#So let's say something like "I am running" will become "run" after preprocessing
#Now we apply preprocessing 
data['message'] = data['message'].apply(preprocess_text)
print(data.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saket\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  label                                            message
0   ham  go jurong point crazi avail bugi n great world...
1   ham                              ok lar joke wif u oni
2  spam  free entri 2 wkli comp win fa cup final tkt 21...
3   ham                u dun say earli hor u c alreadi say
4   ham          nah dont think goe usf live around though


In [None]:
#Now we need to convert the extracted text data into numerical features that can be used by the ML model
from sklearn.feature_extraction.text import TfidfVectorizer #TF-IDF stands for Term Frequency-Inverse Document Frequency
#It is a numerical representation of text data that reflects the importance of a word in a document relative to a collection of documents.

tfidf = TfidfVectorizer(max_features=5000) #We will uses the top 5000 words in the dataset
X = tfidf.fit_transform(data['processed_messasge']).toarray() #This will convert the text data into a sparse matrix of TF-IDF features 
y = data['label'].map({'ham': 0, 'spam': 1}) #Coverts labels to 0/1

KeyError: 'processed_messasge'