In [2]:
# Download dataset
# https://drive.google.com/file/d/1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R/view?usp=sharing
!gdown --id 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: /content/2cls_spam_text_cls.csv
100% 486k/486k [00:00<00:00, 95.5MB/s]


In [8]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [9]:
DATASET_PATH = '/content/2cls_spam_text_cls.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
messages = df['Message'].values.tolist()
labels = df['Category'].values.tolist()

# Preprocessing text
- Lowercase
- Punctation removal
- Tokenize
- Remove stopwords: filter out common words that don't carry significant meaning
- Stemming: reduce words to their root forms, grouping similar words together

In [12]:
def lowercase(text):
  return text.lower()

def remove_punctuation(text):
  return text.translate(str.maketrans('', '', string.punctuation))

def tokenize(text):
  return nltk.word_tokenize(text)

def remove_stopwords(tokens):
  stop_words = nltk.corpus.stopwords.words('english')
  return [word for word in tokens if word not in stop_words]

def stemming(tokens):
  stemmer = nltk.stem.PorterStemmer()
  return [stemmer.stem(word) for word in tokens]

def preprocess_text(text):
  text = lowercase(text)
  text = remove_punctuation(text)
  tokens = tokenize(text)
  tokens = remove_stopwords(tokens)
  tokens = stemming(tokens)
  return tokens

messages = [preprocess_text(message) for message in messages]

In [14]:
# Build a dictionary of tokens in messages
def create_dictionary(messages):
    dictionary = []
    for message in messages:
        for word in message:
            if word not in dictionary:
                dictionary.append(word)
    return dictionary

dictionary = create_dictionary(messages)

In [18]:
# Create features for token: token frequency in message
def create_features(tokens, dictionary):
  features = np.zeros(len(dictionary))
  for token in tokens:
    if token in dictionary:
      features[dictionary.index(token)] += 1
  return features

X = np.array([create_features(message, dictionary) for message in messages])

In [17]:
# Preprocess label
le = LabelEncoder()
y = le.fit_transform(labels)
print(f'Classes: {le.classes_}')
print(f'Encoded labels: {y}')

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


In [19]:
# Split train/validation/test dataset percentage: 7/2/1
VAL_SIZE = 0.2
TEST_SIZE = 0.125
SEED = 0

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_SIZE, random_state=SEED)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=TEST_SIZE, random_state=SEED)

In [20]:
model = GaussianNB()
print('Start training')
model = model.fit(X_train, y_train)
print('Training done')

Start training
Training done


In [21]:
# Evaluate model
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)
print(f'Validation accuracy: {accuracy_score(y_val, y_val_pred)}')
print(f'Test accuracy: {accuracy_score(y_test, y_test_pred)}')

Validation accuracy: 0.8816143497757848
Test accuracy: 0.8602150537634409


In [22]:
def predict(text, model, dictionary):
  processed_text = preprocess_text(text)
  features = create_features(processed_text, dictionary)
  features = features.reshape(1, -1)
  prediction = model.predict(features)
  prediction_cls = le.inverse_transform(prediction)[0]
  return prediction_cls

test_input = "I am actually doing something useful"
prediction = predict(test_input, model, dictionary)
print(f'Prediction: {prediction}')

Prediction: ham
