In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## 1. Upload data - Spam classification dataset

In [2]:
messages = pd.read_csv("C:\\Users\\Chiara\\Desktop\\LEARN\\datasets\\SMSSpamCollection.txt", sep="\t", names=["label", "message"])

In [3]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
messages["message"].loc[100]

"Please don't text me anymore. I have nothing else to say."

## 2. Pre-processing and cleaning

I apply:

* Tokenization
* Stopwords
* Lemmatization/Stemming

In [5]:
import re
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chiara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()  # for stemming

In [7]:
corpus = []

for i in range(0, len(messages)):

    # Tozenization, elimination of punctuation and lowering
    review = re.sub("[^a-zA-Z0-9]", " ", messages["message"][i])
    review = review.lower()
    review = review.split()

    # Apply stopwords and stemming
    review = [ps.stem(word) for word in review if not word in stopwords.words("english")]
    review = " ".join(review)
    corpus.append(review)

In [8]:
corpus[3]

'u dun say earli hor u c alreadi say'

## 3. First approach: BoW and modeling zith Multinomial Naive Bayes

In [9]:
# BoW model

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(2,2))
X = cv.fit_transform(corpus).toarray()

In [10]:
np.shape(X)

(5572, 2500)

In [11]:
# Labels as dummy variables

y = pd.get_dummies(messages["label"])
y = (y.iloc[:,1].values).astype(int)

In [12]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [13]:
# Train and test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [14]:
print(X_train.shape)
print(y_train.shape)

(4457, 2500)
(4457,)


In [15]:
print(X_test.shape)
print(y_test.shape)

(1115, 2500)
(1115,)


In [16]:
# Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [17]:
# Prediction and performance

from sklearn.metrics import accuracy_score, classification_report

y_pred = spam_detect_model.predict(X_test)

score = accuracy_score(y_test, y_pred)
print("Accuracy: ", score)

print("Classification report:")
print(classification_report(y_test, y_pred))

Accuracy:  0.9713004484304932
Classification report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       955
           1       1.00      0.80      0.89       160

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [19]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[955,   0],
       [ 32, 128]])

ham = 0, spam = 1; so 32 spams have been considered as ham.

## 4. Second approach: TFIDF and Multinomial Naive Bayes

In [20]:
# TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(max_features=2500, ngram_range=(1,2))
X = tv.fit_transform(corpus).toarray()

In [21]:
# Train the model with Multinomial NB

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [22]:
# Performances

y_pred = spam_detect_model.predict(X_test)

score = accuracy_score(y_test, y_pred)
print("Accuracy: ", score)

print("Classification report:")
print(classification_report(y_test, y_pred))

print("Confusion matrix:")
confusion_matrix(y_test, y_pred)

Accuracy:  0.9811659192825112
Classification report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.87      0.93       160

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Confusion matrix:


array([[955,   0],
       [ 21, 139]])

Slightly improved!

## 5. Third approach: TFIDF and Random forest

TFIDF already performed in the previous cells.

In [23]:
# Random forest

from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [24]:
# Prediction and performance

y_pred = classifier.predict(X_test)

score = accuracy_score(y_test, y_pred)
print("Accuracy: ", score)

print("Classification report:")
print(classification_report(y_test, y_pred))

print("Confusion matrix:")
confusion_matrix(y_test, y_pred)

Accuracy:  0.9838565022421525
Classification report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.89      0.94       160

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.97      1115
weighted avg       0.98      0.98      0.98      1115

Confusion matrix:


array([[955,   0],
       [ 18, 142]])

Slightly better than the multinomial NB.

## 6. Fourth approach: text pre-processing, Average Word2Vec from scratch and Logistic regression

In [25]:
# Tokenizarion, stopwords and lemmatization

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

corpus = []
for i in range(0, len(messages)):

    # Tozenization, elimination of punctuation and lowering
    review = re.sub("[^a-zA-Z0-9]", " ", messages["message"][i])
    review = review.lower()
    review = review.split()

    # Stopwords and lemmatization
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words("english")]
    review = " ".join(review)
    corpus.append(review)

In [26]:
corpus[3]

'u dun say early hor u c already say'

In [27]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess # Convert qa document into a list of lower case tokens

words = [simple_preprocess(text) for text in corpus]

In [28]:
words[3]

['dun', 'say', 'early', 'hor', 'already', 'say']

In [29]:
# Word2vec from scratch

import gensim

model = gensim.models.Word2Vec(words, window=5, min_count=2)  # Size 100 by default

In [30]:
# Vocabulary

#model.wv.index_to_key

In [31]:
# Vocabylary and epochs

print("Vocabulary size: ", model.corpus_count)
print("Epochs: ", model.epochs)

Vocabulary size:  5572
Epochs:  5


In [32]:
model.wv.similar_by_word("happy")

[('day', 0.9994533061981201),
 ('make', 0.9994187951087952),
 ('amp', 0.9994057416915894),
 ('one', 0.9994034171104431),
 ('dont', 0.9994032979011536),
 ('wish', 0.9994021654129028),
 ('new', 0.999395489692688),
 ('said', 0.9993944764137268),
 ('dear', 0.9993823766708374),
 ('love', 0.9993805885314941)]

In [33]:
model.wv["kid"].shape

(100,)

In [34]:
# Average Word2Vec

def avg_word2vec(doc):

    vectors = [model.wv[word] for word in doc if word in model.wv]

    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [35]:
X = []

for i in tqdm(range(len(words))): # for each sentence
    X.append(avg_word2vec(words[i]))

100%|███████████████████████████████████████████████████████████████████████████| 5572/5572 [00:00<00:00, 55876.72it/s]


In [36]:
print(type(X))

<class 'list'>


In [37]:
X = np.vstack(X)
X[0]

array([-0.12471646,  0.20983069,  0.11651173,  0.05851023,  0.10515637,
       -0.41073895,  0.04500198,  0.44296262, -0.14316042, -0.10563594,
       -0.20258243, -0.36524951, -0.02799961,  0.11688239,  0.041252  ,
       -0.21605468, -0.01283775, -0.42153823, -0.04947832, -0.48779514,
        0.11298674,  0.1156339 ,  0.06432674, -0.13314167, -0.10801759,
        0.01067744, -0.24940294, -0.23420013, -0.2164588 ,  0.02139685,
        0.32857099,  0.05859922,  0.10283095, -0.14979954, -0.09267747,
        0.24537762, -0.02695324, -0.23060995, -0.22468437, -0.42096639,
       -0.08085812, -0.24301492, -0.02987061,  0.08050624,  0.20815186,
       -0.12030384, -0.16047259, -0.06198927,  0.13330367,  0.22599016,
        0.11328699, -0.28815445, -0.12276155, -0.04000972, -0.21454507,
        0.10115194,  0.16603956,  0.05688257, -0.33834636,  0.04537171,
        0.04124367,  0.10241794, -0.12286913,  0.05219069, -0.26418579,
        0.18314606,  0.10900255,  0.17516363, -0.3174037 ,  0.32

In [38]:
X[5].shape

(100,)

All the sentences now are represented by an array of 100 dimensions.

In [39]:
# Train and test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(X_train.shape, X_test.shape)

(4457, 100) (1115, 100)


In [40]:
# Random forest

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
lr.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [41]:
# Predictions and accuracy

y_pred = lr.predict(X_test)

score = accuracy_score(y_test, y_pred)
print("Accuracy: ", score)

print("Classification report:")
print(classification_report(y_test, y_pred))

print("Confusion matrix:")
confusion_matrix(y_test, y_pred)

Accuracy:  0.8914798206278027
Classification report:
              precision    recall  f1-score   support

           0       0.98      0.90      0.93       966
           1       0.56      0.86      0.68       149

    accuracy                           0.89      1115
   macro avg       0.77      0.88      0.81      1115
weighted avg       0.92      0.89      0.90      1115

Confusion matrix:


array([[866, 100],
       [ 21, 128]])