# Dependency Installation and Repository Cloning

### Run it if you're using this notebook in Google Colab

In [None]:
!git clone 'https://github.com/dakopecky/nlp-course-itmo.git'

%cd nlp-course-itmo
!git checkout hw3
%cd hw3

!pip install poetry
!poetry config virtualenvs.create false
!poetry install --no-ansi

# Dataset Loading and Preprocessing

Import deps

In [2]:
# This code includes software developed by the following open-source projects:
# - Pandas (License: BSD-3-Clause License, Authors: Pandas Development Team)
# - tqdm (License: MIT License, Authors: Noam Yorav-Raphael)
# - scikit-learn (License: BSD License, Authors: scikit-learn Developers)
# - nltk (License: Apache License 2.0, Authors: NLTK Project)
# - numpy (License: BSD-3-Clause license, Authors: NumPy Developers)
# - gensim (License: LGPL-2.1 License, Authors: Radim Rehurek, Petr Sojka and Gensim Contributors)
# - Jupyter Notebook (License: Modified BSD License, Authors: Project Jupyter)
# For the full license information, please see the `licenses` directory.


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models import FastText
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Download the "Spam or not spam" dataset

In [3]:
!wget -q 'https://www.dropbox.com/scl/fi/8xeu3og5umjc7hfalrntu/spam_or_not_spam.csv?rlkey=utzaie3ti891ba80pbaryefzx&dl=1' -O 'spam-or-not-spam.csv'

Load the dataset

In [4]:
dataset_path = 'spam-or-not-spam.csv'
df = pd.read_csv(dataset_path)
df.dropna(inplace=True)
df.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


Ensure reproducibility

In [5]:
RANDOM_STATE = 42

Split the dataset into training and test sets

In [6]:
X = df['email']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)

Text Preprocessing: Lemmatization & Stopwords Removal

In [None]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

In [8]:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum()]
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

X_train_preprocessed = pd.Series(dtype='object')
X_test_preprocessed = pd.Series(dtype='object')

for index, content in tqdm(X_train.items(), total=X_train.shape[0], desc='Preprocessing Train Set'):
    X_train_preprocessed.at[index] = preprocess_text(content)

for index, content in tqdm(X_test.items(), total=X_test.shape[0], desc='Preprocessing Test Set'):
    X_test_preprocessed.at[index] = preprocess_text(content)

X_train_preprocessed.head()

Preprocessing Train Set: 100%|██████████| 2249/2249 [00:18<00:00, 121.55it/s]
Preprocessing Test Set: 100%|██████████| 750/750 [00:02<00:00, 315.83it/s]


1064    title page ha login screen seem get apt index ...
662     thu number sep number mr fork wrote think went...
480     recently stumbled across content journal elect...
2136    url url date number number numbertnumber numbe...
2702    help find fund ltc alternative marketing agent...
dtype: object

# Vectorization

Word2Vec Vectorization (SkipGram & CBOW)

In [9]:
worker_num=1 # for reproducibility of results

In [10]:
word2vec_sg = Word2Vec(sentences=[sentence.split() for sentence in X_train_preprocessed], vector_size=100, window=5, sg=1, min_count=1, workers=worker_num, seed=RANDOM_STATE)
word2vec_cbow = Word2Vec(sentences=[sentence.split() for sentence in X_train_preprocessed], vector_size=100, window=5, sg=0, min_count=1, workers=worker_num, seed=RANDOM_STATE)

FastText Vectorization

In [11]:
fasttext_model = FastText(sentences=[sentence.split() for sentence in X_train_preprocessed], vector_size=100, window=5, min_count=1, workers=worker_num, seed=RANDOM_STATE)

Create Sentence Vectors for Training and Test Sets

In [13]:
def sentence_vector(sentence, model):
    words = sentence.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]

    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)

    word_vectors = np.array(word_vectors)
    return word_vectors.mean(axis=0)


models = [
    ('Word2Vec SG', word2vec_sg),
    ('Word2Vec CBOW', word2vec_cbow),
    ('FastText', fasttext_model)
]

X_train_vectors = {}
X_test_vectors = {}

for name, model in models:
    X_train_vectors[name] = np.array([sentence_vector(sentence, model) for sentence in tqdm(X_train_preprocessed, desc=f'(Train) Vectorizing with {name}')])
    X_test_vectors[name] = np.array([sentence_vector(sentence, model) for sentence in tqdm(X_test_preprocessed, desc=f'(Test) Vectorizing with {name}')])

(Train) Vectorizing with Word2Vec SG: 100%|██████████| 2249/2249 [00:00<00:00, 2699.63it/s]
(Test) Vectorizing with Word2Vec SG: 100%|██████████| 750/750 [00:00<00:00, 2682.53it/s]
(Train) Vectorizing with Word2Vec CBOW: 100%|██████████| 2249/2249 [00:00<00:00, 2483.13it/s]
(Test) Vectorizing with Word2Vec CBOW: 100%|██████████| 750/750 [00:00<00:00, 2699.82it/s]
(Train) Vectorizing with FastText: 100%|██████████| 2249/2249 [00:00<00:00, 2609.67it/s]
(Test) Vectorizing with FastText: 100%|██████████| 750/750 [00:00<00:00, 1636.51it/s]


# Logistic Regression Training

In [14]:
model_data = [
    ('sg', X_train_vectors['Word2Vec SG'], X_test_vectors['Word2Vec SG']),
    ('cbow', X_train_vectors['Word2Vec CBOW'], X_test_vectors['Word2Vec CBOW']),
    ('ft', X_train_vectors['FastText'], X_test_vectors['FastText'])
]

accuracies = {}

for name, X_train, X_test in model_data:
    lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    accuracies[name] = accuracy_score(y_test, y_pred)

Output Statistics

In [15]:
print("Evaluation Statistics")
print("----------------------")

for name, accuracy in accuracies.items():
    print(f"{name} Accuracy: {accuracy * 100:.2f}%")

Evaluation Statistics
----------------------
sg Accuracy: 97.07%
cbow Accuracy: 96.27%
ft Accuracy: 95.87%


# Intrinsic evaluation

Intrinsic evaluation using most_similar and doesnt_match methods

In [16]:
models = [
    ('Word2Vec SkipGram', word2vec_sg),
    ('Word2Vec CBOW', word2vec_cbow),
    ('fastText', fasttext_model)
]

for name, model in models:
    print(f"Words most similar to 'email' according to {name}:")
    most_similar_words = model.wv.most_similar('email')
    for word, similarity in most_similar_words:
        print(f"  {word}: {similarity:.4f}")
    print("\n")

    doesnt_match_word = model.wv.doesnt_match('email spam inbox'.split())
    print(f"Word that doesn't match in the list ['email', 'spam', 'inbox'] according to {name}: {doesnt_match_word}")
    print("\n")

Words most similar to 'email' according to Word2Vec SkipGram:
  assignee: 0.8037
  watching: 0.7937
  sending: 0.7875
  strongly: 0.7853
  opt: 0.7840
  unsolicited: 0.7805
  tesrewinter: 0.7780
  dump: 0.7747
  sponsored: 0.7742
  obtained: 0.7738


Word that doesn't match in the list ['email', 'spam', 'inbox'] according to Word2Vec SkipGram: email


Words most similar to 'email' according to Word2Vec CBOW:
  unsolicited: 0.9645
  sponsored: 0.9605
  thinkgeek: 0.9584
  targeted: 0.9553
  unsubscribe: 0.9549
  remove: 0.9510
  address: 0.9475
  send: 0.9472
  reply: 0.9468
  responsevivek: 0.9463


Word that doesn't match in the list ['email', 'spam', 'inbox'] according to Word2Vec CBOW: email


Words most similar to 'email' according to fastText:
  nail: 0.9933
  sendmail: 0.9932
  rmail: 0.9927
  snail: 0.9925
  omail: 0.9923
  qmail: 0.9902
  msmail: 0.9896
  emailer: 0.9896
  bmail: 0.9895
  jail: 0.9895


Word that doesn't match in the list ['email', 'spam', 'inbox'] according to