# Dependency Installation and Repository Cloning

### Run this cell only if you are using this notebook in Google Colab

In [None]:
# Clone the course repository, switch to the `hw2` branch and move into the
# `hw2` folder so the remaining commands run from the homework directory.
!git clone 'https://github.com/dakopecky/nlp-course-itmo.git'

%cd nlp-course-itmo
!git checkout hw2
%cd hw2

# Install Poetry, turn off its virtualenv creation, then install the project's
# dependencies (`--no-ansi` keeps the installer output free of ANSI codes).
!pip install poetry
!poetry config virtualenvs.create false
!poetry install --no-ansi

# From Text to Vectors: Data Transformation and Model Fitting

Import dependencies

In [1]:
# This code includes software developed by the following open-source projects:
# - Pandas (License: BSD-3-Clause License, Authors: Pandas Development Team)
# - tqdm (License: MIT License, Authors: Noam Yorav-Raphael)
# - scikit-learn (License: BSD License, Authors: scikit-learn Developers)
# - Jupyter Notebook (License: Modified BSD License, Authors: Project Jupyter)
# For the full license information, please see the `licenses` directory.


import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

Download the "Spam or not spam" dataset

In [35]:
# Quietly (-q) download the CSV and store it as 'spam-or-not-spam.csv',
# the file name the loading cell reads from.
!wget -q 'https://www.dropbox.com/scl/fi/8xeu3og5umjc7hfalrntu/spam_or_not_spam.csv?rlkey=utzaie3ti891ba80pbaryefzx&dl=1' -O 'spam-or-not-spam.csv'

Load the dataset

In [36]:
# Read the downloaded CSV and keep only the rows that have no missing values.
dataset_path = 'spam-or-not-spam.csv'
df = pd.read_csv(dataset_path).dropna()

Ensure reproducibility

In [37]:
# Single seed shared by the train/test split and the seeded classifiers
# (decision tree and logistic regression) so repeated runs give the same results.
RANDOM_STATE = 42

Split the dataset into training and test sets

In [38]:
# Inputs come from the 'email' column, targets from the 'label' column.
X, y = df['email'], df['label']

# Hold out 25% of the rows for testing; the split is seeded for repeatability.
# No `stratify` argument is passed, so class proportions are not forced to
# match between the two parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_STATE,
)

Initialize vectorizers

In [39]:
# Two text representations to compare, both created with library defaults:
# TF-IDF weighted features and plain token counts.
tfidf_vectorizer = TfidfVectorizer()
count_vectorizer = CountVectorizer()

Fit and transform the training data

In [40]:
# Fit each vectorizer on the training emails only and encode those same emails.
# The count vectorizer is processed first, then the TF-IDF one.
X_train_count, X_train_tfidf = (
    vectorizer.fit_transform(X_train)
    for vectorizer in (count_vectorizer, tfidf_vectorizer)
)

Transform the test data

In [41]:
# Encode the test emails with the already-fitted vectorizers; `transform`
# (not `fit_transform`) is used so nothing is re-fitted on the test set.
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_test_count = count_vectorizer.transform(X_test)

Initialize classifiers

In [42]:
# Candidate classifiers keyed by the display name used in the results report.
# Insertion order is the order in which they are trained and reported.
models = {
    # Seeded so that tree construction is repeatable.
    'DecisionTree': DecisionTreeClassifier(
        random_state=RANDOM_STATE,
    ),
    # Iteration cap set explicitly to 1000.
    'LogisticRegression': LogisticRegression(
        max_iter=1000,
        random_state=RANDOM_STATE,
    ),
    # Created with library defaults; `alpha` is tuned by the grid search.
    'NaiveBayes': MultinomialNB(),
}

Parameters for GridSearchCV

In [43]:
# Hyperparameter grids for GridSearchCV, keyed by the same names as `models`.
# Each model tunes exactly one hyperparameter over three candidate values.
params = {
    'DecisionTree': dict(max_depth=[10, 20, 30]),
    'LogisticRegression': dict(C=[0.1, 1, 10]),
    'NaiveBayes': dict(alpha=[0.1, 1, 10]),
}

Train models and tune hyperparameters

In [44]:
def _fit_and_score(estimator, param_grid, X_fit, X_eval, y_fit, y_eval):
    """Grid-search one estimator on one feature representation and score it.

    Parameters
    ----------
    estimator
        Classifier passed to ``GridSearchCV`` as the base estimator.
    param_grid : dict
        Hyperparameter grid to search.
    X_fit, y_fit
        Training features and labels; used for the 5-fold search and for the
        training-accuracy figure.
    X_eval, y_eval
        Held-out features and labels; used only for the test-accuracy figure.

    Returns
    -------
    tuple
        ``(grid, train_accuracy, test_accuracy)``, where ``grid`` is the
        fitted ``GridSearchCV`` object (exposes ``best_params_``).
    """
    grid = GridSearchCV(estimator, param_grid, cv=5)
    grid.fit(X_fit, y_fit)
    # Training accuracy is measured on the same rows the search was fitted on.
    train_accuracy = accuracy_score(y_fit, grid.predict(X_fit))
    test_accuracy = accuracy_score(y_eval, grid.predict(X_eval))
    return grid, train_accuracy, test_accuracy


# Maps model name -> accuracies and best hyperparameters per vectorizer.
results = {}

# Loop through models; every model is tuned twice, once per text representation.
for name, model in tqdm(models.items(), desc='Models', unit='model'):
    grid_count, acc_train_count, acc_test_count = _fit_and_score(
        model, params[name], X_train_count, X_test_count, y_train, y_test
    )
    grid_tfidf, acc_train_tfidf, acc_test_tfidf = _fit_and_score(
        model, params[name], X_train_tfidf, X_test_tfidf, y_train, y_test
    )

    # Key order is kept stable because the reporting cell prints entries
    # in insertion order.
    results[name] = {
        'Train CountVectorizer': acc_train_count,
        'Test CountVectorizer': acc_test_count,
        'Train TfidfVectorizer': acc_train_tfidf,
        'Test TfidfVectorizer': acc_test_tfidf,
        'Best Params CountVectorizer': grid_count.best_params_,
        'Best Params TfidfVectorizer': grid_tfidf.best_params_
    }

Models:   0%|          | 0/3 [00:00<?, ?model/s]

Output the results in a readable format

In [45]:
# Plain-text report per model: a "Model: <name>" header, one
# "<metric>: <value>" line per stored entry, then two blank lines.
for name, metrics in results.items():
    report_lines = [f"Model: {name}"]
    report_lines += [f"{metric}: {value}" for metric, value in metrics.items()]
    print("\n".join(report_lines))
    print("\n")

Model: DecisionTree
Train CountVectorizer: 0.9968875055580257
Test CountVectorizer: 0.9693333333333334
Train TfidfVectorizer: 0.9977767896843042
Test TfidfVectorizer: 0.976
Best Params CountVectorizer: {'max_depth': 20}
Best Params TfidfVectorizer: {'max_depth': 20}


Model: LogisticRegression
Train CountVectorizer: 0.9986660738105825
Test CountVectorizer: 0.9933333333333333
Train TfidfVectorizer: 0.9968875055580257
Test TfidfVectorizer: 0.9866666666666667
Best Params CountVectorizer: {'C': 1}
Best Params TfidfVectorizer: {'C': 10}


Model: NaiveBayes
Train CountVectorizer: 0.9968875055580257
Test CountVectorizer: 0.9946666666666667
Train TfidfVectorizer: 0.9933303690529124
Test TfidfVectorizer: 0.9826666666666667
Best Params CountVectorizer: {'alpha': 0.1}
Best Params TfidfVectorizer: {'alpha': 0.1}
