In [1]:
import pandas as pd
import numpy as np
import re
import string
import spacy
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

!python3 -m spacy download en_core_web_sm

pd.set_option('max_colwidth', 400)

import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


**Предобработка текстов** (удаление пустых строк, знаков препинания и других символов; приведение к нижнему регистру)

In [2]:
df = pd.read_csv('spam_or_not_spam.csv', encoding='iso-8859-1')
df.dropna(inplace=True)
df.reset_index(drop=True)

df['email'] = df['email'].replace(r'[^\w\s]',' ',regex=True).replace(r'\s+',' ',regex=True).str.lower()
df['email'].head()

0     date wed number aug number number number number number from chris garrigues cwg dated number numberfanumberd deepeddy com message id number number tmda deepeddy vircio com i can t reproduce this error for me it is very repeatable like every time without fail this is the debug log of the pick happening number number number pick_it exec pick inbox list lbrace lbrace subject ftp rbrace rbrace nu...
1    martin a posted tassos papadopoulos the greek sculptor behind the plan judged that the limestone of mount kerdylio number miles east of salonika and not far from the mount athos monastic community was ideal for the patriotic sculpture as well as alexander s granite features number ft high and number ft wide a museum a restored amphitheatre and car park for admiring crowds are planned so is thi...
2    man threatens explosion in moscow thursday august number number number number pm moscow ap security officers on thursday seized an unidentified man who said he was armed with explosives

In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Токенизация, лемматизация и удаление стоп слов**

In [4]:
stop_words = set(stopwords.words('english'))

# Функция для удаления стоп-слов и лемматизации текста
def process_text(text):
    # Токенизация текста на отдельные слова
    tokens = nltk.word_tokenize(text)

    # Удаление стоп-слов
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Лемматизация токенов
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Склеивание лемматизированных токенов обратно в текст
    filtered_text = ' '.join(lemmatized_tokens)

    return filtered_text

# Применение функции к столбцу 'email'
df['email'] = df['email'].apply(process_text)
df.head()



Unnamed: 0,email,label
0,date wed number aug number number number number number chris garrigues cwg dated number numberfanumberd deepeddy com message id number number tmda deepeddy vircio com reproduce error repeatable like every time without fail debug log pick happening number number number pick_it exec pick inbox list lbrace lbrace subject ftp rbrace rbrace number number sequence mercury number number number exec p...,0
1,martin posted tasso papadopoulos greek sculptor behind plan judged limestone mount kerdylio number mile east salonika far mount athos monastic community ideal patriotic sculpture well alexander granite feature number ft high number ft wide museum restored amphitheatre car park admiring crowd planned mountain limestone granite limestone weather pretty fast yahoo group sponsor number dvd free p ...,0
2,man threatens explosion moscow thursday august number number number number pm moscow ap security officer thursday seized unidentified man said armed explosive threatened blow truck front russia federal security service headquarters moscow ntv television reported officer seized automatic rifle man carrying man got truck taken custody ntv said detail immediately available man demanded talk high ...,0
3,klez virus die already prolific virus ever klez continues wreak havoc andrew brandt september number issue pc world magazine posted thursday august number number klez worm approaching seventh month wriggling across web making one persistent virus ever expert warn may harbinger new virus use combination pernicious approach go pc pc antivirus software maker symantec mcafee report number new infe...,0
4,adding cream spaghetti carbonara effect pasta making pizza deep pie jump carbonara one favourite make ask hell supposed use instead cream never seen recipe used personally use low fat creme fraiche work quite nicely time seen supposedly authentic recipe carbonara identical mine cream egg lot fresh parmesan except creme fraiche stew stewart smith scottish microelectronics centre university edin...,0


In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df['email'], df['label'], random_state=2023)

**Векторизация**

In [17]:
count_vectorizer = CountVectorizer(max_df=0.7, min_df=0.003)
X_train_cv = count_vectorizer.fit_transform(X_train)
X_test_cv = count_vectorizer.transform(X_test)

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

data = {'CountVectorizer': [X_train_cv, X_test_cv],
        'TfidfVectorizrer': [X_train_tfidf, X_test_tfidf]}

**Функция для подбора лучших гиперпараметров для каждой из векторизаций**

In [24]:
from sklearn.metrics import classification_report

def test_vectorizers(grid):
  for vectorizer in data:
    print('Vectorizer: ', vectorizer)
    model_grid = grid.fit(data[vectorizer][0], y_train)

    print(model_grid.best_params_)

    best_model = model_grid.best_estimator_

    y_pred = best_model.predict(data[vectorizer][1])
    print(classification_report(y_test, y_pred))


**Decision Tree**

In [25]:
decision_tree = DecisionTreeClassifier()

max_depth = [2,4,6,8,10,12]

grid_space={'max_depth':max_depth,
            'min_samples_split': [7, 5, 10, 15]
           }

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(decision_tree,param_grid=grid_space)
test_vectorizers(grid)


Vectorizer:  count
{'criterion': 'entropy', 'max_depth': 12, 'min_samples_split': 5}
DecisionTreeClassifier(criterion='entropy', max_depth=12, min_samples_split=5,
                       random_state=42)
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       616
           1       0.86      0.90      0.88       134

    accuracy                           0.96       750
   macro avg       0.92      0.93      0.93       750
weighted avg       0.96      0.96      0.96       750

Vectorizer:  tfidf
{'criterion': 'entropy', 'max_depth': 12, 'min_samples_split': 10}
DecisionTreeClassifier(criterion='entropy', max_depth=12, min_samples_split=10,
                       random_state=42)
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       616
           1       0.83      0.89      0.86       134

    accuracy                           0.95       750
   macro avg       0.90      0.92      0.91  

**Logistic Regression**

In [40]:
logreg = LogisticRegression()

grid_space={
    'C': [100, 10, 1.0, 0.1],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'lbfgs']}

grid = GridSearchCV(logreg, param_grid=grid_space)
test_vectorizers(grid)

Vectorizer:  count


20 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.98976986 0.98976986 0.98621133     

{'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
LogisticRegression(C=10, solver='liblinear')
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       616
           1       0.99      0.96      0.97       134

    accuracy                           0.99       750
   macro avg       0.99      0.98      0.98       750
weighted avg       0.99      0.99      0.99       750

Vectorizer:  tfidf
{'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
LogisticRegression(C=100, penalty='l1', solver='liblinear')
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       616
           1       0.96      0.94      0.95       134

    accuracy                           0.98       750
   macro avg       0.97      0.97      0.97       750
weighted avg       0.98      0.98      0.98       750



20 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.98176194 0.9813175  0.95464093     

**Multionomial NB**

In [41]:
nb= MultinomialNB()

grid_space={
    'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001) }

grid = GridSearchCV(nb, param_grid=grid_space)
test_vectorizers(grid)

Vectorizer:  count
{'alpha': 0.1}
MultinomialNB(alpha=0.1)
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       616
           1       0.97      0.96      0.97       134

    accuracy                           0.99       750
   macro avg       0.98      0.98      0.98       750
weighted avg       0.99      0.99      0.99       750

Vectorizer:  tfidf
{'alpha': 0.01}
MultinomialNB(alpha=0.01)
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       616
           1       0.99      0.92      0.95       134

    accuracy                           0.98       750
   macro avg       0.99      0.96      0.97       750
weighted avg       0.98      0.98      0.98       750

