In [12]:
import pandas as pd
import numpy as np
import re
import string
import spacy
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

!python3 -m spacy download en_core_web_sm

pd.set_option('max_colwidth', 400)

import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


**Предобработка текстов** (удаление пустых строк, знаков препинания и других символов; приведение к нижнему регистру)

In [2]:
df = pd.read_csv('spam_or_not_spam.csv', encoding='iso-8859-1')
df.dropna(inplace=True)
df.reset_index(drop=True)

df['email'] = df['email'].replace(r'[^\w\s]',' ',regex=True).replace(r'\s+',' ',regex=True).str.lower()
df['email'].head()

0     date wed number aug number number number number number from chris garrigues cwg dated number numberfanumberd deepeddy com message id number number tmda deepeddy vircio com i can t reproduce this error for me it is very repeatable like every time without fail this is the debug log of the pick happening number number number pick_it exec pick inbox list lbrace lbrace subject ftp rbrace rbrace nu...
1    martin a posted tassos papadopoulos the greek sculptor behind the plan judged that the limestone of mount kerdylio number miles east of salonika and not far from the mount athos monastic community was ideal for the patriotic sculpture as well as alexander s granite features number ft high and number ft wide a museum a restored amphitheatre and car park for admiring crowds are planned so is thi...
2    man threatens explosion in moscow thursday august number number number number pm moscow ap security officers on thursday seized an unidentified man who said he was armed with explosives

In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Токенизация, лемматизация и удаление стоп слов**

In [4]:
stop_words = set(stopwords.words('english'))

# Функция для удаления стоп-слов и лемматизации текста
def process_text(text):
    # Токенизация текста на отдельные слова
    tokens = nltk.word_tokenize(text)

    # Удаление стоп-слов
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Лемматизация токенов
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Склеивание лемматизированных токенов обратно в текст
    filtered_text = ' '.join(lemmatized_tokens)

    return filtered_text

# Применение функции к столбцу 'email'
df['email'] = df['email'].apply(process_text)
df.head()



Unnamed: 0,email,label
0,date wed number aug number number number number number chris garrigues cwg dated number numberfanumberd deepeddy com message id number number tmda deepeddy vircio com reproduce error repeatable like every time without fail debug log pick happening number number number pick_it exec pick inbox list lbrace lbrace subject ftp rbrace rbrace number number sequence mercury number number number exec p...,0
1,martin posted tasso papadopoulos greek sculptor behind plan judged limestone mount kerdylio number mile east salonika far mount athos monastic community ideal patriotic sculpture well alexander granite feature number ft high number ft wide museum restored amphitheatre car park admiring crowd planned mountain limestone granite limestone weather pretty fast yahoo group sponsor number dvd free p ...,0
2,man threatens explosion moscow thursday august number number number number pm moscow ap security officer thursday seized unidentified man said armed explosive threatened blow truck front russia federal security service headquarters moscow ntv television reported officer seized automatic rifle man carrying man got truck taken custody ntv said detail immediately available man demanded talk high ...,0
3,klez virus die already prolific virus ever klez continues wreak havoc andrew brandt september number issue pc world magazine posted thursday august number number klez worm approaching seventh month wriggling across web making one persistent virus ever expert warn may harbinger new virus use combination pernicious approach go pc pc antivirus software maker symantec mcafee report number new infe...,0
4,adding cream spaghetti carbonara effect pasta making pizza deep pie jump carbonara one favourite make ask hell supposed use instead cream never seen recipe used personally use low fat creme fraiche work quite nicely time seen supposedly authentic recipe carbonara identical mine cream egg lot fresh parmesan except creme fraiche stew stewart smith scottish microelectronics centre university edin...,0


In [49]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df['email'], df['label'], random_state=2023)

In [50]:
count_vectorizer = CountVectorizer(max_df=0.7, min_df=0.003)
X_train_cv = count_vectorizer.fit_transform(X_train)
X_test_cv = count_vectorizer.transform(X_test)

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [61]:
data = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

data['idf_weight'] = data.sum()
data = data[['idf_weight']].sort_values(by='idf_weight')

# Reset the index to use words as rows
data = data.reset_index()
data = data.rename(columns={'index': 'Words'})

# Display the DataFrame sorted by idf_weight
print(data)

      Words  idf_weight
0         0         NaN
1         1         NaN
2         2         NaN
3         3         NaN
4         4         NaN
...     ...         ...
2244   2244         NaN
2245   2245         NaN
2246   2246         NaN
2247   2247         NaN
2248   2248         NaN

[2249 rows x 2 columns]


In [54]:
pd.DataFrame(X_train_cv.toarray(), columns=count_vectorizer.get_feature_names_out()).head()

Unnamed: 0,_______________________________________________,__________________________________________________,_________________________________________________________________,_thanks,_via,_will_,aa,aaron,abandoned,ability,...,zero,ziggy,zip,zone,zope,zzzz,ªå,äº,ä½,å¾
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
from sklearn.metrics import classification_report

logreg_cv = LogisticRegression().fit(X_train_cv, y_train)
preds_cv = logreg_cv.predict(X_test_cv)
print(classification_report(y_test, preds_cv))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       616
           1       0.98      0.96      0.97       134

    accuracy                           0.99       750
   macro avg       0.98      0.98      0.98       750
weighted avg       0.99      0.99      0.99       750



In [37]:
logreg_tfidf = LogisticRegression().fit(X_train_tfidf, y_train)
preds_tfidf = logreg_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, preds_tfidf))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       616
           1       1.00      0.75      0.86       134

    accuracy                           0.96       750
   macro avg       0.97      0.88      0.92       750
weighted avg       0.96      0.96      0.95       750



In [18]:
decision_tree_cv = DecisionTreeClassifier(random_state=42)

Decision Tree cross-validation scores, cv=5: [0.94444444 0.95555556 0.94666667 0.93777778 0.94877506]
0.9466439000247464
Decision Tree cross-validation scores, cv=4: [0.94671403 0.95551601 0.9341637  0.95551601]
0.9479774403772369
Decision Tree cross-validation scores, cv=3: [0.94       0.968      0.94526035]
0.951086782376502


In [44]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

pipe = Pipeline(
    steps=[
        ('counter', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression())
    ]
)

parameter_grid = {
    "counter__max_df": np.linspace(0.3, 0.7, 10),
    "counter__min_df": [0.0, 0.001, 0.003, 0.005],
    "counter__ngram_range": ((1, 1), (1, 2)),  # слова или биграммы
    "tfidf__norm": ("l1", "l2"),
    "clf__C": np.linspace(0.1, 1, 10),
}

grid_search = HalvingGridSearchCV(
    pipe,
    param_grid=parameter_grid,
    n_jobs=-1,
    verbose=1,
    cv=2,
    scoring='accuracy',
    random_state=42,
)
grid_search.fit(X_train, y_train)

n_iterations: 6
n_required_iterations: 7
n_possible_iterations: 6
min_resources_: 8
max_resources_: 2249
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1600
n_resources: 8
Fitting 2 folds for each of 1600 candidates, totalling 3200 fits


1600 fits failed out of a total of 3200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
160 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/usr/local/lib/python3.10/dist-packages/joblib/memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "/usr/

----------
iter: 1
n_candidates: 534
n_resources: 24
Fitting 2 folds for each of 534 candidates, totalling 1068 fits




----------
iter: 2
n_candidates: 178
n_resources: 72
Fitting 2 folds for each of 178 candidates, totalling 356 fits




----------
iter: 3
n_candidates: 60
n_resources: 216
Fitting 2 folds for each of 60 candidates, totalling 120 fits




----------
iter: 4
n_candidates: 20
n_resources: 648
Fitting 2 folds for each of 20 candidates, totalling 40 fits




----------
iter: 5
n_candidates: 7
n_resources: 1944
Fitting 2 folds for each of 7 candidates, totalling 14 fits




In [19]:
decision_tree_cv.fit(X_train_cv, y_train)
decision_tree_cv_test_score = decision_tree_cv.score(X_test_cv, y_test)


print("Decision Tree Test Score:", decision_tree_cv_test_score)

Decision Tree Test Score: 0.9506666666666667


In [9]:
decision_tree_tfidf = DecisionTreeClassifier(random_state=42)

decision_tree_tfidf_scores = cross_val_score(decision_tree_tfidf, X_train_tfidf, y_train, cv=5)

decision_tree_tfidf.fit(X_train_tfidf, y_train)
decision_tree_tfidf_test_score = decision_tree_tfidf.score(X_test_tfidf, y_test)

print("Decision Tree Tf-idf Scores:", decision_tree_tfidf_scores)

print("Decision Tree Tf-idf Test Score:", decision_tree_tfidf_test_score)


Decision Tree Tf-idf Scores: [0.95333333 0.94444444 0.95111111 0.94       0.95768374]
Decision Tree Tf-idf Test Score: 0.952
