In [45]:
import pandas as pd
import numpy as np
import sklearn
import itertools
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, f1_score, precision_recall_curve, classification_report, confusion_matrix

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dassshark\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dassshark\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dassshark\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
df = pd.read_csv('data.csv')

In [5]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.info('data.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
df["Category"]= [1 if i == 'spam' else 0 for i in df['Category']]
#df["Category"] = np.where(df["Category"].str.contains("spam"), 1, 0)

## Подготовка текста к обучению

In [8]:
lemmatizer = WordNetLemmatizer()
rows = []
for row in df['Message']:
    row = re.sub("[^a-zA-Z]"," ",row)
    row = row.lower()
    row = nltk.word_tokenize(row)
    row = [word for word in row if not word in stopwords.words("english")]
    row = ' '.join([lemmatizer.lemmatize(word) for word in row])
    rows.append(row)

In [9]:
target = df['Category']
features = rows

## Разбивка на выборки и векторизация

In [10]:
features_train, features_valid, target_train, target_valid = train_test_split(features, target, test_size=0.25, random_state=12345)
features_valid, features_test, target_valid, target_test = train_test_split(features_valid, target_valid, test_size=0.5, random_state=12345)

In [15]:
hv = HashingVectorizer(n_features = 256, norm = None)
train_hv = hv.fit_transform(features_train).toarray()
test_hv = hv.transform(features_test).toarray()
valid_hv = hv.transform(features_valid).toarray()

In [25]:
cv = CountVectorizer()
train_cv = cv.fit_transform(features_train).toarray()
test_cv = cv.transform(features_test).toarray()
valid_cv = cv.transform(features_valid).toarray()

In [60]:
tfidf_vect= TfidfVectorizer()

## Логистическая регрессия и ее метрики

In [69]:
lgr = LogisticRegression(class_weight='balanced')
lgr = lgr.fit(train_cv, target_train)
pred = lgr.predict(valid_cv)

In [30]:
print(accuracy_score(target_valid, pred),
precision_score(target_valid, pred),
recall_score(target_valid, pred),
f1_score(target_valid, pred))

0.9798850574712644 0.9340659340659341 0.9139784946236559 0.9239130434782609


In [18]:
print(accuracy_score(target_valid, pred),
precision_score(target_valid, pred),
recall_score(target_valid, pred),
f1_score(target_valid, pred))

0.9281609195402298 0.6747967479674797 0.8924731182795699 0.7685185185185185


In [21]:
report = classification_report(target_valid, lgr.predict(valid_hv), target_names=['Non-churned', 'Churned'])
print(report)

              precision    recall  f1-score   support

 Non-churned       0.98      0.93      0.96       603
     Churned       0.67      0.89      0.77        93

    accuracy                           0.93       696
   macro avg       0.83      0.91      0.86       696
weighted avg       0.94      0.93      0.93       696



## Проверка сообщения

In [67]:
def check_message(mess: str, model):
    if model.predict(tfidf_vect.transform([mess])) == 0:
        return False
    else:
        return True