### Импорт модулей

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from string import punctuation
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import classification_report, f1_score
import xgboost
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE

In [2]:
pd.set_option('max_colwidth', None)

In [3]:
data = pd.read_csv('data.csv')

In [4]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


### Очистка текста

In [5]:
def preprocess_text(text):
    text = re.sub(r'[0-9]+|\W+',' ', str(text))
    #tokens = Mystem().lemmatize(text.lower())
    tokens = [token for token in text.split() if token not in stopwords.words('english')\
              and token != " " \
              and token.strip() not in punctuation]
    text = " ".join(tokens)
    return text

In [6]:
clean_text = data['Message'].apply(preprocess_text)

### Создание мешка слов

In [7]:
def evaluate_model(X,y,clf):
    
    scores = np.mean(cross_val_score(clf, X, y, cv = StratifiedKFold(n_splits = 5), scoring = 'f1_macro'))
        
    return np.mean(scores) 

In [8]:
CV = CountVectorizer(ngram_range=(1,2))
X_cv = CV.fit_transform(clean_text)

TFIDF = TfidfVectorizer(ngram_range=(1,2))
X_tfidf = TFIDF.fit_transform(clean_text)

y = data.Category

In [9]:
clf = XGBClassifier()
print('CV - ', evaluate_model(X_cv, y, clf))
print('TF-IDF - ', evaluate_model(X_tfidf, y, clf))

CV -  0.9489322505405695
TF-IDF -  0.9468872197628316


У CountVectorizer показатель выше, будем использовать его

### Выбор модели

In [10]:
log = LogisticRegression()
dec = DecisionTreeClassifier()
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()

In [11]:
solvers = {'XGBClassifier' : clf,  \
           'LogisticRegression' : log, \
           'DecisionTreeClassifier' : dec, \
           'RandomForestClassifier' : rf, \
           'GradientBoostingClassifier' : gb}

In [12]:
for k in solvers:
    print(k, evaluate_model(X_cv, y, solvers[k]))

XGBClassifier 0.9489322505405695
LogisticRegression 0.9526600393654036
DecisionTreeClassifier 0.9301894843588286
RandomForestClassifier 0.9234124872262338
GradientBoostingClassifier 0.9187873836297993


Выбираем LogisticRegression

### Подбор параметров

In [13]:
grid = {'C' : np.logspace(-3,3,7), \
        'penalty' : ['l1', 'l2'], \
        'solver' : ['liblinear']}

In [14]:
log_grid = GridSearchCV(log ,grid ,cv = 5)

In [15]:
X_train , X_test, y_train, y_test = train_test_split(X_cv, y, test_size = 0.15, stratify = y)

In [16]:
log_grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [17]:
log_grid.best_params_

{'C': 1000.0, 'penalty': 'l1', 'solver': 'liblinear'}

In [18]:
log_params = LogisticRegression(C = 1000, penalty = 'l1', solver = 'liblinear', random_state = 27)

In [19]:
print(evaluate_model(X_cv, y, log_params))

0.9627496179922599


### Подбор количества фичей

In [20]:
n_features = [1000*i for i in range(10, 41)]

In [21]:
feature_scores = {}

for n in n_features: 
    
    CV_feat = CountVectorizer(ngram_range=(1,2), max_features = n)
    X_cv_feat = CV_feat.fit_transform(clean_text)
    
    feature_scores[n] = evaluate_model(X_cv_feat, y, log_params)
    #print(n, evaluate_model(X_cv_feat, y, log_params))

In [22]:
n = max(feature_scores, key = feature_scores.get)
print(n)

38000


### Предикт на оптимальных параметрах

vectorizer - CountVectorizer c ngram_range=(1,2) <br>

classifier - LogisticRegression <br>

parameters - {'C': 1000.0, 'penalty': 'l1', 'solver': 'liblinear'} <br>

n_features - 38000

In [23]:
CV_pred = CountVectorizer(ngram_range=(1,2), max_features = n)
X_cv_pred = CV.fit_transform(clean_text)

log_params = LogisticRegression(C = 1000, penalty = 'l1', solver = 'liblinear', random_state = 27)

X_train , X_test, y_train, y_test = train_test_split(X_cv_pred, y, test_size = 0.15, stratify = y, random_state = 27)

log_params.fit(X_train, y_train)
y_pred = log_params.predict(X_test)


print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       724
        spam       0.97      0.87      0.92       112

    accuracy                           0.98       836
   macro avg       0.97      0.93      0.95       836
weighted avg       0.98      0.98      0.98       836

