In [75]:
import re
import string
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.feature_extraction.text import TfidfVectorizer
from skfeature.function.similarity_based import fisher_score
from sklearn.feature_selection import chi2, SelectKBest, SequentialFeatureSelector, SelectFromModel

In [76]:
df = pd.read_csv('data/SMS.tsv', sep='\t')
df.head(10)

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [77]:
def ref_str(s):
    s = re.sub('\d+', '', s)
    s = s.translate(str.maketrans('', '', string.punctuation))
    s = re.sub('\s+', ' ', s)
    return s

df['class'] = pd.factorize(df['class'])[0]
text = []
for s in df['text']:
    text.append(ref_str(s))
    
n = 1000
vectorizer = TfidfVectorizer(max_features = n, stop_words=stopwords.words('english'))
X = pd.DataFrame(vectorizer.fit_transform(text).toarray(), columns=vectorizer.get_feature_names_out())
X_train, X_test, y_train, y_test = train_test_split(X, df['class'])
# n = X.shape[1]

Встроенный

In [78]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [79]:
res_rfc = pd.Series(rf.feature_importances_, X_train.columns).sort_values(ascending=False)
res_rfc

call        0.047344
txt         0.045999
free        0.045676
claim       0.038052
mobile      0.026863
              ...   
short       0.000000
hospital    0.000000
bathe       0.000000
mah         0.000000
studying    0.000000
Length: 1000, dtype: float64

обёртка

In [80]:
lr = LogisticRegression()

nof = 30
res_wrap = []
X_train_seq = X_train
while len(res_wrap) < nof:
    lr.fit(X_train_seq, y_train)
    model_fi = permutation_importance(lr, X_train_seq, y_train)
    max_value = max(model_fi['importances_mean'])
    max_index = np.argmax(model_fi['importances_mean'])
    print('Step ', (len(res_wrap) + 1), ', added ', X_train_seq.columns[max_index])
    res_wrap.append(X_train_seq.columns[max_index])
    X_train_seq = X_train_seq.drop(X_train_seq.columns[max_index], axis=1)

Step  1 , added  txt
Step  2 , added  stop
Step  3 , added  free
Step  4 , added  text
Step  5 , added  win
Step  6 , added  call
Step  7 , added  reply
Step  8 , added  mobile
Step  9 , added  claim
Step  10 , added  service
Step  11 , added  ur
Step  12 , added  new
Step  13 , added  nokia
Step  14 , added  customer
Step  15 , added  tone
Step  16 , added  prize
Step  17 , added  per
Step  18 , added  urgent
Step  19 , added  contact
Step  20 , added  im
Step  21 , added  admirer
Step  22 , added  secret
Step  23 , added  guaranteed
Step  24 , added  cash
Step  25 , added  please
Step  26 , added  attempt
Step  27 , added  ltgt
Step  28 , added  ok
Step  29 , added  ringtone
Step  30 , added  awarded


фильтр

In [81]:
X_train_filter = X_train
corr = X_train_filter.corr(method='pearson')['class']
res_filt = corr.sort_values(ascending=False)
res_filt

class     1.000000
whos      0.097243
coming    0.093596
go        0.088357
hours     0.063223
            ...   
come     -0.014501
free     -0.014744
ill      -0.015255
ur       -0.016921
brings         NaN
Name: class, Length: 1000, dtype: float64

In [82]:
print('Встроенный: ', np.array(res_rfc.index[:30]))
print('Oбертка: ', np.array(res_wrap))
print('Фильтр: ', np.array(res_filt.index[:30]))

Встроенный:  ['call' 'txt' 'free' 'claim' 'mobile' 'stop' 'reply' 'text' 'prize' 'win'
 'urgent' 'service' 'tone' 'nokia' 'box' 'chat' 'cash' 'awarded'
 'customer' 'ringtone' 'pmin' 'per' 'new' 'guaranteed' 'mins' 'landline'
 'contact' 'pobox' 'apply' 'video']
Oбертка:  ['txt' 'stop' 'free' 'text' 'win' 'call' 'reply' 'mobile' 'claim'
 'service' 'ur' 'new' 'nokia' 'customer' 'tone' 'prize' 'per' 'urgent'
 'contact' 'im' 'admirer' 'secret' 'guaranteed' 'cash' 'please' 'attempt'
 'ltgt' 'ok' 'ringtone' 'awarded']
Фильтр:  ['class' 'whos' 'coming' 'go' 'hours' 'ugh' 'voucher' 'finished' 'todays'
 'day' 'done' 'plz' 'using' 'saw' 'weed' 'theres' 'shower' 'came' 'water'
 'carlos' 'yeah' 'update' 'run' 'smoke' 'doesnt' 'right' 'sleep' 'st'
 'first' 'much']


Фильтр (фишер)

In [83]:
X_train_fish = X_train
ranks = fisher_score.fisher_score(X_train_fish.to_numpy(), y_train.to_numpy())
feature_importances = pd.Series(ranks, X_train_fish.columns)
res_fish = feature_importances.sort_values(ascending=True)[:30].index
res_fish

Index(['haha', 'expires', 'collect', 'phone', 'happy', 'trying', 'street',
       'heard', 'join', 'wit', 'yr', 'shall', 'starting', 'thought',
       'especially', 'promise', 'hiya', 'smile', 'kinda', 'forwarded', 'date',
       'cd', 'right', 'questions', 'wil', 'hold', 'loving', 'disturb', 'lect',
       'sea'],
      dtype='object')

Обертка (рекурсивное исключение)

In [84]:
X_train_rec = X_train
lr = LogisticRegression(class_weight = 'balanced')
rfe = RFE(lr, n_features_to_select=30)
rfe = rfe.fit(X_train_rec, y_train)

In [85]:
res_rec = X_train_rec.iloc[ : ,rfe.get_support()]
res_rec.columns

Index(['apply', 'awarded', 'box', 'call', 'cash', 'chat', 'claim', 'free',
       'ill', 'im', 'ltgt', 'mobile', 'nokia', 'ok', 'per', 'pmin', 'pobox',
       'prize', 'reply', 'ringtone', 'send', 'service', 'specialcall',
       'statement', 'stop', 'text', 'tones', 'txt', 'urgent', 'win'],
      dtype='object')

Встроенный (регуляризация)

In [86]:
X_train_emb = X_train
select = SelectFromModel(LogisticRegression(penalty='l1',solver='liblinear', random_state=9))
select.fit(X_train_emb, y_train)
res_emb = X_train_emb.iloc[ : , select.get_support()]
res_emb.columns

Index(['access', 'admirer', 'apply', 'attempt', 'auction', 'award', 'awarded',
       'box', 'call', 'calls',
       ...
       'waiting', 'wap', 'way', 'weeks', 'welcome', 'win', 'wk', 'word',
       'xmas', 'xx'],
      dtype='object', length=132)

In [87]:
print('Мои: ')
print(sorted(np.array(res_rfc.index[-30:])))
print(sorted(np.array(res_wrap)))
print(sorted(np.array(res_filt.index[:30])))
print('Библиотечные: ')
print(sorted(np.array(res_fish)))
print(sorted(np.array(res_rec.columns)))
print(sorted(np.array(res_emb.columns[:30])))

Мои: 
['ass', 'bathe', 'brings', 'callertune', 'catch', 'cut', 'empty', 'gal', 'gods', 'ha', 'hospital', 'kind', 'lazy', 'leaves', 'lect', 'loved', 'loving', 'mah', 'mrt', 'mu', 'outside', 'pizza', 'police', 'sch', 'short', 'studying', 'train', 'wishes', 'woke', 'worried']
['admirer', 'attempt', 'awarded', 'call', 'cash', 'claim', 'contact', 'customer', 'free', 'guaranteed', 'im', 'ltgt', 'mobile', 'new', 'nokia', 'ok', 'per', 'please', 'prize', 'reply', 'ringtone', 'secret', 'service', 'stop', 'text', 'tone', 'txt', 'ur', 'urgent', 'win']
['came', 'carlos', 'class', 'coming', 'day', 'doesnt', 'done', 'finished', 'first', 'go', 'hours', 'much', 'plz', 'right', 'run', 'saw', 'shower', 'sleep', 'smoke', 'st', 'theres', 'todays', 'ugh', 'update', 'using', 'voucher', 'water', 'weed', 'whos', 'yeah']
Библиотечные: 
['cd', 'collect', 'date', 'disturb', 'especially', 'expires', 'forwarded', 'haha', 'happy', 'heard', 'hiya', 'hold', 'join', 'kinda', 'lect', 'loving', 'phone', 'promise', 'quest

In [88]:
classifiers = {
    'LogisticRegression': LogisticRegression(), 
    'Knn': KNeighborsClassifier(n_neighbors=40), 
    'RandomForestClassifier': RandomForestClassifier()
}

X_train_in_emb = pd.DataFrame(X_train, columns = np.array(res_rfc.index[-30:]))
X_train_wrap = pd.DataFrame(X_train, columns = np.array(res_wrap))
X_train_filt = pd.DataFrame(X_train, columns = np.array(res_filt.index[-30:-1])[::-1])
X_train_fish = pd.DataFrame(X_train, columns = np.array(res_fish))
X_train_rec = pd.DataFrame(X_train, columns = np.array(res_rec.columns))
X_train_res_emb = pd.DataFrame(X_train, columns = np.array(res_emb.columns))

X_test_in_emb = pd.DataFrame(X_test, columns = np.array(res_rfc.index[-30:]))
X_test_wrap = pd.DataFrame(X_test, columns = np.array(res_wrap))
X_test_filt = pd.DataFrame(X_test, columns = np.array(res_filt.index[-30:-1])[::-1])
X_test_fish = pd.DataFrame(X_test, columns = np.array(res_fish))
X_test_rec = pd.DataFrame(X_test, columns = np.array(res_rec.columns))
X_test_res_emb = pd.DataFrame(X_test, columns = np.array(res_emb.columns))

In [89]:
def print_res(c_name, classifier):
    print(c_name)
    
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test) 
    print('До выбора признаков: ', accuracy_score(y_test, y_pred))
    
    print('После выбора признаков')
    
    classifier.fit(X_train_in_emb, y_train)
    y_pred = classifier.predict(X_test_in_emb) 
    print('Random Forest: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_wrap, y_train)
    y_pred = classifier.predict(X_test_wrap) 
    print('Wrap (sequential): ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_filt, y_train)
    y_pred = classifier.predict(X_test_filt) 
    print('Correlation: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_fish, y_train)
    y_pred = classifier.predict(X_test_fish) 
    print('Фишер: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_rec, y_train)
    y_pred = classifier.predict(X_test_rec) 
    print('Рекурсивное исключение: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_res_emb, y_train)
    y_pred = classifier.predict(X_test_res_emb) 
    print('Регуляризация: ', accuracy_score(y_test, y_pred))
    
    print()
    

In [90]:
for cn, cl in classifiers.items():
    print_res(cn, cl)

LogisticRegression
До выбора признаков:  0.9770279971284996
После выбора признаков
Random Forest:  0.8779612347451543
Wrap (sequential):  0.9310839913854989
Correlation:  0.9016511127063891
Фишер:  0.87724335965542
Рекурсивное исключение:  0.9339554917444365
Регуляризация:  0.9612347451543432

Knn
До выбора признаков:  0.8779612347451543
После выбора признаков
Random Forest:  0.8779612347451543
Wrap (sequential):  0.9260588657573582
Correlation:  0.9325197415649676
Фишер:  0.8801148600143575
Рекурсивное исключение:  0.9339554917444365
Регуляризация:  0.8880114860014358

RandomForestClassifier
До выбора признаков:  0.9813352476669059
После выбора признаков
Random Forest:  0.8779612347451543
Wrap (sequential):  0.9569274946159368
Correlation:  0.9511844938980617
Фишер:  0.8930366116295765
Рекурсивное исключение:  0.9562096195262024
Регуляризация:  0.9748743718592965

