In [21]:
import re
import string
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.feature_extraction.text import TfidfVectorizer
from skfeature.function.similarity_based import fisher_score
from sklearn.feature_selection import chi2, SelectKBest, SequentialFeatureSelector, SelectFromModel

In [22]:
df = pd.read_csv('data/SMS.tsv', sep='\t')
df.head(10)

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [23]:
def ref_str(s):
    s = re.sub('\d+', '', s)
    s = s.translate(str.maketrans('', '', string.punctuation))
    s = re.sub('\s+', ' ', s)
    return s

df['class'] = pd.factorize(df['class'])[0]
text = []
for s in df['text']:
    text.append(ref_str(s))
    
n = 400
vectorizer = TfidfVectorizer(max_features=n, stop_words=stopwords.words('english'))
X = pd.DataFrame(vectorizer.fit_transform(text).toarray(), columns=vectorizer.get_feature_names_out())
X_train, X_test, y_train, y_test = train_test_split(X, df['class'])

Встроенный

In [24]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [25]:
res_rfc = pd.Series(rf.feature_importances_, X_train.columns).sort_values(ascending=False)
res_rfc

call      8.153658e-02
txt       5.286733e-02
free      4.208282e-02
claim     4.147199e-02
mobile    3.041530e-02
              ...     
fuck      1.259345e-06
den       7.713588e-07
pain      1.837305e-07
making    1.031264e-07
anyway    8.341038e-08
Length: 400, dtype: float64

обёртка

In [26]:
lr = LogisticRegression()

nof = 30
res_wrap = []
X_train_seq = X_train
while len(res_wrap) < nof:
    lr.fit(X_train_seq, y_train)
    model_fi = permutation_importance(lr, X_train_seq, y_train)
    max_value = max(model_fi['importances_mean'])
    max_index = np.argmax(model_fi['importances_mean'])
    print('Step ', (len(res_wrap) + 1), ', added ', X_train_seq.columns[max_index])
    res_wrap.append(X_train_seq.columns[max_index])
    X_train_seq = X_train_seq.drop(X_train_seq.columns[max_index], axis=1)

Step  1 , added  call
Step  2 , added  txt
Step  3 , added  stop
Step  4 , added  claim
Step  5 , added  mobile
Step  6 , added  free
Step  7 , added  text
Step  8 , added  reply
Step  9 , added  service
Step  10 , added  ur
Step  11 , added  win
Step  12 , added  new
Step  13 , added  chat
Step  14 , added  play
Step  15 , added  nokia
Step  16 , added  latest
Step  17 , added  orange
Step  18 , added  customer
Step  19 , added  tone
Step  20 , added  pobox
Step  21 , added  contact
Step  22 , added  per
Step  23 , added  ringtone
Step  24 , added  prize
Step  25 , added  guaranteed
Step  26 , added  code
Step  27 , added  mob
Step  28 , added  box
Step  29 , added  po
Step  30 , added  cash


фильтр

In [27]:
X_train_filter = X_train
corr = X_train_filter.corr(method='pearson')['class']
res_filt = corr.sort_values(ascending=False)
res_filt

class     1.000000
todays    0.043692
update    0.041912
early     0.040142
came      0.039478
            ...   
need     -0.013145
send     -0.013920
good     -0.015184
free     -0.015370
ur       -0.016886
Name: class, Length: 400, dtype: float64

In [28]:
print('Встроенный: ', np.array(res_rfc.index[:30]))
print('Oбертка: ', np.array(res_wrap))
print('Фильтр: ', np.array(res_filt.index[:30]))

Встроенный:  ['call' 'txt' 'free' 'claim' 'mobile' 'text' 'reply' 'stop' 'win' 'prize'
 'service' 'contact' 'nokia' 'box' 'customer' 'ringtone' 'chat' 'urgent'
 'tone' 'new' 'cash' 'guaranteed' 'orange' 'ur' 'pobox' 'message' 'tones'
 'send' 'latest' 'cost']
Oбертка:  ['call' 'txt' 'stop' 'claim' 'mobile' 'free' 'text' 'reply' 'service' 'ur'
 'win' 'new' 'chat' 'play' 'nokia' 'latest' 'orange' 'customer' 'tone'
 'pobox' 'contact' 'per' 'ringtone' 'prize' 'guaranteed' 'code' 'mob'
 'box' 'po' 'cash']
Фильтр:  ['class' 'todays' 'update' 'early' 'came' 'go' 'able' 'coming' 'st' 'saw'
 'join' 'theres' 'run' 'yeah' 'doesnt' 'later' 'ltgt' 'plz' 'minutes'
 'day' 'sleep' 'holiday' 'whats' 'yo' 'hi' 'might' 'went' 'much' 'name'
 'got']


Фильтр (фишер)

In [29]:
X_train_fish = X_train
ranks = fisher_score.fisher_score(X_train_fish.to_numpy(), y_train.to_numpy())
feature_importances = pd.Series(ranks, X_train_fish.columns)
res_fish = feature_importances.sort_values(ascending=True)[:30].index
res_fish

Index(['line', 'kiss', 'may', 'latest', 'use', 'wanna', 'true', 'wk', 'god',
       'min', 'liao', 'entry', 'later', 'us', 'st', 'orange', 'ill', 'coming',
       'stop', 'da', 'send', 'much', 'around', 'sleep', 'sexy', 'sir', 'id',
       'shit', 'haf', 'best'],
      dtype='object')

Обертка (рекурсивное исключение)

In [46]:
X_train_rec = X_train
# lr = LogisticRegression(class_weight = 'balanced', solver = 'lbfgs', random_state=42, n_jobs=-1, max_iter=50)
lr = LogisticRegression(class_weight = 'balanced')
rfe = RFE(lr, n_features_to_select=30)
rfe = rfe.fit(X_train_rec, y_train)

In [47]:
res_rec = X_train_rec.iloc[ : ,rfe.get_support()]
res_rec.columns

Index(['apply', 'awarded', 'box', 'call', 'cash', 'chat', 'claim', 'contact',
       'free', 'ill', 'ltgt', 'mobile', 'new', 'orange', 'pobox', 'prize',
       'receive', 'reply', 'ringtone', 'send', 'service', 'sexy', 'stop',
       'text', 'tone', 'tones', 'txt', 'urgent', 'video', 'win'],
      dtype='object')

Встроенный (регуляризация)

In [48]:
X_train_emb = X_train
select = SelectFromModel(LogisticRegression(penalty='l1',solver='liblinear', random_state=9))
select.fit(X_train_emb, y_train)
res_emb = X_train_emb.iloc[ : , select.get_support()]
res_emb.columns

Index(['account', 'apply', 'ask', 'attempt', 'award', 'awarded', 'back', 'box',
       'call', 'calls', 'camera', 'cant', 'cash', 'chance', 'chat', 'claim',
       'code', 'collection', 'come', 'contact', 'cost', 'customer', 'da',
       'draw', 'end', 'enjoy', 'find', 'free', 'friends', 'fun', 'gift',
       'happy', 'help', 'hey', 'holiday', 'home', 'ill', 'im', 'important',
       'landline', 'later', 'latest', 'line', 'live', 'looking', 'lor', 'ltgt',
       'message', 'mob', 'mobile', 'money', 'msg', 'name', 'network', 'new',
       'next', 'nokia', 'ok', 'orange', 'part', 'per', 'phone', 'play',
       'please', 'pls', 'pm', 'pmin', 'po', 'pobox', 'ppm', 'prize', 'real',
       'receive', 'reply', 'ringtone', 'send', 'service', 'sexy', 'sir', 'sms',
       'special', 'st', 'stop', 'tc', 'text', 'tone', 'tones', 'txt', 'ur',
       'urgent', 'use', 'video', 'waiting', 'wanna', 'weekly', 'win', 'word',
       'xmas'],
      dtype='object')

In [49]:
print('Мои: ')
print(sorted(np.array(res_rfc.index[-30:])))
print(sorted(np.array(res_wrap)))
print(sorted(np.array(res_filt.index[:30])))
print('Библиотечные: ')
print(sorted(np.array(res_fish)))
print(sorted(np.array(res_rec.columns)))
print(sorted(np.array(res_emb.columns[:30])))

Мои: 
['another', 'anyway', 'boy', 'dad', 'den', 'dinner', 'dis', 'eat', 'face', 'fine', 'fuck', 'goes', 'haha', 'lei', 'lunch', 'making', 'maybe', 'might', 'office', 'pain', 'remember', 'room', 'shes', 'shit', 'thank', 'thanx', 'though', 'wake', 'watching', 'went']
['box', 'call', 'cash', 'chat', 'claim', 'code', 'contact', 'customer', 'free', 'guaranteed', 'latest', 'mob', 'mobile', 'new', 'nokia', 'orange', 'per', 'play', 'po', 'pobox', 'prize', 'reply', 'ringtone', 'service', 'stop', 'text', 'tone', 'txt', 'ur', 'win']
['able', 'came', 'class', 'coming', 'day', 'doesnt', 'early', 'go', 'got', 'hi', 'holiday', 'join', 'later', 'ltgt', 'might', 'minutes', 'much', 'name', 'plz', 'run', 'saw', 'sleep', 'st', 'theres', 'todays', 'update', 'went', 'whats', 'yeah', 'yo']
Библиотечные: 
['around', 'best', 'coming', 'da', 'entry', 'god', 'haf', 'id', 'ill', 'kiss', 'later', 'latest', 'liao', 'line', 'may', 'min', 'much', 'orange', 'send', 'sexy', 'shit', 'sir', 'sleep', 'st', 'stop', 'true'

In [50]:
classifiers = {
    'LogisticRegression': LogisticRegression(), 
    'Knn': KNeighborsClassifier(n_neighbors=40), 
    'RandomForestClassifier': RandomForestClassifier()
}

X_train_in_emb = pd.DataFrame(X_train, columns = np.array(res_rfc.index[-30:]))
X_train_wrap = pd.DataFrame(X_train, columns = np.array(res_wrap))
X_train_filt = pd.DataFrame(X_train, columns = np.array(res_filt.index[-30:-1])[::-1])
X_train_fish = pd.DataFrame(X_train, columns = np.array(res_fish))
X_train_rec = pd.DataFrame(X_train, columns = np.array(res_rec.columns))
X_train_res_emb = pd.DataFrame(X_train, columns = np.array(res_emb.columns))

X_test_in_emb = pd.DataFrame(X_test, columns = np.array(res_rfc.index[-30:]))
X_test_wrap = pd.DataFrame(X_test, columns = np.array(res_wrap))
X_test_filt = pd.DataFrame(X_test, columns = np.array(res_filt.index[-30:-1])[::-1])
X_test_fish = pd.DataFrame(X_test, columns = np.array(res_fish))
X_test_rec = pd.DataFrame(X_test, columns = np.array(res_rec.columns))
X_test_res_emb = pd.DataFrame(X_test, columns = np.array(res_emb.columns))

In [51]:
def print_res(c_name, classifier):
    print(c_name)
    
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test) 
    print('До выбора признаков: ', accuracy_score(y_test, y_pred))
    
    print('После выбора признаков')
    
    classifier.fit(X_train_in_emb, y_train)
    y_pred = classifier.predict(X_test_in_emb) 
    print('Random Forest: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_wrap, y_train)
    y_pred = classifier.predict(X_test_wrap) 
    print('Wrap (sequential): ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_filt, y_train)
    y_pred = classifier.predict(X_test_filt) 
    print('Correlation: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_fish, y_train)
    y_pred = classifier.predict(X_test_fish) 
    print('Фишер: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_rec, y_train)
    y_pred = classifier.predict(X_test_rec) 
    print('Рекурсивное исключение: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_res_emb, y_train)
    y_pred = classifier.predict(X_test_res_emb) 
    print('Регуляризация: ', accuracy_score(y_test, y_pred))
    
    print()
    

In [52]:
for cn, cl in classifiers.items():
    print_res(cn, cl)

LogisticRegression
До выбора признаков:  0.9626704953338119
После выбора признаков
Random Forest:  0.8557071069633884
Wrap (sequential):  0.927494615936827
Correlation:  0.8987796123474515
Фишер:  0.867910983488873
Рекурсивное исключение:  0.9310839913854989
Регуляризация:  0.9540559942569993

Knn
До выбора признаков:  0.8557071069633884
После выбора признаков
Random Forest:  0.8557071069633884
Wrap (sequential):  0.9059583632447954
Correlation:  0.9030868628858578
Фишер:  0.8743718592964824
Рекурсивное исключение:  0.9023689877961235
Регуляризация:  0.8650394831299354

RandomForestClassifier
До выбора признаков:  0.9676956209619526
После выбора признаков
Random Forest:  0.8535534816941852
Wrap (sequential):  0.949748743718593
Correlation:  0.9152907394113424
Фишер:  0.8851399856424982
Рекурсивное исключение:  0.9526202440775305
Регуляризация:  0.9619526202440776

