In [106]:
import re
import string
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from nltk.corpus import stopwords
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from skfeature.function.similarity_based import fisher_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import chi2, SelectKBest, SequentialFeatureSelector, SelectFromModel

In [107]:
df = pd.read_csv('data/SMS.tsv', sep='\t')
df.head(10)

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [114]:
# refactor rename

df['class'] = pd.factorize(df['class'])[0]
text = []
for s in df['text']:
    s = s.translate(str.maketrans('', '', string.punctuation))
    s = re.sub('\s+', ' ', s)
    s = re.sub('\d+', '', s)
    text.append(s)
    
n = 400
vectorizer = TfidfVectorizer(max_features=n, stop_words=stopwords.words('english'))
# X = vectorizer.fit_transform(text)
X = pd.DataFrame(vectorizer.fit_transform(text).toarray(), columns=vectorizer.get_feature_names_out())
X_train, X_test, y_train, y_test = train_test_split(X, df['class'], test_size=0.25, random_state=0)

Встроенный

In [115]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [142]:
# coef = pd.Series(rf.feature_importances_, X_train.columns).sort_values(ascending=False)

# rename
coef_emb = pd.Series(rf.feature_importances_, X_train.columns).sort_values()
coef_emb

though    7.417976e-07
wif       9.591236e-07
shes      1.193961e-06
watch     1.381272e-06
haha      1.789480e-06
              ...     
mobile    3.479455e-02
claim     3.646469e-02
free      3.967451e-02
txt       5.944768e-02
call      7.850275e-02
Length: 400, dtype: float64

обёртка

In [118]:
lr = LogisticRegression()

nof = 30
coef_wrap = []
X_train_seq = X_train
while len(coef_wrap) < nof:
    lr.fit(X_train_seq, y_train)
    model_fi = permutation_importance(lr, X_train_seq, y_train)
    max_value = max(model_fi['importances_mean'])
    max_index = np.argmax(model_fi['importances_mean'])
    print('Step ', (len(coef_wrap) + 1), ', added ', X_train_seq.columns[max_index])
    coef_wrap.append(X_train_seq.columns[max_index])
    X_train_seq = X_train_seq.drop(X_train_seq.columns[max_index], axis=1)

Step  1 , added  call
Step  2 , added  free
Step  3 , added  mobile
Step  4 , added  txt
Step  5 , added  stop
Step  6 , added  text
Step  7 , added  claim
Step  8 , added  reply
Step  9 , added  per
Step  10 , added  ur
Step  11 , added  service
Step  12 , added  new
Step  13 , added  prize
Step  14 , added  win
Step  15 , added  customer
Step  16 , added  code
Step  17 , added  cash
Step  18 , added  account
Step  19 , added  sms
Step  20 , added  contact
Step  21 , added  guaranteed
Step  22 , added  nokia
Step  23 , added  tone
Step  24 , added  shows
Step  25 , added  urgent
Step  26 , added  pobox
Step  27 , added  mob
Step  28 , added  ltgt
Step  29 , added  ringtone
Step  30 , added  mins


фильтр

In [123]:
# rename

X_train_filter = X_train
# corr = X_train_filter.corr(method='pearson')
corr_class = X_train_filter.corr(method='pearson')['class']
coef_filt = corr_class.sort_values()
coef_filt

ur       -0.018241
free     -0.015835
call     -0.015149
like     -0.014854
send     -0.014156
            ...   
yo        0.040357
go        0.048681
todays    0.051914
coming    0.060509
class     1.000000
Name: class, Length: 400, dtype: float64

In [126]:
# refactor rename

print('Встроенный: ', np.array(coef_emb.index[-30:])[::-1])
print('Oбертка: ', np.array(coef_wrap[-30:]))
print('Фильтр: ', np.array(coef_filt.index[-30:-1])[::-1])

Встроенный:  ['call' 'txt' 'free' 'claim' 'mobile' 'stop' 'text' 'prize' 'reply' 'win'
 'service' 'cash' 'guaranteed' 'urgent' 'nokia' 'ppm' 'new' 'per' 'chat'
 'tones' 'contact' 'tone' 'customer' 'send' 'ur' 'box' 'ringtone' 'sms'
 'video' 'code']
Oбертка:  ['call' 'free' 'mobile' 'txt' 'stop' 'text' 'claim' 'reply' 'per' 'ur'
 'service' 'new' 'prize' 'win' 'customer' 'code' 'cash' 'account' 'sms'
 'contact' 'guaranteed' 'nokia' 'tone' 'shows' 'urgent' 'pobox' 'mob'
 'ltgt' 'ringtone' 'mins']
Фильтр:  ['coming' 'todays' 'go' 'yo' 'done' 'saw' 'able' 'st' 'says' 'theres'
 'got' 'run' 'though' 'yeah' 'day' 'join' 'whats' 'might' 'early' 'gonna'
 'tomorrow' 'name' 'sure' 'hi' 'wanna' 'sorry' 'much' 'holiday' 'time']


Фильтр (фишер)

In [127]:
X_train_fish = X_train
ranks = fisher_score.fisher_score(X_train_fish.to_numpy(), y_train.to_numpy())
feature_importances = pd.Series(ranks, X_train_fish.columns)
res_fish = feature_importances.sort_values(ascending=True)[:30].index
res_fish

Index(['liao', 'real', 'school', 'let', 'well', 'yesterday', 'xmas',
       'watching', 'girl', 'ive', 'food', 'cos', 'sleep', 'told', 'start',
       'office', 'home', 'cost', 'sms', 'could', 'really', 'sweet', 'anything',
       'tonight', 'reply', 'stuff', 'lar', 'run', 'fine', 'back'],
      dtype='object')

Обертка (рекурсивное исключение)

In [130]:
X_train_rec = X_train
lr = LogisticRegression(class_weight = 'balanced', solver = 'lbfgs', random_state=42, n_jobs=-1, max_iter=50)
rfe = RFE(lr, n_features_to_select=30)
rfe = rfe.fit(X_train_rec, y_train)

In [131]:
res_rec = X_train_rec.iloc[ : ,rfe.get_support()]
res_rec.columns

Index(['box', 'call', 'cash', 'chat', 'claim', 'code', 'collect', 'contact',
       'cost', 'free', 'gift', 'ill', 'important', 'landline', 'mobile', 'new',
       'pmin', 'prize', 'reply', 'ringtone', 'send', 'service', 'sexy', 'stop',
       'text', 'tone', 'tones', 'txt', 'urgent', 'win'],
      dtype='object')

Встроенный (регуляризация)

In [134]:
# refactor

X_train_emb = X_train
selector = SelectFromModel(LogisticRegression(C=0.25, penalty='l1',solver='liblinear', random_state=10))
selector.fit(X_train_emb, y_train)
X_emb = X_train_emb.iloc[ : ,selector.get_support()]
X_emb.columns

Index(['account', 'box', 'call', 'cash', 'chat', 'claim', 'code', 'collect',
       'contact', 'cost', 'customer', 'find', 'free', 'ill', 'im', 'landline',
       'later', 'ltgt', 'message', 'mins', 'mobile', 'new', 'nokia', 'ok',
       'per', 'ppm', 'prize', 'receive', 'reply', 'ringtone', 'send',
       'service', 'stop', 'text', 'tone', 'tones', 'txt', 'ur', 'urgent',
       'win'],
      dtype='object')

In [135]:
print('Мои: ')
print(sorted(np.array(coef_emb.index[-30:])))
print(sorted(np.array(coef_wrap)))
print(sorted(np.array(coef_filt.index[-30:-1])[::-1]))
print('Библиотечные: ')
print(sorted(np.array(res_fish)))
print(sorted(np.array(res_rec.columns)))
print(sorted(np.array(X_emb.columns)))

Мои: 
['box', 'call', 'cash', 'chat', 'claim', 'code', 'contact', 'customer', 'free', 'guaranteed', 'mobile', 'new', 'nokia', 'per', 'ppm', 'prize', 'reply', 'ringtone', 'send', 'service', 'sms', 'stop', 'text', 'tone', 'tones', 'txt', 'ur', 'urgent', 'video', 'win']
['account', 'call', 'cash', 'claim', 'code', 'contact', 'customer', 'free', 'guaranteed', 'ltgt', 'mins', 'mob', 'mobile', 'new', 'nokia', 'per', 'pobox', 'prize', 'reply', 'ringtone', 'service', 'shows', 'sms', 'stop', 'text', 'tone', 'txt', 'ur', 'urgent', 'win']
['able', 'coming', 'day', 'done', 'early', 'go', 'gonna', 'got', 'hi', 'holiday', 'join', 'might', 'much', 'name', 'run', 'saw', 'says', 'sorry', 'st', 'sure', 'theres', 'though', 'time', 'todays', 'tomorrow', 'wanna', 'whats', 'yeah', 'yo']
Библиотечные: 
['anything', 'back', 'cos', 'cost', 'could', 'fine', 'food', 'girl', 'home', 'ive', 'lar', 'let', 'liao', 'office', 'real', 'really', 'reply', 'run', 'school', 'sleep', 'sms', 'start', 'stuff', 'sweet', 'told'

In [136]:
classifiers = {
    'LogisticRegression': LogisticRegression(), 
    'Knn': KNeighborsClassifier(n_neighbors=100), 
    'RandomForestClassifier': RandomForestClassifier()
}

X_train_in_emb = pd.DataFrame(X_train, columns = np.array(coef_emb.index[-30:]))
X_train_wrap = pd.DataFrame(X_train, columns = np.array(coef_wrap))
X_train_filt = pd.DataFrame(X_train, columns = np.array(coef_filt.index[-30:-1])[::-1])
X_train_fish = pd.DataFrame(X_train, columns = np.array(res_fish))
X_train_rec = pd.DataFrame(X_train, columns = np.array(res_rec.columns))
X_train_X_emb = pd.DataFrame(X_train, columns = np.array(X_emb.columns))

X_test_in_emb = pd.DataFrame(X_test, columns = np.array(coef_emb.index[-30:]))
X_test_wrap = pd.DataFrame(X_test, columns = np.array(coef_wrap))
X_test_filt = pd.DataFrame(X_test, columns = np.array(coef_filt.index[-30:-1])[::-1])
X_test_fish = pd.DataFrame(X_test, columns = np.array(res_fish))
X_test_rec = pd.DataFrame(X_test, columns = np.array(res_rec.columns))
X_test_X_emb = pd.DataFrame(X_test, columns = np.array(X_emb.columns))

In [137]:
def print_res(c_name, classifier):
    print(c_name)
    
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test) 
    print('До выбора признаков: ', accuracy_score(y_test, y_pred))
    
    print('После выбора признаков')
    
    classifier.fit(X_train_in_emb, y_train)
    y_pred = classifier.predict(X_test_in_emb) 
    print('Random Forest: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_wrap, y_train)
    y_pred = classifier.predict(X_test_wrap) 
    print('Wrap (sequential): ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_filt, y_train)
    y_pred = classifier.predict(X_test_filt) 
    print('Correlation: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_fish, y_train)
    y_pred = classifier.predict(X_test_fish) 
    print('Фишер: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_rec, y_train)
    y_pred = classifier.predict(X_test_rec) 
    print('Рекурсивное исключение: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_X_emb, y_train)
    y_pred = classifier.predict(X_test_X_emb) 
    print('Регуляризация: ', accuracy_score(y_test, y_pred))
    
    print()
    

In [140]:
for cn, cl in classifiers.items():
    print_res(cn, cl)

LogisticRegression
До выбора признаков:  0.9734386216798278
После выбора признаков
Random Forest:  0.9490308686288585
Wrap (sequential):  0.9483129935391242
Correlation:  0.8664752333094041
Фишер:  0.867910983488873
Рекурсивное исключение:  0.9454414931801867
Регуляризация:  0.955491744436468

Knn
До выбора признаков:  0.8671931083991385
После выбора признаков
Random Forest:  0.8837042354630295
Wrap (sequential):  0.8901651112706389
Correlation:  0.8671931083991385
Фишер:  0.87724335965542
Рекурсивное исключение:  0.8937544867193108
Регуляризация:  0.8786791098348887

RandomForestClassifier
До выбора признаков:  0.9791816223977028
После выбора признаков
Random Forest:  0.9641062455132807
Wrap (sequential):  0.9619526202440776
Correlation:  0.8793969849246231
Фишер:  0.8793969849246231
Рекурсивное исключение:  0.9691313711414213
Регуляризация:  0.9676956209619526

