In [67]:
import numpy as np
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import chi2, SelectKBest, SequentialFeatureSelector, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
from skfeature.function.similarity_based import fisher_score
from sklearn.feature_selection import RFE

In [2]:
ds = pd.read_csv('data/SMS.tsv',sep='\t')
ds.head(5)

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
ds['class'] = pd.factorize(ds['class'])[0] # ham - 0, spam - 1

In [4]:
y = ds['class']

In [5]:
X_text = []
for s in ds['text']:
    s = s.translate(str.maketrans('', '', string.punctuation))
    s = re.sub('\s+', ' ', s)
    s = re.sub('\d+', '', s)
    X_text.append(s)

In [39]:
vectorizer = TfidfVectorizer(max_features=400, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(X_text)
X = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names_out())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

1114    0
3589    0
3095    0
1012    0
3320    0
       ..
4931    1
3264    0
1653    1
2607    0
2732    0
Name: class, Length: 4457, dtype: int64

Встроенный

In [7]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [8]:
coef=pd.Series(rf.feature_importances_, X.columns).sort_values(ascending=False)
print(coef)

call      7.717350e-02
txt       5.760009e-02
free      4.776983e-02
claim     4.060314e-02
mobile    3.559542e-02
              ...     
hair      1.604888e-06
shes      1.433198e-06
dude      6.066155e-07
face      5.481904e-07
called    4.026351e-08
Length: 400, dtype: float64


In [9]:
coef_emb = pd.Series(rf.feature_importances_, X.columns).sort_values()

обёртка

In [29]:
lr = LogisticRegression()
n = 400
# X_train

coef_wrap = []
features_rest = X_train
num_of_features = 30
while len(coef_wrap) < num_of_features:
    lr.fit(features_rest, y_train)
    model_fi = permutation_importance(lr, features_rest, y_train)
    max_value = max(model_fi['importances_mean'])
    max_index = np.argmax(model_fi['importances_mean'])
    print('Step ', (len(coef_wrap) + 1), ', added ', features_rest.columns[max_index])
    coef_wrap.append(features_rest.columns[max_index])
    features_rest = features_rest.drop(features_rest.columns[max_index], axis=1)

Step  30 , added  call
Step  30 , added  free
Step  30 , added  stop
Step  30 , added  text
Step  30 , added  reply
Step  30 , added  mobile
Step  30 , added  claim
Step  30 , added  txt
Step  30 , added  ur
Step  30 , added  service
Step  30 , added  win
Step  30 , added  prize
Step  30 , added  contact
Step  30 , added  per
Step  30 , added  ltgt
Step  30 , added  urgent
Step  30 , added  customer
Step  30 , added  cash
Step  30 , added  guaranteed
Step  30 , added  code
Step  30 , added  shows
Step  30 , added  latest
Step  30 , added  nokia
Step  30 , added  tone
Step  30 , added  new
Step  30 , added  ringtone
Step  30 , added  mins
Step  30 , added  awarded
Step  30 , added  pobox
Step  30 , added  mob


In [10]:
# X_train_filt = X_train

# while len(X_train_filt.columns) > 100:
#     tree = DecisionTreeClassifier()
#     tree.fit(X_train_filt, y_train)
#     coef=pd.Series(tree.feature_importances_, X_train_filt.columns).sort_values()
#     for i in range(10):
#         X_train_filt = X_train_filt.drop(coef.index[i], axis=1)  

# coef_wrap = coef

In [11]:
# coef_wrap

gud       4.028746e-07
also      4.360379e-07
said      4.754627e-07
hey       5.177381e-07
lol       5.274363e-07
              ...     
mobile    3.402037e-02
win       3.614163e-02
stop      4.696544e-02
txt       1.533898e-01
call      2.510207e-01
Length: 110, dtype: float64

фильтр

In [12]:
X_train_filter = X_train

corr = X_train_filter.corr(method='pearson')

corr_class = corr['class']
coef_filt = corr_class.sort_values()

In [13]:
coef_filt

ur       -0.017787
free     -0.015515
call     -0.014843
send     -0.013854
text     -0.013242
            ...   
yo        0.038234
todays    0.045150
go        0.046247
coming    0.057272
class     1.000000
Name: class, Length: 400, dtype: float64

In [14]:
print('Встроенный метод: ')
print(np.array(coef_emb.index[-30:])[::-1])
print()
print('Метод обертка: ')
print(np.array(coef_wrap.index[-30:])[::-1])
print()
print('Фильтрующий метод: ')
print(np.array(coef_filt.index[-30:-1])[::-1])

Встроенный метод: 
['call' 'txt' 'free' 'claim' 'mobile' 'stop' 'prize' 'text' 'win' 'reply'
 'service' 'cash' 'urgent' 'nokia' 'contact' 'chat' 'tone' 'ur' 'box'
 'per' 'customer' 'tones' 'send' 'new' 'ppm' 'ringtone' 'guaranteed' 'sms'
 'apply' 'landline']

Метод обертка: 
['call' 'txt' 'stop' 'win' 'mobile' 'send' 'text' 'free' 'tones' 'reply'
 'claim' 'ur' 'im' 'got' 'get' 'tell' 'sexy' 'pmin' 'later' 'important'
 'contact' 'new' 'per' 'cost' 'time' 'give' 'calls' 'ringtone' 'box' 'ask']

Фильтрующий метод: 
['coming' 'go' 'todays' 'yo' 'done' 'able' 'saw' 'st' 'says' 'theres'
 'got' 'run' 'though' 'yeah' 'day' 'join' 'early' 'might' 'whats'
 'tomorrow' 'minutes' 'gonna' 'hi' 'sure' 'name' 'wanna' 'holiday' 'sorry'
 'us']


In [60]:
X_train_fish = X_train
ranks = fisher_score.fisher_score(X_train_fish.to_numpy(), y_train.to_numpy())
feature_importances = pd.Series(ranks, X_train_fish.columns)

In [85]:
# print(feature_importances.sort_values(ascending=True)[:40])
res_fish = feature_importances.sort_values(ascending=True)[:30].index
res_fish
# print(feature_importances[:40])
# print(X_train_fish)

Index(['juz', 'princess', 'say', 'nice', 'way', 'xmas', 'world', 'whats',
       'get', 'im', 'everything', 'cool', 'shes', 'tmr', 'talk', 'night', 'hi',
       'customer', 'stop', 'cos', 'plus', 'take', 'anyway', 'tomorrow', 'real',
       'something', 'know', 'remember', 'friend', 'bed'],
      dtype='object')

In [69]:
X_train_rec = X_train
lr = LogisticRegression(class_weight = 'balanced', solver = 'lbfgs', random_state=42, n_jobs=-1, max_iter=50)
rfe = RFE(lr, n_features_to_select=30)
rfe = rfe.fit(X_train_rec, y_train)
# y_pred = X_train_rec.iloc[ : ,rfe.get_support()]
# y_pred = rfe.predict(X_train_rec)

In [72]:
res_rec = X_train_rec.iloc[ : ,rfe.get_support()]
print(res_rec.columns)

Index(['box', 'call', 'cash', 'chat', 'claim', 'code', 'collect', 'contact',
       'cost', 'free', 'gift', 'ill', 'important', 'landline', 'mob', 'mobile',
       'pmin', 'prize', 'reply', 'ringtone', 'send', 'service', 'sexy', 'stop',
       'text', 'tone', 'tones', 'txt', 'urgent', 'win'],
      dtype='object')


Хи-квадрат

In [66]:
# X_train_chi2 = X_train
# select_chi2 = SelectKBest(chi2, k = 30)
# select_chi2.fit_transform(X_train_chi2, y_train)
# X_chi2 = X_train_chi2.iloc[ : ,select_chi2.get_support()]

# print(X_chi2.columns)

Index(['awarded', 'box', 'call', 'cash', 'claim', 'code', 'contact',
       'customer', 'free', 'guaranteed', 'landline', 'latest', 'mobile',
       'nokia', 'per', 'po', 'pobox', 'ppm', 'prize', 'reply', 'ringtone',
       'service', 'shows', 'stop', 'text', 'tone', 'tones', 'txt', 'urgent',
       'win'],
      dtype='object')


sfs

In [16]:
# X_train_sfs = X_train
# tree = DecisionTreeClassifier()
# sfs = SequentialFeatureSelector(tree, n_features_to_select=30)
# sfs.fit(X_train_sfs, y_train)
# X_sfs = X_train_sfs.iloc[ : ,sfs.get_support()]

# print(X_sfs.columns)

Index(['call', 'cant', 'chat', 'claim', 'contact', 'dis', 'everything', 'free',
       'good', 'help', 'mobile', 'oso', 'per', 'play', 'pmin', 'ppm', 'quite',
       'reply', 'ringtone', 'service', 'shows', 'sure', 'tell', 'tmr', 'told',
       'tones', 'txt', 'urgent', 'win', 'xmas'],
      dtype='object')


Встроенный

In [17]:
X_train_emb = X_train
selector = SelectFromModel(
    LogisticRegression(C=0.25, penalty='l1',solver='liblinear', random_state=10))

selector.fit(X_train_emb, y_train)
X_emb = X_train_emb.iloc[ : ,selector.get_support()]

print(len(X_emb.columns))
print(X_emb.columns)

44
Index(['account', 'apply', 'box', 'call', 'cash', 'chat', 'claim', 'code',
       'collect', 'contact', 'cost', 'customer', 'find', 'free', 'help', 'ill',
       'im', 'landline', 'later', 'ltgt', 'message', 'mins', 'mobile', 'new',
       'nokia', 'ok', 'per', 'please', 'ppm', 'prize', 'receive', 'reply',
       'ringtone', 'send', 'service', 'shows', 'stop', 'text', 'tone', 'tones',
       'txt', 'ur', 'urgent', 'win'],
      dtype='object')


In [86]:
print('Мои реализации: ')
print(sorted(np.array(coef_emb.index[-30:])))
print(sorted(np.array(coef_wrap)))
print(sorted(np.array(coef_filt.index[-30:-1])[::-1]))
print()
print('Библиотечные: ')
print(sorted(np.array(res_fish)))
print(sorted(np.array(res_rec.columns)))
print(sorted(np.array(X_emb.columns)))

Мои реализации: 
['apply', 'box', 'call', 'cash', 'chat', 'claim', 'contact', 'customer', 'free', 'guaranteed', 'landline', 'mobile', 'new', 'nokia', 'per', 'ppm', 'prize', 'reply', 'ringtone', 'send', 'service', 'sms', 'stop', 'text', 'tone', 'tones', 'txt', 'ur', 'urgent', 'win']
['awarded', 'call', 'cash', 'claim', 'code', 'contact', 'customer', 'free', 'guaranteed', 'latest', 'ltgt', 'mins', 'mob', 'mobile', 'new', 'nokia', 'per', 'pobox', 'prize', 'reply', 'ringtone', 'service', 'shows', 'stop', 'text', 'tone', 'txt', 'ur', 'urgent', 'win']
['able', 'coming', 'day', 'done', 'early', 'go', 'gonna', 'got', 'hi', 'holiday', 'join', 'might', 'minutes', 'name', 'run', 'saw', 'says', 'sorry', 'st', 'sure', 'theres', 'though', 'todays', 'tomorrow', 'us', 'wanna', 'whats', 'yeah', 'yo']

Библиотечные: 
['anyway', 'bed', 'cool', 'cos', 'customer', 'everything', 'friend', 'get', 'hi', 'im', 'juz', 'know', 'nice', 'night', 'plus', 'princess', 'real', 'remember', 'say', 'shes', 'something', '

In [87]:
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(n_neighbors=100),
    RandomForestClassifier()
]

In [96]:
X_train_in_emb = pd.DataFrame(X_train, columns = np.array(coef_emb.index[-30:]))
X_train_wrap = pd.DataFrame(X_train, columns = np.array(coef_wrap))
X_train_filt = pd.DataFrame(X_train, columns = np.array(coef_filt.index[-30:-1])[::-1])
X_train_fish = pd.DataFrame(X_train, columns = np.array(res_fish))
X_train_rec = pd.DataFrame(X_train, columns = np.array(res_rec.columns))
X_train_X_emb = pd.DataFrame(X_train, columns = np.array(X_emb.columns))

X_test_in_emb = pd.DataFrame(X_test, columns = np.array(coef_emb.index[-30:]))
X_test_wrap = pd.DataFrame(X_test, columns = np.array(coef_wrap))
X_test_filt = pd.DataFrame(X_test, columns = np.array(coef_filt.index[-30:-1])[::-1])
X_test_fish = pd.DataFrame(X_test, columns = np.array(res_fish))
X_test_rec = pd.DataFrame(X_test, columns = np.array(res_rec.columns))
X_test_X_emb = pd.DataFrame(X_test, columns = np.array(X_emb.columns))

In [99]:
# X_train_in_emb
# X_train_wrap
# X_train_filt
# X_train_fish
# X_train_rec
# X_train_X_emb

def print_res(classifier):
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test) 
    print('До выбора признаков: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_in_emb, y_train)
    y_pred = classifier.predict(X_test_in_emb) 
    print('После выбора признаков: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_wrap, y_train)
    y_pred = classifier.predict(X_test_wrap) 
    print('После выбора признаков: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_filt, y_train)
    y_pred = classifier.predict(X_test_filt) 
    print('После выбора признаков: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_fish, y_train)
    y_pred = classifier.predict(X_test_fish) 
    print('После выбора признаков: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_rec, y_train)
    y_pred = classifier.predict(X_test_rec) 
    print('После выбора признаков: ', accuracy_score(y_test, y_pred))
    
    classifier.fit(X_train_X_emb, y_train)
    y_pred = classifier.predict(X_test_X_emb) 
    print('После выбора признаков: ', accuracy_score(y_test, y_pred))
    

In [100]:
for cl in classifiers:
    print_res(cl)

До выбора признаков:  0.9766816143497757
После выбора признаков:  0.9443946188340807
После выбора признаков:  0.9461883408071748
После выбора признаков:  0.8565022421524664
После выбора признаков:  0.862780269058296
После выбора признаков:  0.9399103139013453
После выбора признаков:  0.9542600896860987
До выбора признаков:  0.8565022421524664
После выбора признаков:  0.8771300448430494
После выбора признаков:  0.8807174887892377
После выбора признаков:  0.8565022421524664
После выбора признаков:  0.8672645739910314
После выбора признаков:  0.8896860986547085
После выбора признаков:  0.8663677130044843
До выбора признаков:  0.9775784753363229
После выбора признаков:  0.9614349775784753
После выбора признаков:  0.9614349775784753
После выбора признаков:  0.873542600896861
После выбора признаков:  0.8834080717488789
После выбора признаков:  0.9659192825112107
После выбора признаков:  0.968609865470852
