In [9]:
import re
import string
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.feature_extraction.text import TfidfVectorizer
from skfeature.function.similarity_based import fisher_score
from sklearn.feature_selection import chi2, SelectKBest, SequentialFeatureSelector, SelectFromModel

In [10]:
# Read the tab-separated SMS dataset and preview its first ten rows.
df = pd.read_csv('data/SMS.tsv', delimiter='\t')
df.head(n=10)

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [11]:
def ref_str(s: str) -> str:
    """Normalise a raw message before it is vectorised.

    Removes every run of digits, strips all characters listed in
    ``string.punctuation`` and collapses each run of whitespace into a single
    space. Letter case is left untouched and the result is not trimmed, so a
    leading or trailing space may remain.

    Args:
        s: Raw message text.

    Returns:
        The cleaned text.
    """
    # Raw string literals: '\d' and '\s' inside a plain literal are invalid
    # escape sequences and emit a SyntaxWarning on recent Python versions.
    s = re.sub(r'\d+', '', s)
    s = s.translate(str.maketrans('', '', string.punctuation))
    s = re.sub(r'\s+', ' ', s)
    return s

# Encode the labels as integers; codes follow the order in which each label
# first appears in the column.
df['class'] = pd.factorize(df['class'])[0]
# Clean every message with ref_str before vectorising.
text = [ref_str(s) for s in df['text']]

n = 100  # vocabulary size passed to the vectorizer as max_features
vectorizer = TfidfVectorizer(max_features=n, stop_words=stopwords.words('english'))
# NOTE(review): the vectorizer is fitted on the whole corpus before the split,
# so the vocabulary and IDF weights also see the test messages — consider
# fitting on the training part only.
X = pd.DataFrame(vectorizer.fit_transform(text).toarray(), columns=vectorizer.get_feature_names_out())
# A fixed seed gives the same split on every Restart & Run All; stratifying
# keeps the class ratio equal in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, df['class'], random_state=9, stratify=df['class']
)

Встроенный метод (важность признаков случайного леса)

In [12]:
# Embedded selection: fit a random forest on all features so that its
# feature_importances_ can be used as a ranking. The seed is fixed because an
# unseeded forest yields a different ranking on every run.
rf = RandomForestClassifier(random_state=9)
rf.fit(X_train, y_train)

In [13]:
# Label the forest's importances with the feature names and order them from
# the most to the least important.
rf_importances = pd.Series(data=rf.feature_importances_, index=X_train.columns)
res_rfc = rf_importances.sort_values(ascending=False)
res_rfc

call        0.154749
txt         0.086333
free        0.078856
claim       0.064593
mobile      0.046757
              ...   
cos         0.000445
say         0.000269
anything    0.000181
lol         0.000135
yeah        0.000064
Length: 100, dtype: float64

Обёртка (последовательный отбор по перестановочной важности)

In [None]:
# Wrapper selection: repeatedly fit a logistic regression on the features that
# are still in the pool, pick the one with the highest mean permutation
# importance, record it and remove it from the pool.
lr = LogisticRegression()

nof = 30  # number of features to select
res_wrap = []
X_train_seq = X_train
# Never ask for more features than exist; otherwise the loop would eventually
# try to fit the model on a frame with no columns.
n_steps = min(nof, X_train.shape[1])
while len(res_wrap) < n_steps:
    lr.fit(X_train_seq, y_train)
    # Seeded so that the shuffles, and therefore the selected features, are
    # the same on every run.
    model_fi = permutation_importance(lr, X_train_seq, y_train, random_state=9)
    best_feature = X_train_seq.columns[np.argmax(model_fi['importances_mean'])]
    print('Step ', (len(res_wrap) + 1), ', added ', best_feature)
    res_wrap.append(best_feature)
    # drop() returns a new frame, so X_train itself is never modified.
    X_train_seq = X_train_seq.drop(columns=best_feature)

Step  1 , added  call
Step  2 , added  claim
Step  3 , added  prize
Step  4 , added  mobile


Фильтр (корреляция Пирсона)

In [None]:
X_train_filter = X_train
# X_train holds only the TF-IDF features, so there is no 'class' column to
# index into a correlation matrix; correlate every feature with the label
# vector directly instead.
corr = X_train_filter.corrwith(y_train, method='pearson')
# Order by magnitude (the signed values are kept): a strong negative
# correlation with the label is as informative as a strong positive one.
res_filt = corr.loc[corr.abs().sort_values(ascending=False).index]
res_filt

In [None]:
# Show the 30 features chosen by each hand-written selector, one line each.
top_embedded = np.array(res_rfc.index[:30])
print('Встроенный: ', top_embedded)
top_wrapper = np.array(res_wrap)
print('Oбертка: ', top_wrapper)
top_filter = np.array(res_filt.index[:30])
print('Фильтр: ', top_filter)

Фильтр (критерий Фишера)

In [None]:
X_train_fish = X_train
ranks = np.asarray(fisher_score.fisher_score(X_train_fish.to_numpy(), y_train.to_numpy()))
# NOTE(review): confirm against the installed skfeature release. The function
# is believed to return either raw Fisher scores (floats, larger is better) or
# feature indices ordered best-first (integers). The previous code sorted the
# values ascending, which selects the wrong features in both cases.
if np.issubdtype(ranks.dtype, np.integer):
    # Treated as an index ranking: the first entries are the best features.
    top_positions = ranks[:30]
else:
    # Treated as raw scores: take the positions of the 30 largest values.
    top_positions = np.argsort(ranks)[::-1][:30]
# Kept for inspection; only meaningful per feature when `ranks` holds scores.
feature_importances = pd.Series(ranks, X_train_fish.columns)
res_fish = X_train_fish.columns[top_positions]
res_fish

Обертка (рекурсивное исключение)

In [None]:
# Recursive feature elimination around a class-weighted logistic regression,
# configured to keep 30 features.
X_train_rec = X_train
lr = LogisticRegression(class_weight='balanced')
rfe = RFE(estimator=lr, n_features_to_select=30).fit(X_train_rec, y_train)

In [None]:
# Keep only the columns flagged by the fitted RFE selector and list their names.
rfe_mask = rfe.get_support()
res_rec = X_train_rec.loc[:, rfe_mask]
res_rec.columns

Встроенный (регуляризация)

In [None]:
X_train_emb = X_train
# Embedded selection through L1 regularisation. max_features caps the result
# at the 30 strongest coefficients; without it the selector may keep more, and
# slicing the result with columns[:30] afterwards would keep whichever 30 come
# first in column order rather than the most important ones.
select = SelectFromModel(
    LogisticRegression(penalty='l1', solver='liblinear', random_state=9),
    max_features=30,
)
select.fit(X_train_emb, y_train)
res_emb = X_train_emb.iloc[:, select.get_support()]
res_emb.columns

In [None]:
# Alphabetically sorted feature lists, to compare the selectors side by side.
print('Мои: ')
# res_rfc is sorted from most to least important, so the best 30 features are
# the FIRST 30 entries ([-30:] would list the 30 weakest ones).
print(sorted(res_rfc.index[:30]))
print(sorted(res_wrap))
print(sorted(res_filt.index[:30]))
print('Библиотечные: ')
print(sorted(res_fish))
print(sorted(res_rec.columns))
print(sorted(res_emb.columns[:30]))

In [None]:
# Models used to compare the feature subsets. The forest is seeded so that its
# accuracy is the same on every run.
classifiers = {
    'LogisticRegression': LogisticRegression(),
    'Knn': KNeighborsClassifier(n_neighbors=40),
    'RandomForestClassifier': RandomForestClassifier(random_state=9),
}

# Column names chosen by each selection method.
# res_rfc is sorted from most to least important, so the best 30 features are
# its first 30 entries ([-30:] would take the 30 weakest ones).
cols_in_emb = list(res_rfc.index[:30])
cols_wrap = list(res_wrap)
cols_filt = list(res_filt.index[:30])
cols_fish = list(res_fish)
cols_rec = list(res_rec.columns)
cols_res_emb = list(res_emb.columns[:30])

# Direct column selection raises KeyError for an unknown name, whereas
# pd.DataFrame(X, columns=...) would silently add an all-NaN column.
# NOTE: X_train_fish and X_train_rec reuse names assigned in earlier cells;
# after this cell they hold the reduced 30-column frames.
X_train_in_emb = X_train[cols_in_emb]
X_train_wrap = X_train[cols_wrap]
X_train_filt = X_train[cols_filt]
X_train_fish = X_train[cols_fish]
X_train_rec = X_train[cols_rec]
X_train_res_emb = X_train[cols_res_emb]

X_test_in_emb = X_test[cols_in_emb]
X_test_wrap = X_test[cols_wrap]
X_test_filt = X_test[cols_filt]
X_test_fish = X_test[cols_fish]
X_test_rec = X_test[cols_rec]
X_test_res_emb = X_test[cols_res_emb]

In [None]:
def _fit_and_score(classifier, train_features, test_features):
    """Refit `classifier` on `train_features` and return its test accuracy."""
    classifier.fit(train_features, y_train)
    return accuracy_score(y_test, classifier.predict(test_features))


def print_res(c_name, classifier):
    """Print the test accuracy of one classifier on every feature subset.

    The classifier is refitted in place for each subset: first on the full
    feature matrix, then on the subset produced by each selection method.
    The frames and labels are read from the notebook's global variables.

    Args:
        c_name: Name printed as the header of the report.
        classifier: Estimator exposing `fit` and `predict`.
    """
    print(c_name)
    print('До выбора признаков: ', _fit_and_score(classifier, X_train, X_test))

    print('После выбора признаков')
    # (label, train frame, test frame) for each selection method, in report order.
    feature_sets = (
        ('Random Forest: ', X_train_in_emb, X_test_in_emb),
        ('Wrap (sequential): ', X_train_wrap, X_test_wrap),
        ('Correlation: ', X_train_filt, X_test_filt),
        ('Фишер: ', X_train_fish, X_test_fish),
        ('Рекурсивное исключение: ', X_train_rec, X_test_rec),
        ('Регуляризация: ', X_train_res_emb, X_test_res_emb),
    )
    for label, train_subset, test_subset in feature_sets:
        print(label, _fit_and_score(classifier, train_subset, test_subset))

    print()
    

In [None]:
# Produce the accuracy report for every configured classifier.
for clf_name in classifiers:
    print_res(clf_name, classifiers[clf_name])