In [2]:
import numpy as np
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import chi2, SelectKBest, SequentialFeatureSelector, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [3]:
ds = pd.read_csv('data/SMS.tsv',sep='\t')
ds.head(5)

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
ds['class'] = pd.factorize(ds['class'])[0] # ham - 0, spam - 1

In [5]:
y = ds['class']

In [6]:
X_text = []
for s in ds['text']:
    s = s.translate(str.maketrans('', '', string.punctuation))
    s = re.sub('\s+', ' ', s)
    s = re.sub('\d+', '', s)
    X_text.append(s)

In [7]:
vectorizer = TfidfVectorizer(max_features=1500, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(X_text)
X = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names_out())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Встроенный

In [8]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [9]:
coef=pd.Series(rf.feature_importances_, X.columns).sort_values(ascending=False)
print(coef)

txt          0.048580
call         0.044621
free         0.037301
claim        0.036681
mobile       0.024495
               ...   
imagine      0.000000
hv           0.000000
housemaid    0.000000
hospital     0.000000
rose         0.000000
Length: 1500, dtype: float64


In [10]:
coef_emb = pd.Series(rf.feature_importances_, X.columns).sort_values()

обёртка

In [11]:
X_train_filt = X_train

while len(X_train_filt.columns) > 100:
    tree = DecisionTreeClassifier()
    tree.fit(X_train_filt, y_train)
    coef=pd.Series(tree.feature_importances_, X_train_filt.columns).sort_values()
    for i in range(10):
        X_train_filt = X_train_filt.drop(coef.index[i], axis=1)  

coef_wrap = coef

In [12]:
coef_wrap

work     1.107771e-07
today    1.246216e-07
lor      1.271118e-07
da       1.306914e-07
one      1.391835e-07
             ...     
free     2.989365e-02
win      3.387877e-02
stop     5.596393e-02
txt      1.483590e-01
call     2.684631e-01
Length: 110, dtype: float64

фильтр

In [13]:
X_train_filter = X_train

corr = X_train_filter.corr(method='pearson')

corr_class = corr['class']
coef_filt = corr_class.sort_values()

In [14]:
coef_filt

ur        -0.017599
free      -0.015134
call      -0.014677
send      -0.013779
love      -0.013229
             ...   
coming     0.085726
whos       0.110732
teach      0.116322
willing    0.117773
class      1.000000
Name: class, Length: 1500, dtype: float64

In [15]:
print('Встроенный метод: ')
print(np.array(coef_emb.index[-30:])[::-1])
print()
print('Метод обертка: ')
print(np.array(coef_wrap.index[-30:])[::-1])
print()
print('Фильтрующий метод: ')
print(np.array(coef_filt.index[-30:-1])[::-1])

Встроенный метод: 
['txt' 'call' 'free' 'claim' 'mobile' 'stop' 'prize' 'reply' 'win' 'text'
 'service' 'urgent' 'ppm' 'nokia' 'contact' 'chat' 'guaranteed' 'customer'
 'box' 'tone' 'tones' 'new' 'ringtone' 'cash' 'mob' 'per' 'cost' 'po'
 'pmin' 'weekly']

Метод обертка: 
['call' 'txt' 'stop' 'win' 'free' 'claim' 'tones' 'pmsg' 'text' 'im' 'get'
 'user' 'admirer' 'tell' 'pmin' 'send' 'go' 'give' 'ringtone' 'per' 'love'
 'service' 'ppm' 'content' 'mob' 'message' 'time' 'new' 'sexy' 'reply']

Фильтрующий метод: 
['willing' 'teach' 'whos' 'coming' 'lr' 'wheres' 'holder' 'five'
 'finished' 'ugh' 'voucher' 'todays' 'hours' 'yo' 'done' 'post' 'thru'
 'theres' 'saw' 'gym' 'says' 'taking' 'college' 'full' 'go' 'water' 'lets'
 'run' 'idea']


Хи-квадрат

In [16]:
X_train_chi2 = X_train
select_chi2 = SelectKBest(chi2, k = 30)
select_chi2.fit_transform(X_train_chi2, y_train)
X_chi2 = X_train_chi2.iloc[ : ,select_chi2.get_support()]

print(X_chi2.columns)

Index(['apply', 'awarded', 'box', 'call', 'cash', 'claim', 'code', 'contact',
       'customer', 'entry', 'free', 'guaranteed', 'landline', 'mobile',
       'nokia', 'per', 'po', 'ppm', 'prize', 'reply', 'ringtone', 'service',
       'stop', 'text', 'tone', 'tones', 'txt', 'urgent', 'weekly', 'win'],
      dtype='object')


sfs

In [None]:
X_train_sfs = X_train
tree = DecisionTreeClassifier()
sfs = SequentialFeatureSelector(tree, n_features_to_select=30)
sfs.fit(X_train_sfs, y_train)
X_sfs = X_train_sfs.iloc[ : ,sfs.get_support()]

print(X_sfs.columns)

Встроенный

In [None]:
X_train_emb = X_train
selector = SelectFromModel(
    LogisticRegression(C=0.25, penalty='l1',solver='liblinear', random_state=10))

selector.fit(X_train_emb, y_train)
X_emb = X_train_emb.iloc[ : ,selector.get_support()]

print(len(X_emb.columns))
print(X_emb.columns)