In [1]:
import sklearn as sk
import numpy as np

In [2]:
%%time
from sklearn.datasets import fetch_rcv1
rcv_train = fetch_rcv1(subset="train")
rcv_test = fetch_rcv1(subset="test")
X_train = rcv_train.data
y_train = rcv_train.target
X_test = rcv_test.data
y_test = rcv_test.target

# Ont-Hot-Encoding된 라벨을 정수형으로 복원
classes = np.arange(rcv_train.target.shape[1])
y_train = y_train.dot(classes)
y_test = y_test.dot(classes)

print(X_train.shape)

(23149, 47236)
Wall time: 21.4 s


In [None]:
# FIXME: incomplete statement left in a never-executed cell; as written it is a
# SyntaxError and stops "Restart & Run All". Complete the import or delete the cell.
# from sklearn

## 분산에 의한 선택

In [3]:
from sklearn.feature_selection import VarianceThreshold

# Features whose variance is below the threshold are dropped.
selector = VarianceThreshold(threshold=1e-5)
selector.fit(X_train)
X_train_sel = selector.transform(X_train)
X_test_sel = selector.transform(X_test)  # reuse the selection learned on the training data
X_train_sel.shape

(23149, 14330)

In [4]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

In [5]:
%%time
model = BernoulliNB()
model.fit(X_train, y_train)
print('train accuracy : {:5.3f}'.format(accuracy_score(y_train, model.predict(X_train))))
print('test accuracy : {:5.3f}'.format(accuracy_score(y_test, model.predict(X_test))))

train accuracy : 0.381
test accuracy : 0.324
Wall time: 48.9 s


In [6]:
%%time
model = BernoulliNB()
model.fit(X_train_sel, y_train)
print('train accuracy : {:5.3f}'.format(accuracy_score(y_train, model.predict(X_train_sel))))
print('test accuracy : {:5.3f}'.format(accuracy_score(y_test, model.predict(X_test_sel))))

train accuracy : 0.529
test accuracy : 0.441
Wall time: 39.5 s


## 단일 변수 선택

In [7]:
from sklearn.feature_selection import chi2, SelectKBest

In [9]:
%%time

selector1 = SelectKBest(chi2, k=14330)
X_train1 = selector1.fit_transform(X_train, y_train)
X_test1 = selector1.transform(X_test)

model = BernoulliNB()
model.fit(X_train1, y_train)
print("train accuracy:{:5.3f}".format(accuracy_score(y_train, model.predict(X_train1))))
print("test accuracy :{:5.3f}".format(accuracy_score(y_test, model.predict(X_test1))))

train accuracy:0.505
test accuracy :0.438
Wall time: 33.6 s


## 다른 모형을 이용한 특성 중요도 계산

In [13]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

In [14]:
%%time
n_sample = 10000
idx = np.random.choice(range(len(y_train)), n_sample)
model_sel = ExtraTreesClassifier(n_estimators=50).fit(X_train[idx, :], y_train[idx]) #무작위로 특성선택
selector = SelectFromModel(model_sel, prefit=True, max_features=14330)
X_train_sel = selector.transform(X_train)
X_test_sel = selector.transform(X_test)

Wall time: 55.8 s


In [15]:
%%time
model=BernoulliNB()
model.fit(X_train_sel, y_train)
print("train accuracy:{:5.3f}".format(accuracy_score(y_train, model.predict(X_train_sel))))
print("test accuracy :{:5.3f}".format(accuracy_score(y_test, model.predict(X_test_sel))))

train accuracy:0.603
test accuracy :0.491
Wall time: 30.8 s
