# 특성 선택을 사용한 차원축소

### 분산을 기준으로 수치 특성 선택하기

In [1]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

# Load the iris data set and split it into the feature matrix and the labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Filter the columns with a variance threshold of 0.5.
thresholder = VarianceThreshold(threshold=0.5)
features_high_variance = thresholder.fit_transform(features)

# Show the first three rows of the reduced feature matrix.
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [2]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column.
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Inspect the per-feature variances of the standardized data. As the cell
# output shows, they are all 1, so a variance threshold cannot separate
# features once they have been standardized.
selector = VarianceThreshold()
selector.fit(features_std)
selector.variances_

array([1., 1., 1., 1.])

### 분산을 기준으로 이진 특성 선택하기

In [10]:
from sklearn.feature_selection import VarianceThreshold

# Five observations of three binary (0/1) features.
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# A binary feature with P(1) = p has variance p * (1 - p); the threshold is
# that variance evaluated at p = 0.75.
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

In [11]:
# Per-feature variances stored on the thresholder fitted in the previous cell.
thresholder.variances_

array([0.16, 0.16, 0.24])

### 상관관계가 큰 특성 다루기

In [12]:
import numpy as np
import pandas as pd

# Nine observations of three features; the first two columns are almost
# identical, the third simply alternates between 1 and 0.
features = np.array(
    [
        [1, 1, 1],
        [2, 2, 0],
        [3, 3, 1],
        [4, 4, 0],
        [5, 5, 1],
        [6, 6, 0],
        [7, 7, 1],
        [8, 7, 0],
        [9, 7, 1],
    ]
)

# Wrap the matrix in a DataFrame (default integer column labels 0, 1, 2).
dataframe = pd.DataFrame(data=features)

In [13]:
# Display the DataFrame built in the previous cell.
dataframe

Unnamed: 0,0,1,2
0,1,1,1
1,2,2,0
2,3,3,1
3,4,4,0
4,5,5,1
5,6,6,0
6,7,7,1
7,8,7,0
8,9,7,1


In [16]:
# Absolute pairwise correlations between the columns.
corr_matrix = dataframe.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so that each
# pair of columns is inspected exactly once; every other cell becomes NaN.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Column labels that correlate above 0.95 with any earlier column.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop by label: `to_drop` holds column labels, not positions, so this also
# works when the columns are not labelled 0..n-1.
dataframe.drop(columns=to_drop)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1
3,4,0
4,5,1
5,6,0
6,7,1
7,8,0
8,9,1


In [17]:
# Signed correlation matrix of the columns, shown for reference.
dataframe.corr()

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,-0.034503
2,0.0,-0.034503,1.0


In [18]:
# Upper-triangle view of the absolute correlations; masked cells show as NaN.
upper

Unnamed: 0,0,1,2
0,,0.976103,0.0
1,,,0.034503
2,,,


### 분류 작업에 관련 없는 특성 제거하기

In [19]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2, f_classif

# Load iris and cast the measurements to integers (the fractional part is
# dropped) before scoring them with chi2.
# NOTE(review): presumably done to treat the features as counts/categories
# for the chi-square test - confirm.
iris = load_iris()
features = iris.data.astype(int)
target = iris.target

# Keep the two features with the highest chi-square score.
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit_transform(features, target)

# Report the number of columns before and after selection.
print('원본 특성 개수:', features.shape[1])
print('줄어든 특성 개수:', features_kbest.shape[1])

원본 특성 개수: 4
줄어든 특성 개수: 2


In [20]:
# Score the features with f_classif instead and again keep the best two.
# `features` is whatever the preceding cell left behind (the integer-cast
# iris matrix when the cells are run in order).
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
features_kbest = fvalue_selector.fit_transform(features, target)

In [21]:
# Report the number of columns before and after the f_classif selection.
print('원본 특성 개수:',features.shape[1])
print('줄어든 특성 개수:',features_kbest.shape[1])

원본 특성 개수: 4
줄어든 특성 개수: 2


In [23]:
from sklearn.feature_selection import SelectPercentile

# Keep the top 75 percent of the features ranked by their f_classif score.
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, target)

# Report the number of columns before and after selection.
print('원본 특성 개수:', features.shape[1])
print('줄어든 특성 개수:', features_kbest.shape[1])

원본 특성 개수: 4
줄어든 특성 개수: 3


In [24]:
# Display the class labels; the output shows them sorted by class (0, 1, 2).
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [25]:
# Observed per-class column totals. The reshape treats the rows as 3 groups of
# 50 consecutive samples with 4 features each, then sums within each group.
observed = features.reshape(3, 50, 4).sum(axis=1)
observed

array([[230, 152,  50,   0],
       [274, 116, 191,  50],
       [304, 129, 255,  79]])

In [26]:
# Expected per-class total for each feature: the overall column sum split
# evenly across the three groups.
expected = np.sum(features, axis=0) / 3
expected

array([269.33333333, 132.33333333, 165.33333333,  43.        ])

In [27]:
# Chi-square statistic per feature: sum over groups of
# (observed - expected)^2 / expected.
((observed - expected) ** 2 / expected).sum(axis=0)

array([ 10.28712871,   5.02267003, 133.06854839,  74.27906977])

In [29]:
# Scores stored on the fitted chi2 selector; the output equals the statistic
# computed by hand in the preceding cell.
chi2_selector.scores_

array([ 10.28712871,   5.02267003, 133.06854839,  74.27906977])

In [30]:
# Overall mean of every feature column.
total_mean = features.mean(axis=0)

In [32]:
# Display the overall per-feature means.
total_mean

array([5.38666667, 2.64666667, 3.30666667, 0.86      ])

In [33]:
# Per-group feature means, using the same 3 x 50 x 4 grouping of the rows.
class_mean = features.reshape(3, 50, 4).mean(axis=1)
class_mean

array([[4.6 , 3.04, 1.  , 0.  ],
       [5.48, 2.32, 3.82, 1.  ],
       [6.08, 2.58, 5.1 , 1.58]])

In [35]:
# Between-group sum of squares: squared distance of each group mean from the
# overall mean, weighted by the group size of 50.
ss_between = (50 * (class_mean - total_mean) ** 2).sum(axis=0)
ss_between

array([ 55.41333333,  13.29333333, 440.01333333,  63.88      ])

In [37]:
# Total sum of squares of every feature around its overall mean.
ss_total = ((features - total_mean) ** 2).sum(axis=0)
ss_total

array([105.57333333,  42.27333333, 467.89333333,  76.06      ])

In [38]:
# F statistic per feature: between-group mean square (3 - 1 degrees of
# freedom) divided by within-group mean square (150 - 3 degrees of freedom),
# where the within-group sum of squares is ss_total - ss_between.
f = (ss_between / (3 - 1)) / ((ss_total - ss_between) / (150 - 3))
f

array([  81.19776715,   33.71497585, 1160.00645624,  385.48275862])

In [39]:
# Scores stored on the fitted f_classif selector; per the output (float32)
# they agree with the hand-computed F values up to rounding.
fvalue_selector.scores_

array([  81.19715 ,   33.715004, 1160.0116  ,  385.483   ], dtype=float32)

### 재귀적 특성 제거하기

In [40]:
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

# Synthetic regression problem: 10,000 samples and 100 features, of which
# only 2 are informative; the seed is fixed for reproducibility.
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# Ordinary least squares model used as the estimator inside the elimination.
ols = linear_model.LinearRegression()

# Recursive feature elimination with cross-validation, removing one feature
# per step and scoring candidates by negative mean squared error.
rfecv = RFECV(estimator=ols, step=1, scoring='neg_mean_squared_error')
rfecv.fit(features, target)

# Reduce the feature matrix to the selected columns.
rfecv.transform(features)

array([[ 0.00850799,  0.7031277 ],
       [-1.07500204,  2.56148527],
       [ 1.37940721, -1.77039484],
       ...,
       [-0.80331656, -1.60648007],
       [ 0.39508844, -1.34564911],
       [-0.55383035,  0.82880112]])

In [41]:
# Number of features kept by the fitted RFECV (2 in the output).
rfecv.n_features_

2

In [42]:
# Boolean mask over the 100 input features; True marks a kept feature.
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [43]:
# Ranking of every feature; in the output the two kept features have rank 1.
rfecv.ranking_

array([71, 47, 66, 26, 14,  1, 18,  3, 45, 72, 74, 29,  5, 10, 37, 97, 16,
       83, 73, 11, 99,  7, 57, 63, 89, 49, 96, 43, 55, 46,  9, 75, 30, 94,
       70, 23, 44, 68, 39,  1, 80, 95, 60, 64, 61, 58, 42, 91, 77, 51, 34,
       62, 69, 90, 53, 38, 88, 93, 36, 20, 85, 21, 15, 79, 52, 54, 92, 84,
       76, 98,  4, 82, 41,  2, 65, 81, 48, 40, 13, 56,  8, 32, 25, 19,  6,
       35, 31, 78, 59, 12, 33, 17, 28, 27, 50, 22, 67, 24, 86, 87])