In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

data = pd.read_csv('train.csv')
print(data.shape)

(4459, 4993)


In [11]:
null_cols = [col for col in data.columns if data[col].isnull().sum() > 0]
print(null_cols)

[]


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['ID', 'target'], axis=1),
    data['target'],
    test_size=0.2,
    random_state=2018
)

# Filter Methodでの解析例

In [13]:
# 分散が0（すべて同じ値）のデータは削除します
sel = VarianceThreshold(threshold=0)
sel.fit(X_train)

# get_supportで保持するデータのみをTrue値、そうでないものはFalse値を返します
print(sum(sel.get_support()))


4692


In [None]:
# numpy に変換したい場合はこちら
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

# pandasのまま保持して次の処理を行い場合はこちら
X_train = X_train.loc[:, sel.get_support()]
X_test = X_test.loc[:, sel.get_support()]

In [None]:
分散がほぼ0の場合

In [14]:
sel = VarianceThreshold(threshold=0.1) # 99%が同じデータのもの
sel.fit(X_train)

print(sum(sel.get_support()))

4692


In [None]:
# numpy に変換したい場合はこちら
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

# pandasのまま保持して次の処理を行い場合はこちら
X_train = X_train.loc[:, sel.get_support()]
X_test = X_test.loc[:, sel.get_support()]

In [None]:
特徴量がほかの特徴量と完全に一致している場合

In [15]:
# indexとcolumnsを入れ替える
X_train_T = X_train.T

print(X_train_T.duplicated().sum())

# 同じ特徴量の名前を取得したい場合はこちら
duplicated_features = X_train_T[X_train_T.duplicated()].index.values

print(len(X_train.columns))

307
4991


In [None]:
特徴量の相関係数を使用する

In [18]:
threshold = 0.8

feat_corr = set()
corr_matrix = X_train.corr()
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > threshold:
            feat_name = corr_matrix.columns[i]
            feat_corr.add(feat_name)

print(len(set(feat_corr)))

X_train.drop(labels=feat_corr, axis='columns', inplace=True)
X_test.drop(labels=feat_corr, axis='columns', inplace=True)

print(len(X_train.columns))


KeyboardInterrupt: 

In [None]:
統計的評価指標を用いる

In [None]:
# 使用するデータは直前までのFilter Methodを適用させたものが対象です

print(X_train.shape)
# -> (3567, 4025)
print(X_test.shape)
# -> (892, 4025)

In [None]:
Mutual Infomartion
相関係数を使用して特徴量を削減した手法と似ていますが、より一般的な手法になります。

In [None]:
# 使用するモジュール
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile

MI = mutual_info_regression(X_train, y_train)
MI = pd.Series(MI)
MI.index = X_train.columns
MI.sort_values(ascending=False).plot(kind='bar', figsize=(20,10))

# KBest : 抽出する特徴量の"数"を指定
kbest_sel_ = SelectKBest(mutual_info_regression, k=10)
print(len(kbest_sel_.get_support()))

# Percentile : 抽出する特徴量の割合を指定
percentile_sel_ = SelectPercentile(mutual_info_regression, percentile=10)
print(len(percentile_sel_.get_support()))

In [None]:
カイ2乗、フィッシャー係数

In [None]:
# Santandarのデータには適用できないので、簡単な紹介のみ

from sklearn.feature_selection import chi2

# fisher score
fscore = chi2(X_train.fillna(0), y_train)

In [None]:
ターゲットが2値である必要があり、よく連続値の特徴量に対して使用されます。2変数に対して行う手法はANOVAなどと呼ばれます

In [None]:
# Santandarのデータには適用できないので、簡単な紹介のみ

from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile

univariate = f_regression(X_train, y_train)
sel_ = SelectKBest(f_regression, k=1000).fit(X_train, y_train)

In [None]:
Univariate ROC_AUC / RMSE
2変数間の依存性を評価しますが、この手法では機械学習モデルを使用します。

In [None]:
from sklearn.tree import DecesionTreeRegressor
from sklearn.metrics import mean_squared_error, roc_auc_score

MSE_features = []
for feature in X_train.columns:
    clf = DecisionTreeRegressor()
    clf.fit(X_train[feature].to_frame(), y_train)
    y_pred = clf.predict(X_test[feature].to_frame())
    MSE_features.append(mean_squared_error(y_test, y_pred))

MSE_series = pd.Series(MSE_features)
MSE_series.index = X_train.columns
MSE_series.sort_values(ascending=False).plot(kind='bar', figsize=(20, 10))

# Wrapper Methodでの解析例

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

data = pd.read_csv('train.csv')
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['ID', 'target'], axis=1),
    data['target'],
    test_size=0.2,
    random_state=2018
)

# 1. delete const, quisi-const, duplicated 
# 2. correlated 0.8