In [5]:
# Load IPython's autoreload extension (the reload mode itself is set later with %autoreload).
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Execute the shared initialisation notebook inside this kernel's namespace.
# NOTE(review): later cells use `os` without importing it in this notebook, so it is
# presumably brought into scope by common_init.ipynb — confirm.
%run ./common_init.ipynb

In [7]:
# Mode 2: re-import all modules before each cell runs, so edits to project code are picked up.
%autoreload 2
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Load custom code
import kdd98.data_handler as dh
from kdd98.config import Config

In [8]:
# Where to save the figures: <project root>/figures/<chapter id>
PROJECT_ROOT_DIR = "../../"
CHAPTER_ID = "feature_extraction"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "figures", CHAPTER_ID)

# exist_ok=True creates the directory tree when it is missing and is a no-op when it
# already exists, which avoids the check-then-create race of a separate exists() test.
os.makedirs(IMAGES_PATH, exist_ok=True)


def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current pyplot figure to ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension) of the saved figure.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str, default "png"
        File extension; also passed to ``plt.savefig`` as the output format.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, file_name)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [9]:
# Instantiate the project's KDD98 data provider, passing the dataset file name "cup98LRN.txt".
data_provider = dh.KDD98DataProvider("cup98LRN.txt")

In [None]:
# Fetch the numeric view of the data set. Per the captured log this access ran an
# IterativeImputer over a (95412, 418) matrix, at roughly 240 (presumably seconds) per
# round, so this cell is expensive to re-run.
kdd98_numeric = data_provider.numeric_data

[IterativeImputer] Completing matrix with shape (95412, 418)
[IterativeImputer] Ending imputation round 1/5, elapsed time 238.88
[IterativeImputer] Ending imputation round 2/5, elapsed time 474.67
[IterativeImputer] Ending imputation round 3/5, elapsed time 712.31


## Finding relevant features

In [None]:
# One stratified 80/20 learn/test split, stratified on the integer-cast TARGET_B label
# and seeded from the project configuration.
from sklearn.model_selection import StratifiedShuffleSplit

seed = Config.get("random_seed")
splitter = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    train_size=0.8,
    random_state=seed,
)

# n_splits=1, so the split generator yields exactly one (learn, test) index pair.
learn_index, test_index = next(
    splitter.split(kdd98_numeric, kdd98_numeric.TARGET_B.astype('int'))
)
l_i, t_i = learn_index, test_index

# Positional row selection for each side of the split.
kdd98_learn = kdd98_numeric.iloc[learn_index]
kdd98_test = kdd98_numeric.iloc[test_index]

In [None]:
# Build features and targets from the learning split only. Deriving them from the full
# `kdd98_numeric` frame would let the held-out test rows leak into feature selection
# and PCA, and would leave the stratified split unused.
kdd98_learn_feat = kdd98_learn.drop(columns=['TARGET_B', 'TARGET_D'])
kdd98_learn_targets = kdd98_learn[['TARGET_B', 'TARGET_D']].copy()

In [None]:
# Display the DOB column of the learning features for a quick sanity check.
kdd98_learn_feat.DOB

In [None]:
# Summary statistics of the two target columns (TARGET_B, TARGET_D).
kdd98_learn_targets.describe()

### Boruta

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Base estimator for Boruta: a shallow random forest (max depth 5) that uses every
# available core and re-balances class weights on each bootstrap sample.
rf = RandomForestClassifier(
    n_jobs=-1,
    class_weight='balanced_subsample',
    max_depth=5,
)

# Boruta feature selector wrapped around the forest; the number of trees is chosen
# automatically and progress is logged verbosely.
feat_selector = BorutaPy(
    rf,
    n_estimators='auto',
    verbose=2,
    random_state=1,
)

In [None]:
# Find all relevant features: fit Boruta on the raw feature values, using the
# TARGET_B column as the label.
feat_selector.fit(kdd98_learn_feat.values, kdd98_learn_targets.TARGET_B.values)

In [None]:
# Check selected features: per-feature selection mask (used further down to index the
# column names).
feat_selector.support_

In [None]:
# Check the ranking Boruta assigned to each feature.
feat_selector.ranking_

In [None]:
# Call transform() on the feature values to keep only the columns Boruta selected.
X_filtered = feat_selector.transform(kdd98_learn_feat.values)

In [None]:
# Names of the selected features: index the column-name array with the Boruta support mask.
kdd98_learn_feat.columns.values[feat_selector.support_]

In [None]:
# Wrap the filtered array in a DataFrame again. Passing the original index keeps the rows
# aligned with `kdd98_learn_targets`; without it the frame silently gets a fresh
# RangeIndex and can no longer be joined back to the targets by label.
kdd98_relevant = pd.DataFrame(
    data=X_filtered,
    index=kdd98_learn_feat.index,
    columns=kdd98_learn_feat.columns[feat_selector.support_],
)

In [None]:
# Count missing INCOME values in the provider's raw data.
data_provider.raw_data.INCOME.isna().sum()

In [None]:
# Count missing INCOME values in the provider's cleaned data, for comparison with the raw count.
data_provider.clean_data.INCOME.isna().sum()

In [None]:
# Frequency of each INCOME value in the numeric data set.
kdd98_numeric.INCOME.value_counts()

In [None]:
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

In [None]:
# Standardise the features before PCA.
scaled = (
    preprocessing.StandardScaler()
    .fit(kdd98_learn_feat)
    .transform(kdd98_learn_feat)
)

# A float n_components asks PCA to keep as many components as are needed to explain
# that fraction (here 80 %) of the variance.
pca = decomposition.PCA(n_components=0.8)
kdd98_learn_feat_pca = pca.fit(scaled).transform(scaled)

In [None]:
# Display the PCA-transformed feature matrix.
kdd98_learn_feat_pca

In [None]:
# Display the fitted principal components.
pca.components_

In [None]:
# Share of the total variance explained by each retained component.
pca.explained_variance_ratio_

In [None]:
# Number of components PCA actually kept for the requested 0.8 variance threshold.
pca.n_components_

In [None]:
# Number of input features PCA saw during fit. `n_features_` was deprecated in
# scikit-learn 1.2 and removed in 1.4 in favour of `n_features_in_`; fall back to the
# old attribute only on releases that predate the new one.
pca.n_features_in_ if hasattr(pca, "n_features_in_") else pca.n_features_

In [None]:
# Show the estimator's parameters. The method must be called; the bare attribute only
# displays the bound-method object.
pca.get_params()