## Loading data
- 735 datapoints each having 24 features for 60 timesteps
- data['np_data'].shape => (735, 60, 24)  => (datapoints, timesteps, features)

In [1]:
import pandas as pd
import numpy as np
# Open the sample archive. For a .npz file np.load returns a lazy, dictionary-like
# object whose arrays are read when a key is indexed (e.g. data['np_data']).
data = np.load(file="sample_data.npz")

### FSS on TS data (3 dimensions)

### FCBF - Fast Correlation Based Filter 

In [21]:
from src.fss.fcbf.fcbf import FCBF
# Rank features with FCBF on the raw 3-D time-series input (no vectorization step).
# `data` is the dictionary-like object loaded from sample_data.npz; per the original
# note it holds "np_data" (the 3-D numpy array) and "subclass" (the target classes).
# NOTE(review): the binarization cell reads data['target'] rather than "subclass" --
# confirm which key FCBF actually consumes.
fcbf = FCBF(data=data)
fcbf_rank = fcbf.rank()
fcbf_rank.head()  # preview the first rows of the Feature/Score ranking

Unnamed: 0,Feature,Score
0,R_VALUE,0.206089
1,SHRGT45,0.121955
2,TOTUSJH,0.118166
3,TOTBSQ,0.118166
4,TOTUSJZ,0.118166


## CSFS

In [24]:
# NOTE(review): the CSFS ranking below is disabled (commented out) and the reason is
# not recorded in this notebook -- confirm whether it should be restored or removed.
# from src.fss.csfs.csfs import CSFS
# csfs = CSFS(data=data)    ## data is dictionary with 2 keys "np_data" having 3d numpy data and "subclass" corresponding to target classes
# csfs_rank = csfs.rank()
# csfs_rank.head()

## Vectorizing data
- (735,60,24) -> (735,168)
- where 168 = 24 features × 7 statistical summaries per feature

In [2]:
from src.preprocessing.vectorize import vectorize
# Flatten the 3-D array into one row per datapoint. Per the section notes,
# (735, 60, 24) becomes (735, 168): 7 statistics for each of the 24 features.
vectorized_data = vectorize(data['np_data'])
print(vectorized_data.shape)  # printed explicitly; only the last expression is auto-displayed
vectorized_data.head()

(735, 168)


Unnamed: 0,TOTUSJH_min,TOTBSQ_min,TOTPOT_min,TOTUSJZ_min,ABSNJZH_min,SAVNCPP_min,USFLUX_min,TOTFZ_min,MEANPOT_min,EPSZ_min,...,MEANGBZ_kurtosis,MEANGBH_kurtosis,MEANJZH_kurtosis,TOTFY_kurtosis,MEANJZD_kurtosis,MEANALP_kurtosis,TOTFX_kurtosis,EPSY_kurtosis,EPSX_kurtosis,R_VALUE_kurtosis
0,0.230355,0.617412,0.001986,0.151498,0.01656,0.024169,0.535292,0.405347,1.9e-05,0.435331,...,-0.795735,-0.658184,0.375842,-1.286032,0.401637,0.424243,-1.060712,-1.220763,-1.58209,-1.04509
1,0.040939,0.060785,0.000104,0.02922,0.023403,0.055886,0.091372,0.730117,5e-06,0.273792,...,-0.839596,-0.248791,0.762418,-0.712628,0.118049,0.732245,-0.973002,-0.756112,-0.506833,-0.998586
2,0.13019,0.419234,0.000963,0.090657,0.045061,0.02898,0.512086,0.305961,9e-06,0.327511,...,-0.01893,-0.873333,-1.380979,-0.815605,-0.577776,-1.296177,-1.018526,-0.871618,-1.092397,-1.552892
3,0.121122,0.317159,0.001095,0.084523,0.001628,0.05247,0.231725,0.766656,2.3e-05,0.513382,...,-1.365317,-1.434476,-0.256338,-1.151957,-0.488195,0.150881,-1.283992,-1.20704,-1.272945,-0.822295
4,0.114146,0.331614,0.000917,0.075171,0.288566,0.266177,0.288001,0.578099,1.6e-05,0.408842,...,-1.132442,-0.997534,-0.928597,-1.441214,-0.945731,-0.581695,-0.880224,-1.518133,0.305576,-0.742629


## Binarizing target

In [3]:
# Binary labels for the ranking methods below: 0 for the 'NF' class, 1 for any other class.
y_train_bin = np.where(data['target'] != 'NF', 1, 0)

### MRMR - Maximum Relevance Minimum Redundancy

In [18]:
from src.fss.mrmr.mrmr import mrmr_ranking
# Rank features with mRMR using the vectorized frame and the binary target.
# NOTE(review): the displayed ranking lists base feature names (e.g. TOTBSQ) rather than
# the 168 vectorized column names -- presumably mrmr_ranking aggregates the per-statistic
# scores back to each original feature; confirm against its implementation.
mrmr_rank = mrmr_ranking(vectorized_data, y_train_bin)
mrmr_rank.head()  # preview the first rows of the ranking


100%|██████████| 168/168 [00:04<00:00, 36.03it/s]


Unnamed: 0,Feature,Score
0,TOTBSQ,0.024221
1,ABSNJZH,0.023333
2,TOTPOT,0.020528
3,USFLUX,0.020173
4,TOTUSJH,0.019718


### RelieF

In [19]:
from src.fss.relief.relief import relief_ranking
# Rank features with the project's Relief helper on the vectorized frame and binary target.
# NOTE(review): the result's name column is rendered as "feature" here but "Feature" in
# the FCBF/mRMR outputs -- confirm before merging rankings on that column.
relief_rank = relief_ranking(vectorized_data, y_train_bin)
relief_rank.head()  # preview the first rows of the ranking


Unnamed: 0,feature,Score
0,R_VALUE,0.019608
1,EPSX,0.01699
2,EPSZ,0.016588
3,SHRGT45,0.01559
4,TOTFX,0.014957


### Recursive Feature Elimination with Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression
from src.fss.rfe.rfe import rfe_rank


# Base estimator handed to the recursive-feature-elimination helper; random_state is fixed.
logistic = LogisticRegression(
    random_state=777,
    solver='liblinear',
)
rfe_logistic_ranks = rfe_rank(logistic, vectorized_data, y_train_bin)
rfe_logistic_ranks.head()

Unnamed: 0,feature,Score
0,TOTBSQ,0.05
1,TOTFY,0.04321
2,TOTFZ,0.040936
3,MEANJZH,0.030837
4,SHRGT45,0.029536


### Select From Model with RandomForest

In [5]:
from src.fss.sfm.sfm import sfm_fi_rank
from sklearn.ensemble import RandomForestClassifier

# Random-forest feature importances depend on the forest's random bootstrapping and
# feature sub-sampling, so an unseeded estimator yields a different ranking on every run.
# Pin random_state (same seed as the logistic-regression cell) so that
# Restart & Run All reproduces the ranking; re-run the cell to refresh the shown output.
rf = RandomForestClassifier(n_estimators=50, random_state=777)
sfm_rf_ranks = sfm_fi_rank(rf, vectorized_data, y_train_bin)
sfm_rf_ranks.head()  # preview the first rows of the ranking


Unnamed: 0,Feature,Score
0,TOTPOT,0.014754
1,SAVNCPP,0.012672
2,TOTBSQ,0.012622
3,EPSZ,0.010033
4,TOTFX,0.008288


### Select K Best with Mutual Information

In [6]:
from src.fss.skb.skb import skb_rank
from sklearn.feature_selection import mutual_info_classif

# Rank features via the project's select-k-best helper, using mutual information between
# each vectorized column and the binary target as the scoring function.
# NOTE(review): scikit-learn's mutual_info_classif accepts a random_state and is passed
# here unseeded, so scores may vary between runs unless skb_rank pins a seed -- confirm.
skb_mi_ranks = skb_rank(mutual_info_classif, vectorized_data, y_train_bin)
skb_mi_ranks.head()  # preview the first rows of the ranking

Unnamed: 0,Feature,Score
0,R_VALUE,0.02674
1,TOTBSQ,0.023413
2,TOTPOT,0.023207
3,TOTUSJH,0.019692
4,EPSY,0.019596
