In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFdr
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the raw arrhythmia table (the file has no header row). "?" is the
# missing-value marker, so it is parsed straight to NaN at read time instead
# of being patched with a whole-frame replace afterwards.
df = pd.read_csv("../../Datasets/Arrhythmia/arrhythmia.data",
                 header=None, low_memory=False, na_values="?")
df_column_names = pd.read_csv("../../Datasets/Arrhythmia/labels.names", header=None)

# Row i of the names file labels positional column i of the data file.
column_names = df_column_names[0]
df = df.rename(columns={i: column_names[i] for i in range(df.shape[1])})

# 1-based attribute numbers to discard: attribute 2, plus six consecutive
# attributes starting at 22 and repeating every 12 (22-27, 34-39, ..., 154-159).
# NOTE(review): presumably these are the nominal/binary attributes, which the
# ANOVA F-test used later is not meant for -- confirm against the dataset docs.
dropped_attribute_numbers = [2] + [
    first + offset for first in range(22, 160, 12) for offset in range(6)
]
df = df.drop(columns=[column_names[number - 1] for number in dropped_attribute_numbers])

for column in df.columns:
    # Anything that still failed numeric parsing is coerced explicitly.
    if df[column].dtype == "object":
        df[column] = df[column].astype("float")

    # Mean-impute by assignment. Calling fillna(inplace=True) on df[column]
    # operates on a temporary Series: it warns on pandas 2.x and silently does
    # nothing under Copy-on-Write, which would leave NaNs in the frame.
    if df[column].dtype == "float64":
        df[column] = df[column].fillna(df[column].mean())

# Constant columns carry no information for feature selection. They are
# collected first and dropped in one step so the frame is never mutated while
# its columns are being iterated.
constant_columns = [column for column in df.columns
                    if df[column].nunique(dropna=False) == 1]
df = df.drop(columns=constant_columns)

df.head()


Unnamed: 0,Age,Height,Weight,QRS duration,P-R interval,Q-T interval,T interval,P interval,QRS,T,...,V6 Amplitude: JJ wave,V6 Amplitude: Q wave,V6 Amplitude: R wave,V6 Amplitude: S wave,V6 Amplitude: R' wave,V6 Amplitude: P wave,V6 Amplitude: T wave,V6 QRSA,V6 QRSTA,Class
0,75,190,80,91,193,371,174,121,-16,13.0,...,-0.3,0.0,9.0,-0.9,0.0,0.9,2.9,23.3,49.4,8
1,56,165,64,81,174,401,149,39,25,37.0,...,-0.5,0.0,8.5,0.0,0.0,0.2,2.1,20.4,38.8,6
2,54,172,95,138,163,386,185,102,96,34.0,...,0.9,0.0,9.5,-2.4,0.0,0.3,3.4,12.3,49.0,10
3,55,175,94,100,202,380,179,143,28,11.0,...,0.1,0.0,12.2,-2.2,0.0,0.4,2.6,34.6,61.6,1
4,75,190,80,88,181,360,177,103,-16,13.0,...,-0.4,0.0,13.1,-3.6,0.0,-0.1,3.9,25.4,62.8,7


In [3]:
# Target is the last column; every other column is a predictor. The
# predictors are standardised before scoring.
y = df.iloc[:, -1]
scaler = StandardScaler()
X = scaler.fit_transform(df.iloc[:, :-1])

# False-discovery-rate selector scored with the ANOVA F-test (f_classif).
significance_level = 0.05
false_discovery_rate = SelectFdr(score_func=f_classif, alpha=significance_level)
false_discovery_rate.fit(X, y)

# Boolean mask over the predictor columns -> positional indices of the kept
# ones. The leading -1 puts the last column of df (the target) first.
selected_features_mask = false_discovery_rate.get_support()
selected_feature_indices = [-1] + np.flatnonzero(selected_features_mask).tolist()

selected_preview = df.iloc[:, selected_feature_indices]

print("Before ANOVA feature selection:", df.shape)
print("After ANOVA feature selection:", selected_preview.shape)

selected_preview.head()


Before ANOVA feature selection: (452, 199)
After ANOVA feature selection: (452, 170)


Unnamed: 0,Class,Age,Height,Weight,QRS duration,P-R interval,Q-T interval,T interval,P interval,QRS,...,V5 QRSA,V5 QRSTA,V6 Amplitude: JJ wave,V6 Amplitude: Q wave,V6 Amplitude: R wave,V6 Amplitude: S wave,V6 Amplitude: P wave,V6 Amplitude: T wave,V6 QRSA,V6 QRSTA
0,8,75,190,80,91,193,371,174,121,-16,...,25.5,62.9,-0.3,0.0,9.0,-0.9,0.9,2.9,23.3,49.4
1,6,56,165,64,81,174,401,149,39,25,...,21.6,43.4,-0.5,0.0,8.5,0.0,0.2,2.1,20.4,38.8
2,10,54,172,95,138,163,386,185,102,96,...,11.5,48.2,0.9,0.0,9.5,-2.4,0.3,3.4,12.3,49.0
3,1,55,175,94,100,202,380,179,143,28,...,36.8,68.0,0.1,0.0,12.2,-2.2,0.4,2.6,34.6,61.6
4,7,75,190,80,88,181,360,177,103,-16,...,21.7,48.9,-0.4,0.0,13.1,-3.6,-0.1,3.9,25.4,62.8


In [4]:
# Materialise the reduced frame: target column first, then the selected features.
df_new = df.iloc[:, selected_feature_indices]

# to_csv does not create missing directories, so make sure the destination
# folder exists; otherwise a fresh checkout fails here on Restart & Run All.
output_path = Path("../../DatasetsFeatureSelection/Arrhythmia/arrhythmia_anova.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_new.to_csv(output_path, index=False)

df_new.head()


Unnamed: 0,Class,Age,Height,Weight,QRS duration,P-R interval,Q-T interval,T interval,P interval,QRS,...,V5 QRSA,V5 QRSTA,V6 Amplitude: JJ wave,V6 Amplitude: Q wave,V6 Amplitude: R wave,V6 Amplitude: S wave,V6 Amplitude: P wave,V6 Amplitude: T wave,V6 QRSA,V6 QRSTA
0,8,75,190,80,91,193,371,174,121,-16,...,25.5,62.9,-0.3,0.0,9.0,-0.9,0.9,2.9,23.3,49.4
1,6,56,165,64,81,174,401,149,39,25,...,21.6,43.4,-0.5,0.0,8.5,0.0,0.2,2.1,20.4,38.8
2,10,54,172,95,138,163,386,185,102,96,...,11.5,48.2,0.9,0.0,9.5,-2.4,0.3,3.4,12.3,49.0
3,1,55,175,94,100,202,380,179,143,28,...,36.8,68.0,0.1,0.0,12.2,-2.2,0.4,2.6,34.6,61.6
4,7,75,190,80,88,181,360,177,103,-16,...,21.7,48.9,-0.4,0.0,13.1,-3.6,-0.1,3.9,25.4,62.8
