In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFdr


In [3]:
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/internet_ads/ad.data",
                 header=None, low_memory=False)
df_column_names = pd.read_csv("../labels/internet_ads.names", header=None)

df.rename(columns=dict([(column, df_column_names[0][column]) for column in range(df.shape[1])]), inplace=True)
df["ad"].replace("ad.", 1, inplace=True)
df["ad"].replace("nonad.", 0, inplace=True)
df.replace("?", np.nan, inplace=True)
df.replace("   ?", np.nan, inplace=True)
df.dropna(inplace=True)

for column in df.columns:
    if df[column].dtype == "object":
        df[column] = df[column].astype("float")

df.head()


Unnamed: 0,height,width,aratio,local,url*images+buttons,url*likesbookscom,url*wwwslakecom,url*hydrogeologist,url*oso,url*media,...,caption*home,caption*my,caption*your,caption*in,caption*bytes,caption*here,caption*click,caption*for,caption*you,ad
0,125.0,125.0,1.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,57.0,468.0,8.2105,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,33.0,230.0,6.9696,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,60.0,468.0,7.8,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,60.0,468.0,7.8,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
for column in df.columns:
    if df[column].dtype == "float64":
        df[column] = pd.qcut(df[column], q=5, labels=False, duplicates="drop").astype("category")

df.head()


Unnamed: 0,height,width,aratio,local,url*images+buttons,url*likesbookscom,url*wwwslakecom,url*hydrogeologist,url*oso,url*media,...,caption*home,caption*my,caption*your,caption*in,caption*bytes,caption*here,caption*click,caption*for,caption*you,ad
0,4,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
print("Before chi2 feature selection:", X.shape)

significance_level = 0.05
false_discovery_rate = SelectFdr(chi2, alpha=significance_level)
X_new = false_discovery_rate.fit_transform(X, y)
print("After chi2 feature selection:", X_new.shape)

selected_features_mask = false_discovery_rate.get_support()
selected_feature_indices = [i for (i, x) in enumerate(selected_features_mask) if x]

df.iloc[:, selected_feature_indices + [-1]].head()


Before chi2 feature selection: (2359, 1558)
After chi2 feature selection: (2359, 460)


Unnamed: 0,height,width,aratio,url*media,url*blipverts,url*advertising+blipverts,url*cnet,url*time+1998,url*tvgencom,url*ad+gif,...,caption*and,caption*click+here,caption*the,caption*here+for,caption*your,caption*here,caption*click,caption*for,caption*you,ad
0,4,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
