In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFdr


In [5]:
df = pd.read_csv("../../datasets/InternetAdvertisements/ad.data",
                 header=None, low_memory=False)
df_column_names = pd.read_csv("../../datasets/InternetAdvertisements/ad.names", header=None)

df.rename(columns=dict([(column, df_column_names[0][column]) for column in range(df.shape[1])]), inplace=True)
df["ad"].replace("ad.", 1, inplace=True)
df["ad"].replace("nonad.", 0, inplace=True)
df.replace("?", np.nan, inplace=True)
df.replace("   ?", np.nan, inplace=True)
df.dropna(inplace=True)

for column in df.columns:
    if df[column].dtype == "object":
        df[column] = df[column].astype("float")
    if df[column].dtype == "float64":
        df[column] = pd.qcut(df[column], q=2, labels=False, duplicates="drop").astype("category")
    if df[column].dtype == "int64":
        df[column] = df[column].astype("category")

df.head()


Unnamed: 0,height,width,aratio,local,url*images+buttons,url*likesbookscom,url*wwwslakecom,url*hydrogeologist,url*oso,url*media,...,caption*home,caption*my,caption*your,caption*in,caption*bytes,caption*here,caption*click,caption*for,caption*you,ad
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

significance_level = 0.05
false_discovery_rate = SelectFdr(chi2, alpha=significance_level)
false_discovery_rate.fit(X, y)

selected_features_mask = false_discovery_rate.get_support()
selected_feature_indices = [-1] + [i for (i, x) in enumerate(selected_features_mask) if x]

print("Before chi2 feature selection:", df.shape)
print("After chi2 feature selection:", df.iloc[:, selected_feature_indices].shape)
df.iloc[:, selected_feature_indices].head()


Before chi2 feature selection: (2359, 1559)
After chi2 feature selection: (2359, 461)


Unnamed: 0,ad,height,width,aratio,url*media,url*blipverts,url*advertising+blipverts,url*cnet,url*time+1998,url*tvgencom,...,alt*home,caption*and,caption*click+here,caption*the,caption*here+for,caption*your,caption*here,caption*click,caption*for,caption*you
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df_new = df.iloc[:, selected_feature_indices]
df_new.to_csv("datasets/internet_ads.csv", index=False)

df_new.head()


Unnamed: 0,ad,height,width,aratio,url*media,url*blipverts,url*advertising+blipverts,url*cnet,url*time+1998,url*tvgencom,...,alt*home,caption*and,caption*click+here,caption*the,caption*here+for,caption*your,caption*here,caption*click,caption*for,caption*you
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
