In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif
from sklearn.preprocessing import MinMaxScaler

In [None]:
np.random.seed(42)

# ===========================
# 1. Crear dataset sintético (50 filas)
# ===========================

n = 50
spam = np.random.choice([0,1], size=n, p=[0.5,0.5])  # clases balanceadas

spamwords = []
links = []
hours = []
fontsize = []

for y in spam:
    if y == 1:  # Spam
        sw = np.random.randint(8, 16)         # más palabras spam
        lc = sw - np.random.randint(0,3)      # correlación con links
        hr = np.random.randint(20,24) if np.random.rand() < 0.7 else np.random.randint(0,3)
    else:       # No Spam
        sw = np.random.randint(0, 3)          # pocas palabras spam
        lc = sw + np.random.randint(0,2)      # correlación con links
        hr = np.random.randint(7, 12)         # horas de oficina

    spamwords.append(sw)
    links.append(lc)
    hours.append(hr)
    fontsize.append(np.random.choice([11,12])) # irrelevante

data = pd.DataFrame({
    "LinksCount": links,
    "SpamWords": spamwords,
    "FontSize": fontsize,
    "HourReceived": hours,
    "Clase": spam
})

X = data.drop(columns=["Clase"])
y = data["Clase"]

print("Dataset (primeras 10 filas):")
print(data.head(10))


Dataset (primeras 10 filas):
   LinksCount  SpamWords  FontSize  HourReceived  Clase
0           3          2        12             8      0
1           8          9        11             1      1
2          13         15        12            22      1
3           7          8        11            23      1
4           1          1        12            11      0
5           1          1        12            10      0
6           0          0        11            11      0
7          12         12        12            20      1
8           6          8        12             0      1
9           8         10        12             0      1


In [None]:
# ===========================
# 2. F-test
# f = var(entre_clases) / var(dentro_clase)
# resultado> F y un p para aceptar/rechazar la hipótesis. Se rechaza/acepta la utilidad
# de la caracerística
# ===========================
f_vals, f_pvals = f_classif(X, y)
print("\nF-test (ANOVA):")
for col, f, p in zip(X.columns, f_vals, f_pvals):
    print(f"{col:12s} F={f:.3f}, p={p:.4f}")


F-test (ANOVA):
LinksCount   F=238.732, p=0.0000
SpamWords    F=444.642, p=0.0000
FontSize     F=3.261, p=0.0772
HourReceived F=15.468, p=0.0003


In [None]:
# ===========================
# 3. Chi²
# ===========================
scaler = MinMaxScaler()
X_chi2 = scaler.fit_transform(X)
chi2_vals, chi2_pvals = chi2(X_chi2, y)

print("\nChi² test:")
for col, c, p in zip(X.columns, chi2_vals, chi2_pvals):
    print(f"{col:12s} Chi²={c:.3f}, p={p:.4f}")


Chi² test:
LinksCount   Chi²=12.301, p=0.0005
SpamWords    Chi²=15.869, p=0.0001
FontSize     Chi²=1.718, p=0.1900
HourReceived Chi²=2.320, p=0.1278


In [None]:
# ===========================
# 4. Information Gain (Mutual Information)
# ===========================
mi = mutual_info_classif(X, y, random_state=42)
print("\nInformation Gain (Mutual Information):")
for col, val in zip(X.columns, mi):
    print(f"{col:12s} MI={val:.3f}")


Information Gain (Mutual Information):
LinksCount   MI=0.696
SpamWords    MI=0.696
FontSize     MI=0.000
HourReceived MI=0.696


In [None]:
# ===========================
# 5. Correlation entre variables
# ===========================
print("\nCorrelation matrix:")
print(data.drop(columns=["Clase"]).corr())


Correlation matrix:
              LinksCount  SpamWords  FontSize  HourReceived
LinksCount      1.000000   0.987609 -0.209852      0.489144
SpamWords       0.987609   1.000000 -0.219787      0.516053
FontSize       -0.209852  -0.219787  1.000000     -0.276053
HourReceived    0.489144   0.516053 -0.276053      1.000000
