In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [25]:
df = pd.read_csv("./data/Titanic Dataset.csv")
df = df.dropna(axis=1, thresh=0.3 * len(df))
y = df["survived"]
X = df.drop("survived", axis=1)

for col in X.columns:
    if X[col].dtype == "object":
        X[col].fillna(X[col].mode()[0], inplace=True)
    else:
        X[col].fillna(X[col].median(), inplace=True)

for col in X.select_dtypes("object"):
    X[col], _ = pd.factorize(X[col])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

numeric_cols = X_train.select_dtypes(np.number).columns.tolist()



def q_equal_width(xtr, xte, k):
    bins = np.linspace(xtr.min(), xtr.max(), k+1)
    return np.clip(np.digitize(xtr, bins)-1, 0, k-1), np.clip(np.digitize(xte, bins)-1, 0, k-1)

def q_equal_freq(xtr, xte, k):
    qs = np.quantile(xtr, np.linspace(0, 1, k+1))
    return np.clip(np.digitize(xtr, qs)-1, 0, k-1), np.clip(np.digitize(xte, qs)-1, 0, k-1)

def q_kmeans(xtr, xte, k):
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(xtr.values.reshape(-1,1))
    return km.predict(xtr.values.reshape(-1,1)), km.predict(xte.values.reshape(-1,1))


def evaluate(method, k):
    Xt = X_train.copy()
    Xs = X_test.copy()

    for col in numeric_cols:
        if method == "width":
            t, s = q_equal_width(Xt[col], Xs[col], k)
        elif method == "freq":
            t, s = q_equal_freq(Xt[col], Xs[col], k)
        else:
            t, s = q_kmeans(Xt[col], Xs[col], k)

        Xt[col] = t
        Xs[col] = s

    model = CategoricalNB()
    model.fit(Xt, y_train)
    pred = model.predict(Xs)

    return accuracy_score(y_test, pred)



methods = ["width", "freq", "kmeans"]
ks = [2, 5, 10, 25]

res = []
for m in methods:
    for k in ks:
        acc = evaluate(m, k)
        print(f"{m} k={k}: acc={acc:.4f}")
        res.append([m, k, acc])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

width k=2: acc=0.8740
width k=5: acc=0.8359
width k=10: acc=0.8664
width k=25: acc=0.8740
freq k=2: acc=0.7328
freq k=5: acc=0.8015
freq k=10: acc=0.8664
freq k=25: acc=0.8702
kmeans k=2: acc=0.8779
kmeans k=5: acc=0.8626
kmeans k=10: acc=0.8702
kmeans k=25: acc=0.8855


In [26]:
df_res = pd.DataFrame(res, columns=["Method", "k", "Accuracy"])
print(df_res)

    Method   k  Accuracy
0    width   2  0.874046
1    width   5  0.835878
2    width  10  0.866412
3    width  25  0.874046
4     freq   2  0.732824
5     freq   5  0.801527
6     freq  10  0.866412
7     freq  25  0.870229
8   kmeans   2  0.877863
9   kmeans   5  0.862595
10  kmeans  10  0.870229
11  kmeans  25  0.885496
