In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.dummy import DummyClassifier
from scipy.stats import truncnorm

In [2]:
gen = np.random.default_rng(seed=42)

In [3]:
def get_truncated_normal(mean=0, sd=1, low=0, upp=10):
    return truncnorm((low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

In [4]:
N = 100000

In [5]:
def get_educacion(
    edades: np.ndarray, p30=[0.1, 0.5, 0.3, 0.1], p23=[0.1, 0.4, 0.5], pelse=[0.1, 0.9]
) -> np.ndarray:
    N = edades.shape[0]
    educ = np.empty(N, dtype="<U13")
    for i in range(N):
        edad = edades[i]
        if edad > 30:
            educ[i] = gen.choice(
                ["primaria", "secundaria", "licenciatura", "doctorado"],
                p=p30,
            )
        elif edad > 23:
            educ[i] = gen.choice(["primaria", "secundaria", "licenciatura"], p=p23)
        else:
            educ[i] = gen.choice(["primaria", "secundaria"], p=pelse)
    return educ


def get_trabajo(
    edades, educaciones, p20=0.2, p2030=(0.4, 0.7), p3055=(0.8, 0.6), p55=0.1
):
    N = edades.shape[0]
    trabaj = np.empty(N, dtype=np.int8)
    for i in range(N):
        edad = edades[i]
        educacion = educaciones[i]
        if edad < 20:
            trabaj[i] = gen.binomial(1, p=p20)
        elif 20 <= edad < 30:
            trabaj[i] = gen.binomial(
                1, p=p2030[0] if educacion == "licenciatura" else p2030[1]
            )
        elif 30 <= edad < 55:
            trabaj[i] = gen.binomial(
                1,
                p=p3055[0] if educacion in ("doctorado", "licenciatura") else p3055[1],
            )
        else:
            trabaj[i] = gen.binomial(1, p=p55)
    return trabaj

# Generación de datos dummy
Generaremos 2 DataFrames con mismo esquema pero distinta distribución de datos

In [6]:
edades = (
    get_truncated_normal(mean=40, sd=20, low=18, upp=80)
    .rvs(size=N, random_state=gen)
    .round()
)
educaciones = get_educacion(
    edades, p30=[0.1, 0.5, 0.3, 0.1], p23=[0.1, 0.4, 0.5], pelse=[0.1, 0.9]
)
trabajos = get_trabajo(
    edades, educaciones, p20=0.2, p2030=(0.4, 0.7), p3055=(0.8, 0.6), p55=0.1
)

df1 = pd.DataFrame(dict(edad=edades, educacion=educaciones, trabajo=trabajos))
display(df1)

Unnamed: 0,edad,educacion,trabajo
0,56.0,doctorado,0
1,40.0,doctorado,1
2,61.0,secundaria,0
3,52.0,secundaria,1
4,24.0,licenciatura,1
...,...,...,...
99995,33.0,secundaria,1
99996,46.0,licenciatura,1
99997,45.0,secundaria,1
99998,26.0,secundaria,1


In [7]:
edades = (
    get_truncated_normal(mean=55, sd=15, low=18, upp=90)
    .rvs(size=N, random_state=gen)
    .round()
)
educaciones = get_educacion(
    edades, p30=[0.25, 0.4, 0.3, 0.05], p23=[0.2, 0.3, 0.5], pelse=[0.3, 0.7]
)
trabajos = get_trabajo(
    edades, educaciones, p20=0.1, p2030=(0.3, 0.6), p3055=(0.6, 0.5), p55=0.05
)

df2 = pd.DataFrame(dict(edad=edades, educacion=educaciones, trabajo=trabajos))
display(df2)

Unnamed: 0,edad,educacion,trabajo
0,54.0,secundaria,0
1,52.0,licenciatura,1
2,66.0,secundaria,1
3,64.0,licenciatura,0
4,65.0,secundaria,0
...,...,...,...
99995,56.0,licenciatura,0
99996,26.0,primaria,0
99997,41.0,licenciatura,0
99998,38.0,licenciatura,1


Etiquetamos a cada uno de los datasets con un label y los juntamos y barajeamos

In [8]:
df1["dataset"] = 0
df2["dataset"] = 1


df = pd.concat([df1, df2]).sample(frac=1)
display(df)

Unnamed: 0,edad,educacion,trabajo,dataset
19839,35.0,licenciatura,1,0
36206,64.0,primaria,0,1
17740,57.0,primaria,0,1
62764,61.0,doctorado,1,0
80720,45.0,secundaria,0,0
...,...,...,...,...
17947,56.0,secundaria,0,1
58422,44.0,primaria,0,1
84332,27.0,secundaria,1,1
57640,55.0,secundaria,0,1


Construimos las variables predictoras y la etiqueta objetivo, que es el identificador del dataset al que pertenece cada registro

In [9]:
X = df.drop(columns="dataset")
y = df.dataset

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [11]:
clf = CatBoostClassifier(cat_features=["educacion"], random_seed=42)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Learning rate set to 0.083408
0:	learn: 0.6768347	total: 116ms	remaining: 1m 55s
1:	learn: 0.6639361	total: 175ms	remaining: 1m 27s
2:	learn: 0.6508327	total: 254ms	remaining: 1m 24s
3:	learn: 0.6422003	total: 281ms	remaining: 1m 10s
4:	learn: 0.6348937	total: 333ms	remaining: 1m 6s
5:	learn: 0.6290062	total: 375ms	remaining: 1m 2s
6:	learn: 0.6240134	total: 396ms	remaining: 56.2s
7:	learn: 0.6197092	total: 461ms	remaining: 57.2s
8:	learn: 0.6161209	total: 593ms	remaining: 1m 5s
9:	learn: 0.6133007	total: 671ms	remaining: 1m 6s
10:	learn: 0.6106659	total: 808ms	remaining: 1m 12s
11:	learn: 0.6074601	total: 911ms	remaining: 1m 15s
12:	learn: 0.6056385	total: 943ms	remaining: 1m 11s
13:	learn: 0.6042150	total: 974ms	remaining: 1m 8s
14:	learn: 0.6030942	total: 1.01s	remaining: 1m 6s
15:	learn: 0.6017153	total: 1.05s	remaining: 1m 4s
16:	learn: 0.6007806	total: 1.07s	remaining: 1m 2s
17:	learn: 0.5997661	total: 1.1s	remaining: 1m
18:	learn: 0.5990628	total: 1.13s	remaining: 58.4s
19:	lear

0.6783484848484849

In [12]:
log_loss(y_true=y_test, y_pred=clf.predict_proba(X_test))

0.5905098816156052

In [13]:
dclf = DummyClassifier(random_state=42)
dclf.fit(X_train, y_train)
dclf.score(X_test, y_test)

0.4988030303030303

In [14]:
log_loss(y_true=y_test, y_pred=dclf.predict_proba(X_test))

0.6931506984100798

# Y si viniesen de la misma distribución?

In [15]:
edades = (
    get_truncated_normal(mean=40, sd=20, low=18, upp=80)
    .rvs(size=N, random_state=gen)
    .round()
)
educaciones = get_educacion(
    edades, p30=[0.1, 0.5, 0.3, 0.1], p23=[0.1, 0.4, 0.5], pelse=[0.1, 0.9]
)
trabajos = get_trabajo(
    edades, educaciones, p20=0.2, p2030=(0.4, 0.7), p3055=(0.8, 0.6), p55=0.1
)

df1 = pd.DataFrame(dict(edad=edades, educacion=educaciones, trabajo=trabajos))
display(df1)

Unnamed: 0,edad,educacion,trabajo
0,45.0,secundaria,1
1,37.0,licenciatura,1
2,53.0,secundaria,1
3,52.0,secundaria,1
4,50.0,doctorado,1
...,...,...,...
99995,47.0,licenciatura,1
99996,54.0,secundaria,0
99997,42.0,secundaria,0
99998,52.0,licenciatura,1


In [16]:
edades = (
    get_truncated_normal(mean=40, sd=20, low=18, upp=80)
    .rvs(size=N, random_state=gen)
    .round()
)
educaciones = get_educacion(
    edades, p30=[0.1, 0.5, 0.3, 0.1], p23=[0.1, 0.4, 0.5], pelse=[0.1, 0.9]
)
trabajos = get_trabajo(
    edades, educaciones, p20=0.2, p2030=(0.4, 0.7), p3055=(0.8, 0.6), p55=0.1
)

df2 = pd.DataFrame(dict(edad=edades, educacion=educaciones, trabajo=trabajos))
display(df2)

Unnamed: 0,edad,educacion,trabajo
0,69.0,licenciatura,0
1,71.0,secundaria,0
2,50.0,primaria,0
3,33.0,secundaria,0
4,42.0,secundaria,1
...,...,...,...
99995,70.0,secundaria,0
99996,30.0,licenciatura,1
99997,45.0,secundaria,0
99998,77.0,secundaria,0


In [17]:
df1["dataset"] = 0
df2["dataset"] = 1


df = pd.concat([df1, df2]).sample(frac=1)
display(df)

Unnamed: 0,edad,educacion,trabajo,dataset
20775,29.0,secundaria,1,1
34234,51.0,doctorado,1,1
12940,27.0,secundaria,0,1
8542,33.0,licenciatura,1,0
30861,44.0,licenciatura,1,0
...,...,...,...,...
42454,42.0,primaria,0,0
97592,54.0,doctorado,1,0
57586,72.0,licenciatura,0,1
30273,64.0,licenciatura,0,0


In [18]:
X = df.drop(columns="dataset")
y = df.dataset

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [20]:
clf = CatBoostClassifier(cat_features=["educacion"], random_seed=42)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Learning rate set to 0.083408
0:	learn: 0.6931367	total: 29.9ms	remaining: 29.9s
1:	learn: 0.6931264	total: 61.3ms	remaining: 30.6s
2:	learn: 0.6931206	total: 85.7ms	remaining: 28.5s
3:	learn: 0.6931133	total: 109ms	remaining: 27.2s
4:	learn: 0.6931082	total: 130ms	remaining: 25.8s
5:	learn: 0.6931006	total: 148ms	remaining: 24.6s
6:	learn: 0.6930932	total: 171ms	remaining: 24.2s
7:	learn: 0.6930906	total: 198ms	remaining: 24.6s
8:	learn: 0.6930780	total: 223ms	remaining: 24.6s
9:	learn: 0.6930722	total: 244ms	remaining: 24.2s
10:	learn: 0.6930640	total: 270ms	remaining: 24.3s
11:	learn: 0.6930580	total: 289ms	remaining: 23.8s
12:	learn: 0.6930552	total: 316ms	remaining: 24s
13:	learn: 0.6930552	total: 333ms	remaining: 23.4s
14:	learn: 0.6930513	total: 358ms	remaining: 23.5s
15:	learn: 0.6930492	total: 435ms	remaining: 26.7s
16:	learn: 0.6930430	total: 611ms	remaining: 35.3s
17:	learn: 0.6930328	total: 641ms	remaining: 35s
18:	learn: 0.6930328	total: 662ms	remaining: 34.2s
19:	learn: 0

0.5004848484848485

In [21]:
log_loss(y_true=y_test, y_pred=clf.predict_proba(X_test))

0.6937688488965048

In [22]:
dclf = DummyClassifier(random_state=42)
dclf.fit(X_train, y_train)
dclf.score(X_test, y_test)

0.49812121212121213

In [23]:
log_loss(y_true=y_test, y_pred=dclf.predict_proba(X_test))

0.6931558475210066

Como vemos, en este caso el log loss no disminuye al aplicar el clasificador, luego ambas distribuciones son equivalentes