In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.dummy import DummyClassifier
from scipy.stats import truncnorm

In [2]:
gen = np.random.default_rng(seed=42)

In [3]:
def get_truncated_normal(mean=0, sd=1, low=0, upp=10):
    return truncnorm((low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

In [4]:
N = 100000

In [5]:
def get_educacion(
    edades: np.ndarray, p30=[0.1, 0.5, 0.3, 0.1], p23=[0.1, 0.4, 0.5], pelse=[0.1, 0.9]
) -> np.ndarray:
    N = edades.shape[0]
    educ = np.empty(N, dtype="<U13")
    for i in range(N):
        edad = edades[i]
        if edad > 30:
            educ[i] = gen.choice(
                ["primaria", "secundaria", "licenciatura", "doctorado"],
                p=p30,
            )
        elif edad > 23:
            educ[i] = gen.choice(["primaria", "secundaria", "licenciatura"], p=p23)
        else:
            educ[i] = gen.choice(["primaria", "secundaria"], p=pelse)
    return educ


def get_trabajo(
    edades, educaciones, p20=0.2, p2030=(0.4, 0.7), p3055=(0.8, 0.6), p55=0.1
):
    N = edades.shape[0]
    trabaj = np.empty(N, dtype=np.int8)
    for i in range(N):
        edad = edades[i]
        educacion = educaciones[i]
        if edad < 20:
            trabaj[i] = gen.binomial(1, p=p20)
        elif 20 <= edad < 30:
            trabaj[i] = gen.binomial(
                1, p=p2030[0] if educacion == "licenciatura" else p2030[1]
            )
        elif 30 <= edad < 55:
            trabaj[i] = gen.binomial(
                1,
                p=p3055[0] if educacion in ("doctorado", "licenciatura") else p3055[1],
            )
        else:
            trabaj[i] = gen.binomial(1, p=p55)
    return trabaj

# Generación de datos dummy
Generaremos 2 DataFrames con mismo esquema pero distinta distribución de datos

In [6]:
edades = (
    get_truncated_normal(mean=40, sd=20, low=18, upp=80)
    .rvs(size=N, random_state=gen)
    .round()
)
educaciones = get_educacion(
    edades, p30=[0.1, 0.5, 0.3, 0.1], p23=[0.1, 0.4, 0.5], pelse=[0.1, 0.9]
)
trabajos = get_trabajo(
    edades, educaciones, p20=0.2, p2030=(0.4, 0.7), p3055=(0.8, 0.6), p55=0.1
)

df1 = pd.DataFrame(dict(edad=edades, educacion=educaciones, trabajo=trabajos))
display(df1)

Unnamed: 0,edad,educacion,trabajo
0,56.0,doctorado,0
1,40.0,doctorado,1
2,61.0,secundaria,0
3,52.0,secundaria,1
4,24.0,licenciatura,1
...,...,...,...
99995,33.0,secundaria,1
99996,46.0,licenciatura,1
99997,45.0,secundaria,1
99998,26.0,secundaria,1


In [7]:
edades = (
    get_truncated_normal(mean=55, sd=15, low=18, upp=90)
    .rvs(size=N, random_state=gen)
    .round()
)
educaciones = get_educacion(
    edades, p30=[0.25, 0.4, 0.3, 0.05], p23=[0.2, 0.3, 0.5], pelse=[0.3, 0.7]
)
trabajos = get_trabajo(
    edades, educaciones, p20=0.1, p2030=(0.3, 0.6), p3055=(0.6, 0.5), p55=0.05
)

df2 = pd.DataFrame(dict(edad=edades, educacion=educaciones, trabajo=trabajos))
display(df2)

Unnamed: 0,edad,educacion,trabajo
0,54.0,secundaria,0
1,52.0,licenciatura,1
2,66.0,secundaria,1
3,64.0,licenciatura,0
4,65.0,secundaria,0
...,...,...,...
99995,56.0,licenciatura,0
99996,26.0,primaria,0
99997,41.0,licenciatura,0
99998,38.0,licenciatura,1


Etiquetamos a cada uno de los datasets con un label y los juntamos y barajeamos

In [8]:
df1["dataset"] = 0
df2["dataset"] = 1


df = pd.concat([df1, df2]).sample(frac=1)
display(df)

Unnamed: 0,edad,educacion,trabajo,dataset
99347,24.0,secundaria,1,0
28607,53.0,primaria,0,0
48495,29.0,primaria,1,1
36338,67.0,secundaria,0,1
82188,51.0,secundaria,1,1
...,...,...,...,...
82762,49.0,secundaria,1,0
86819,55.0,licenciatura,0,1
65338,20.0,secundaria,1,0
41463,60.0,secundaria,0,0


Construimos las variables predictoras y la etiqueta objetivo, que es el identificador del dataset al que pertenece cada registro

In [9]:
X = df.drop(columns="dataset")
y = df.dataset

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [11]:
clf = CatBoostClassifier(cat_features=["educacion"], random_seed=42)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Learning rate set to 0.083408
0:	learn: 0.6762550	total: 96.1ms	remaining: 1m 36s
1:	learn: 0.6631106	total: 125ms	remaining: 1m 2s
2:	learn: 0.6502494	total: 161ms	remaining: 53.5s
3:	learn: 0.6414448	total: 187ms	remaining: 46.6s
4:	learn: 0.6340117	total: 241ms	remaining: 47.9s
5:	learn: 0.6278404	total: 267ms	remaining: 44.3s
6:	learn: 0.6225709	total: 291ms	remaining: 41.3s
7:	learn: 0.6183741	total: 311ms	remaining: 38.6s
8:	learn: 0.6150032	total: 334ms	remaining: 36.8s
9:	learn: 0.6104820	total: 364ms	remaining: 36s
10:	learn: 0.6079392	total: 384ms	remaining: 34.5s
11:	learn: 0.6058305	total: 412ms	remaining: 33.9s
12:	learn: 0.6039893	total: 487ms	remaining: 37s
13:	learn: 0.6025894	total: 509ms	remaining: 35.8s
14:	learn: 0.6014946	total: 531ms	remaining: 34.9s
15:	learn: 0.6003326	total: 553ms	remaining: 34s
16:	learn: 0.5990426	total: 582ms	remaining: 33.6s
17:	learn: 0.5981673	total: 604ms	remaining: 32.9s
18:	learn: 0.5972474	total: 636ms	remaining: 32.8s
19:	learn: 0.59

0.6755454545454546

In [12]:
log_loss(y_true=y_test, y_pred=clf.predict_proba(X_test))

0.5942446833736309

In [13]:
dclf = DummyClassifier(random_state=42)
dclf.fit(X_train, y_train)
dclf.score(X_test, y_test)

0.4991060606060606

In [14]:
log_loss(y_true=y_test, y_pred=dclf.predict_proba(X_test))

0.6931491426867777