In [1]:
import numpy as np
import numba as nb
import pandas as pd
from blackops.utils.catalog import start_spark_session, f
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from sklearn.dummy import DummyClassifier


In [2]:
from scipy.stats import truncnorm


def get_truncated_normal(mean=0, sd=1, low=0, upp=10):
    return truncnorm((low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

In [3]:
N = 100000

In [4]:
def get_educacion(
    edades: np.ndarray, p30=[0.1, 0.5, 0.3, 0.1], p23=[0.1, 0.4, 0.5], pelse=[0.1, 0.9]
) -> np.ndarray:
    N = edades.shape[0]
    educ = np.empty(N, dtype="<U13")
    for i in range(N):
        edad = edades[i]
        if edad > 30:
            educ[i] = np.random.choice(
                ["primaria", "secundaria", "licenciatura", "doctorado"],
                p=p30,
            )
        elif edad > 23:
            educ[i] = np.random.choice(
                ["primaria", "secundaria", "licenciatura"], p=p23
            )
        else:
            educ[i] = np.random.choice(["primaria", "secundaria"], p=pelse)
    return educ


def get_trabajo(
    edades, educaciones, p20=0.2, p2030=(0.4, 0.7), p3055=(0.8, 0.6), p55=0.1
):
    N = edades.shape[0]
    trabaj = np.empty(N, dtype=np.int8)
    for i in range(N):
        edad = edades[i]
        educacion = educaciones[i]
        if edad < 20:
            trabaj[i] = np.random.binomial(1, p=p20)
        elif 20 <= edad < 30:
            trabaj[i] = np.random.binomial(
                1, p=p2030[0] if educacion == "licenciatura" else p2030[1]
            )
        elif 30 <= edad < 55:
            trabaj[i] = np.random.binomial(
                1,
                p=p3055[0] if educacion in ("doctorado", "licenciatura") else p3055[1],
            )
        else:
            trabaj[i] = np.random.binomial(1, p=p55)
    return trabaj

In [5]:
edades = get_truncated_normal(mean=40, sd=20, low=18, upp=80).rvs(size=N).round()
educaciones = get_educacion(
    edades, p30=[0.1, 0.5, 0.3, 0.1], p23=[0.1, 0.4, 0.5], pelse=[0.1, 0.9]
)
trabajos = get_trabajo(
    edades, educaciones, p20=0.2, p2030=(0.4, 0.7), p3055=(0.8, 0.6), p55=0.1
)

df1 = pd.DataFrame(dict(edad=edades, educacion=educaciones, trabajo=trabajos))
display(df1)

Unnamed: 0,edad,educacion,trabajo
0,47.0,secundaria,0
1,37.0,doctorado,1
2,37.0,secundaria,1
3,64.0,secundaria,0
4,51.0,secundaria,0
...,...,...,...
99995,29.0,licenciatura,0
99996,27.0,secundaria,0
99997,43.0,secundaria,1
99998,41.0,secundaria,1


In [6]:
edades = get_truncated_normal(mean=55, sd=15, low=18, upp=90).rvs(size=N).round()
educaciones = get_educacion(
    edades, p30=[0.25, 0.4, 0.3, 0.05], p23=[0.2, 0.3, 0.5], pelse=[0.3, 0.7]
)
trabajos = get_trabajo(
    edades, educaciones, p20=0.1, p2030=(0.3, 0.6), p3055=(0.6, 0.5), p55=0.05
)

df2 = pd.DataFrame(dict(edad=edades, educacion=educaciones, trabajo=trabajos))
display(df2)

Unnamed: 0,edad,educacion,trabajo
0,48.0,secundaria,0
1,43.0,secundaria,0
2,76.0,secundaria,0
3,64.0,secundaria,0
4,53.0,secundaria,0
...,...,...,...
99995,47.0,primaria,0
99996,34.0,secundaria,0
99997,50.0,secundaria,1
99998,74.0,secundaria,0


In [7]:
df1["dataset"] = 0
df2["dataset"] = 1


df = pd.concat([df1, df2])
display(df)

Unnamed: 0,edad,educacion,trabajo,dataset
0,47.0,secundaria,0,0
1,37.0,doctorado,1,0
2,37.0,secundaria,1,0
3,64.0,secundaria,0,0
4,51.0,secundaria,0,0
...,...,...,...,...
99995,47.0,primaria,0,1
99996,34.0,secundaria,0,1
99997,50.0,secundaria,1,1
99998,74.0,secundaria,0,1


In [8]:
X = df.drop(columns="dataset")
y = df.dataset

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
clf = CatBoostClassifier(cat_features=["educacion"])

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Learning rate set to 0.083408
0:	learn: 0.6746164	total: 112ms	remaining: 1m 51s
1:	learn: 0.6621615	total: 140ms	remaining: 1m 9s
2:	learn: 0.6502893	total: 183ms	remaining: 1m
3:	learn: 0.6417480	total: 274ms	remaining: 1m 8s
4:	learn: 0.6346690	total: 307ms	remaining: 1m 1s
5:	learn: 0.6285845	total: 336ms	remaining: 55.7s
6:	learn: 0.6234153	total: 361ms	remaining: 51.2s
7:	learn: 0.6177144	total: 384ms	remaining: 47.6s
8:	learn: 0.6140144	total: 412ms	remaining: 45.4s
9:	learn: 0.6109393	total: 435ms	remaining: 43.1s
10:	learn: 0.6085243	total: 460ms	remaining: 41.3s
11:	learn: 0.6065068	total: 480ms	remaining: 39.5s
12:	learn: 0.6045689	total: 502ms	remaining: 38.1s
13:	learn: 0.6028447	total: 524ms	remaining: 36.9s
14:	learn: 0.6015283	total: 545ms	remaining: 35.8s
15:	learn: 0.6004206	total: 571ms	remaining: 35.1s
16:	learn: 0.5994134	total: 594ms	remaining: 34.3s
17:	learn: 0.5983432	total: 619ms	remaining: 33.8s
18:	learn: 0.5976866	total: 642ms	remaining: 33.1s
19:	learn: 0.

0.677030303030303

In [11]:
log_loss(y_true=y_test, y_pred=clf.predict_proba(X_test))

0.5919292003090475

In [12]:
dclf = DummyClassifier()
dclf.fit(X_train, y_train)
dclf.score(X_test, y_test)

0.49716666666666665

In [13]:
log_loss(y_true=y_test, y_pred=dclf.predict_proba(X_test))

0.6931668915020539