In [31]:
import numpy as np
import pandas as pd
from multiprocessing import cpu_count


import xgboost
from xgboost import XGBClassifier

In [32]:
# Notebook-wide configuration.
# Use every CPU core reported by multiprocessing for model training.
n_jobs = cpu_count()
# Directory prefixes. They are concatenated directly with file names below,
# which is why each one ends with a trailing slash.
inp_path = "data/"
sub_path = "submissions/"
# Execute the project script inside this kernel.
# NOTE(review): presumably this is what brings `yaml`, `f1_score`,
# `generate_space_dimension` and `cross_validate_score` into scope for the
# cells below, since none of them is imported in this notebook — confirm
# against src/import_parameters.py.
%run src/import_parameters.py
# "predicted" column of an earlier submission file; later cells use it as the
# reference labels for the test set.
TARGET = pd.read_csv(sub_path + "submission_target_00.csv")["predicted"]

# Data Load

In [33]:
# Load the raw input files. `inp_path` is a plain string prefix ending in "/".
metadata = pd.read_csv(inp_path + "metadata.csv")
test = pd.read_csv(inp_path + "test.csv")
train = pd.read_csv(inp_path + "train.csv")
# Shuffle the training rows with a fixed seed so the row order is reproducible.
train = train.sample(frac=1, random_state=0).reset_index(drop=True)
sub = pd.read_csv(inp_path + "submission_sample.csv")
# Read through `inp_path` like every other input (previously a hard-coded
# "data/" prefix, which would silently diverge if `inp_path` were changed).
feature_selection = pd.read_csv(inp_path + "f1_score_for_features.csv")

###########################

# Columns excluded because they have too many missing values.
high_missing_cols = ["var60", "var65", "var66"]

# Features that the feature-selection table marks with action == "drop".
cols_to_drop = feature_selection.loc[
    feature_selection["action"] == "drop", "feature"
].to_list()

# Apply both exclusion lists to train and test in lockstep so the two frames
# always end up with the same feature columns. errors="ignore" makes this
# cell safe to re-run and tolerant of columns that are already absent.
for _excluded_cols in (high_missing_cols, cols_to_drop):
    train = train.drop(columns=_excluded_cols, errors="ignore")
    test = test.drop(columns=_excluded_cols, errors="ignore")

# Disabled experiment (kept for reference): treat -999 as missing and add a
# "<col>_nan_flag" indicator for every column with more than 10% missing values.
# train = train.replace(-999,np.nan)
# nans_count = train.isna().mean().to_dict()

# cols_to_flag = []

# for k in dict(filter(lambda x: x[1] > 0.10,nans_count.items())):
#     if nans_count[k] > 0:
#         #cols_to_flag.append(k)
#         train[k+"_nan_flag"] = train[k].isna().astype(int)
#         test[k+"_nan_flag"] = test[k].isna().astype(int)

# Importando Parâmetros

In [34]:
from xtlearn.utils import load_pickle, dump_pickle

# NOTE(review): `yaml`, `f1_score` and `generate_space_dimension` are not
# imported anywhere in this notebook; presumably they are provided by
# `%run src/import_parameters.py` — confirm.

# Hyperparameters
# Use a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open("data/hyperparameters.yaml", "r") as hyparams_file:
    hyparams = yaml.safe_load(hyparams_file)

EARLY_STOPPING_ROUNDS = hyparams["EARLY_STOPPING_ROUNDS"]
EVAL_METRIC = hyparams["EVAL_METRIC"]
VERBOSE = hyparams["VERBOSE"]
RANDOM_STATE = hyparams["RANDOM_STATE"]
N_FOLDS = hyparams["N_FOLDS"]
# Only "f1_score" is mapped to a scorer; any other configured value yields None.
SCORING = f1_score if hyparams["SCORING"] == "f1_score" else None
THRESHOLD = hyparams["THRESHOLD"]

# Parameter definitions: a list of dicts with at least the keys
# "parameter", "type" and "estimate" (plus "range" and "step" for real ones).
with open("data/parameters.yaml", "r") as parameters_file:
    parameters = yaml.safe_load(parameters_file)

# Real-valued parameters: current estimates, bounds, step sizes and the
# corresponding search-space dimensions. `space` and `PARAMETER_NAMES` cover
# the real-valued parameters only; an earlier pass over *all* parameters was
# immediately overwritten by these assignments and has been removed.
params_real = [spec for spec in parameters if spec["type"] == "real"]
params = {spec["parameter"]: spec["estimate"] for spec in params_real}
limits = {spec["parameter"]: spec["range"] for spec in params_real}
steps = {spec["parameter"]: spec["step"] for spec in params_real}
space = [generate_space_dimension(spec) for spec in params_real]
PARAMETER_NAMES = [dimension.name for dimension in space]

# Discrete parameters: every entry whose type is not "real".
params_disc = [spec for spec in parameters if spec["type"] != "real"]
DISC_PAR = {spec["parameter"]: spec["estimate"] for spec in params_disc}

# Disabled: persist the current search state as pickles.
# x0 = list(params.values())
# y0 = 2

# dump_pickle(space,"data/best_space.pkl")
# dump_pickle(PARAMETER_NAMES,"data/best_PARAMETER_NAMES.pkl")
# dump_pickle(x0,"data/best_x0.pkl")
# dump_pickle(score,"data/best_y0.pkl")

# Rodando Modelo

In [35]:
# Build the classifier from the estimates loaded from the parameters file:
# `params` holds the real-valued settings, `DISC_PAR` the remaining ones.
estimator = XGBClassifier(
    n_jobs=n_jobs,
    random_state=RANDOM_STATE,
    **params,
    **DISC_PAR,
)

# Keyword arguments for the estimator's fit call; passed to the
# cross-validation helper here and reused for the final fit later on.
fit_params = {
    "early_stopping_rounds": EARLY_STOPPING_ROUNDS,
    "eval_metric": EVAL_METRIC,
    "verbose": VERBOSE,
}


print(estimator)

# Cross-validated score on the training set.
# The feature matrix is selected with `columns=`: passing the axis
# positionally (`drop(labels, 1)`) is deprecated since pandas 1.3 and raises
# TypeError from pandas 2.0 on. errors="ignore" tolerates a missing "id".
# NOTE(review): `cross_validate_score` is defined outside this notebook; the
# sign convention of the value it returns should be confirmed there.
cv = cross_validate_score(
    X=train.drop(columns=["y", "id"], errors="ignore"),
    y=train["y"],
    estimator=estimator,
    fit_params=fit_params,
    n_folds=N_FOLDS,
    scoring=SCORING,
    threshold=THRESHOLD,
    random_state=RANDOM_STATE,
    verbose=0,
)
print(cv)

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=0.8332353261601001,
              colsample_bytree=0.33470182526412845, gamma=None, gpu_id=None,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.05332448042968835, max_delta_step=0, max_depth=6,
              min_child_weight=0.36081712757499335, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=16,
              num_parallel_tree=2, random_state=42,
              reg_alpha=0.9368252292759284, reg_lambda=0.9673847226568928,
              scale_pos_weight=None, subsample=0.9359040571480106,
              tree_method=None, validate_parameters=None, verbosity=None)
-0.682572256275777


In [44]:
# Work on copies so the frames loaded earlier stay untouched.
train_data = train.copy()
test_data = test.copy()

# Training matrix and labels; bookkeeping columns are dropped when present.
X_train = train_data.drop(columns=["fold", "y", "id"], errors="ignore").values
y_train = train_data["y"].values

# Features to predict on.
X_test = test_data.drop(columns=["fold", "y", "id"], errors="ignore").values

# Best available estimate of the test labels. These are the predictions of an
# earlier submission (TARGET), not ground truth.
y_test = TARGET.values

# Point the evaluation set at the test features and reference labels.
# Plain assignment replaces any "eval_set" left behind by an earlier step and
# also works when the key is absent; the previous unconditional
# `fit_params.pop('eval_set')` raised KeyError in that case.
fit_params["eval_set"] = [(X_test, y_test)]

estimator.fit(X_train, y_train, **fit_params)

# Binarise the positive-class probability at the configured threshold.
y_pred = (estimator.predict_proba(X_test)[:, 1] > THRESHOLD).astype(int)

# y_pred = predict(estimator, X_test, threshold=threshold).astype(int)

# Pearson correlation between the reference labels and the new predictions.
corr = pd.DataFrame({0: TARGET, 1: y_pred}).corr().loc[1, 0]

print("Correlação com a estimativa:",corr)

# Write the submission file, reusing the id column of the reference submission.
submission_ids = pd.read_csv(sub_path + "submission_target_00.csv")[["id"]]
submission_ids.assign(predicted=y_pred).to_csv(
    sub_path + "submission_final.csv", index=False
)

# Compare the file just written with the best submission so far, reading both
# back from disk so the check covers what was actually saved.
best_predicted = pd.read_csv(sub_path + "submission_best.csv")["predicted"]
final_predicted = pd.read_csv(sub_path + "submission_final.csv")["predicted"]
print("Correlação com a melhor submissão:",np.corrcoef(best_predicted, final_predicted)[0][1])

Correlação com a estimativa: 0.9857438688165551
Correlação com a melhor submissão: 1.0
