In [139]:
library(readr)
library(MASS)
library(dplyr)
library(xgboost)
library(Matrix)
library(rbcb)
library(lubridate)

ERROR: Error in library(writexl): there is no package called ‘writexl’


In [None]:
base = read_csv('base_concorrencia_bq.csv')

### Prazo-Class

In [None]:
base = base %>%
  mutate(
    grupo_prazo = case_when(
      is.na(prazo_scr) ~ NA_character_,
      prazo_scr <= 12 ~ '1',
      prazo_scr > 12 & prazo_scr <= 24 ~ '2',
      prazo_scr > 24 ~ '3'      
    )
  )

base <- base %>%
  mutate(
    label_prazo = as.integer(grupo_prazo) - 1L)

base$grupo_prazo <- factor(base$grupo_prazo, levels = as.character(1:3))

base$label_prazo <- ifelse(
  is.na(base$grupo_prazo),
  NA_integer_,
  as.integer(base$grupo_prazo) - 1L
)


In [None]:
bancos <- c('Banco Inter', 'Banco do Brasil', 'Bradesco', 'C6 Bank', 
            'Caixa', 'Cooperativas', 'Itaú', 'Mercado Pago', 
            'Nubank', 'Pagseguro', 'Santander')

lista_bancos <- vector("list", length(bancos))
names(lista_bancos) <- bancos 

for (nome_banco in bancos){

    print(nome_banco)
    base_ = base %>% filter(IF_adj==nome_banco,prazo_scr <= 60, !is.na(mediana_tpv_3m), mediana_tpv_3m > 1) %>% mutate(grupo_cnae = substr(cnae_fiscal_principal,1,3),std_dev_debt_3m = tidyr::replace_na(std_dev_debt_3m,0),FullDebt_3M = tidyr::replace_na(FullDebt_3M,0),IsAcquirerActive = tidyr::replace_na(IsAcquirerActive,FALSE)) %>% mutate(FullDebt_3M_2=FullDebt_3M**2,mediana_tpv_3m_2=mediana_tpv_3m**2,desembolso_scr_2=desembolso_scr**2) %>% mutate(razao_debt_tpv = FullDebt_3M/mediana_tpv_3m,razao_tpv_if = mediana_tpv_3m/n_if) 
    base_train = base_ %>% filter(!is.na(prazo_scr))
    
    base__ = base %>% filter(IF_adj==nome_banco, !is.na(mediana_tpv_3m), mediana_tpv_3m > 1) %>% mutate(grupo_cnae = substr(cnae_fiscal_principal,1,3),std_dev_debt_3m = tidyr::replace_na(std_dev_debt_3m,0),FullDebt_3M = tidyr::replace_na(FullDebt_3M,0),IsAcquirerActive = tidyr::replace_na(IsAcquirerActive,FALSE)) %>% mutate(FullDebt_3M_2=FullDebt_3M**2,mediana_tpv_3m_2=mediana_tpv_3m**2,desembolso_scr_2=desembolso_scr**2) %>% mutate(razao_debt_tpv = FullDebt_3M/mediana_tpv_3m,razao_tpv_if = mediana_tpv_3m/n_if) 
    base_pred = base__ %>% filter(is.na(prazo_scr))
    
    idx <- sample(seq_len(nrow(base_train)), size = floor(0.8*nrow(base_train)))
    train_df <- base_train[idx, ]
    valid_df <- base_train[-idx, ]
    
    fml <- ~ factor(IsAcquirerActive) + factor(Default_Month_Prior) + FullDebt_3M + factor(month_year) + Cont_Socios + razao_debt_tpv + desembolso_scr + mediana_tpv_3m + factor(ano_inicio_atividade) + Cont_Socios 
    
    X_train <- sparse.model.matrix(fml, data = train_df)[, -1]
    X_valid <- sparse.model.matrix(fml, data = valid_df)[, -1]

    y_train <- train_df$label_prazo
    y_valid <- valid_df$label_prazo
    
    dtrain <- xgb.DMatrix(data = X_train, label = y_train)
    dvalid <- xgb.DMatrix(data = X_valid, label = y_valid)
    
    params <- list(
    objective = "multi:softmax",
    eval_metric = "merror",
    max_depth = 3,
    min_child_weight = 10,
    eta = 0.5,
    subsample = 0.8,
    colsample_bytree = 0.8,
    num_class = 3,
    seed = 1234)

    bst <- xgb.train(
    params = params,
    data = dtrain,
    nrounds = 400,
    watchlist = list(train = dtrain, valid = dvalid),
    early_stopping_rounds = 10,
    verbose = 2)
    
    pred_test <- predict(bst, dvalid)
    print(as.data.frame(cbind(pred_test,y_valid)) %>% mutate(acerto = if_else(pred_test == y_valid,1,0)) %>% summarise(acertos = sum(acerto)/n()))
        
    feature_names <- colnames(X_train)
    factor_vars <- c("month_year")

    # 1) garante factor no treino
    for (v in factor_vars) {
      train_df[[v]] <- as.factor(train_df[[v]])
    }

    # 2) captura níveis
    train_levels <- lapply(train_df[, factor_vars, drop = FALSE], levels)
    
    for (v in names(train_levels)) {
      base_pred[[v]] <- factor(base_pred[[v]], levels = train_levels[[v]])
    }

    base_pred <- base_pred %>% 
      tidyr::drop_na(FullDebt_3M, desembolso_scr, month_year, 
        Cont_Socios, razao_debt_tpv
      )

    X_pred <- sparse.model.matrix(fml, data = base_pred)[, -1]

    missing_cols <- setdiff(feature_names, colnames(X_pred))
    if (length(missing_cols) > 0) {
      add_mat <- Matrix::Matrix(0, nrow = nrow(X_pred), ncol = length(missing_cols), sparse = TRUE)
      colnames(add_mat) <- missing_cols
      X_pred <- cbind(X_pred, add_mat)
    }

    # Remove colunas extras (apareceram no novo mas não existem no treino)
    extra_cols <- setdiff(colnames(X_pred), feature_names)
    if (length(extra_cols) > 0) {
      X_pred <- X_pred[, setdiff(colnames(X_pred), extra_cols), drop = FALSE]
    }

    # Reordena exatamente como no treino
    X_pred <- X_pred[, feature_names, drop = FALSE]

    dpred <- xgb.DMatrix(X_pred)

    pred <- predict(bst, dpred)

    base_pred$label_prazo = pred
    
    lista_bancos[[nome_banco]] <- rbind(base_pred,base_train)

}

In [None]:
base_prazo_filled = bind_rows(lista_bancos)

In [None]:
base_prazo_filled %>% write_csv(.,'base_prazo_filled.csv')

### Prazo-Reg

In [None]:
bancos <- c('Banco Inter', 'Banco do Brasil', 'Bradesco', 'C6 Bank', 
            'Caixa', 'Cooperativas', 'Itaú', 'Mercado Pago', 
            'Nubank', 'Pagseguro', 'Santander')

lista_bancos <- vector("list", length(bancos))
names(lista_bancos) <- bancos 

for (nome_banco in bancos){

    print(nome_banco)
    base_ = base %>% filter(IF_adj==nome_banco,prazo_scr <= 60, !is.na(mediana_tpv_3m), mediana_tpv_3m > 1) %>% mutate(grupo_cnae = substr(cnae_fiscal_principal,1,3),std_dev_debt_3m = tidyr::replace_na(std_dev_debt_3m,0),FullDebt_3M = tidyr::replace_na(FullDebt_3M,0),IsAcquirerActive = tidyr::replace_na(IsAcquirerActive,FALSE)) %>% mutate(FullDebt_3M_2=FullDebt_3M**2,mediana_tpv_3m_2=mediana_tpv_3m**2,desembolso_scr_2=desembolso_scr**2) %>% mutate(razao_debt_tpv = FullDebt_3M/mediana_tpv_3m,razao_tpv_if = mediana_tpv_3m/n_if) 
    base_train = base_ %>% filter(!is.na(prazo_scr))

    base__ = base %>% filter(IF_adj==nome_banco, !is.na(mediana_tpv_3m), mediana_tpv_3m > 1) %>% mutate(grupo_cnae = substr(cnae_fiscal_principal,1,3),std_dev_debt_3m = tidyr::replace_na(std_dev_debt_3m,0),FullDebt_3M = tidyr::replace_na(FullDebt_3M,0),IsAcquirerActive = tidyr::replace_na(IsAcquirerActive,FALSE)) %>% mutate(FullDebt_3M_2=FullDebt_3M**2,mediana_tpv_3m_2=mediana_tpv_3m**2,desembolso_scr_2=desembolso_scr**2) %>% mutate(razao_debt_tpv = FullDebt_3M/mediana_tpv_3m,razao_tpv_if = mediana_tpv_3m/n_if) 
    base_pred = base__ %>% filter(is.na(prazo_scr))
    
    idx <- sample(seq_len(nrow(base_train)), size = floor(0.9*nrow(base_train)))
    train_df <- base_train[idx, ]
    valid_df <- base_train[-idx, ]

    fml <- ~ factor(IsAcquirerActive) + factor(Default_Month_Prior) + log(std_dev_debt_3m+1) + log(FullDebt_3M+1) + log(FullDebt_3M_2+1) + log(mediana_tpv_3m) + log(mediana_tpv_3m_2) + log(desembolso_scr) + log(desembolso_scr_2) + factor(month_year) + n_if +
      factor(ano_inicio_atividade) + factor(uf) +
      Cont_Socios + factor(grupo_cnae) + razao_debt_tpv + log(razao_tpv_if+1)

    X_train <- sparse.model.matrix(fml, data = train_df)[, -1]
    X_valid <- sparse.model.matrix(fml, data = valid_df)[, -1]

    y_train <- train_df$prazo_scr
    y_valid <- valid_df$prazo_scr

    y_train_log <- log1p(y_train)
    y_valid_log <- log1p(y_valid)

    dtrain <- xgb.DMatrix(X_train, label = y_train_log)
    dvalid <- xgb.DMatrix(X_valid, label = y_valid_log)

    params <- list(
      objective = "reg:absoluteerror",
      eval_metric = "mae",
      max_depth = 6,
      min_child_weight = 5,
      eta = 0.05,
      subsample = 0.8,
      colsample_bytree = 0.8,
      seed = 1234)

    bst <- xgb.train(
    params = params,
    data = dtrain,
    nrounds = 800,
    watchlist = list(train = dtrain, valid = dvalid),
    early_stopping_rounds = 30,
    verbose = 2)
    
    pred_log <- predict(bst, dvalid)
    pred <- expm1(pred_log)

    print(mean(abs(pred - y_valid)))
    
    breaks <- seq(0, 60, by = 10)
    
    bins <- cut(
      y_valid,
      breaks = breaks,
      right = FALSE,
      include.lowest = TRUE
    )

    tab <- table(bins)
    tab_perc <- prop.table(tab)

    res <- data.frame(
      faixa = names(tab),
      n = as.integer(tab),
      perc = round(100 * as.numeric(tab_perc), 1)
    )
    
    plot(
      y_valid, pred,
      xlab = "Prazo real (meses)",
      ylab = "Prazo previsto (meses)",
      main = sprintf("Predito vs Real (com distribuição do prazo real) %s", as.character(nome_banco)),
      pch = 16, col = rgb(0, 0, 0, 0.3)
    )

    abline(0, 1, col = "red", lwd = 2)

    # Linhas verticais das faixas
    abline(v = breaks, col = "grey80", lty = 3)

    # Anotar percentuais no topo
    y_top <- max(pred, na.rm = TRUE) * 1.02

    for (i in seq_along(tab)) {
      if (tab[i] > 0) {
        x_mid <- (breaks[i] + breaks[i + 1]) / 2
        text(
          x = x_mid,
          y = y_top,
          labels = paste0(round(100 * tab[i] / sum(tab), 1), "%"),
          cex = 0.8
        )
      }
    }


    feature_names <- colnames(X_train)
    factor_vars <- c("IsAcquirerActive","Default_Month_Prior","month_year",
                     "ano_inicio_atividade","uf","grupo_cnae")

    # 1) garante factor no treino
    for (v in factor_vars) {
      train_df[[v]] <- as.factor(train_df[[v]])
    }

    # 2) captura níveis
    train_levels <- lapply(train_df[, factor_vars, drop = FALSE], levels)
    
    for (v in names(train_levels)) {
      base_pred[[v]] <- factor(base_pred[[v]], levels = train_levels[[v]])
    }

    base_pred <- base_pred %>% 
      tidyr::drop_na(
        ano_inicio_atividade, grupo_cnae, IsAcquirerActive, 
        Default_Month_Prior, std_dev_debt_3m, FullDebt_3M, 
        mediana_tpv_3m, desembolso_scr, month_year, n_if, 
        uf, Cont_Socios, razao_debt_tpv, razao_tpv_if
      )

    X_pred <- sparse.model.matrix(fml, data = base_pred)[, -1]

    # Adiciona colunas faltantes (que existiam no treino mas não apareceram no novo)
    missing_cols <- setdiff(feature_names, colnames(X_pred))
    if (length(missing_cols) > 0) {
      add_mat <- Matrix::Matrix(0, nrow = nrow(X_pred), ncol = length(missing_cols), sparse = TRUE)
      colnames(add_mat) <- missing_cols
      X_pred <- cbind(X_pred, add_mat)
    }

    # Remove colunas extras (apareceram no novo mas não existem no treino)
    extra_cols <- setdiff(colnames(X_pred), feature_names)
    if (length(extra_cols) > 0) {
      X_pred <- X_pred[, setdiff(colnames(X_pred), extra_cols), drop = FALSE]
    }

    # Reordena exatamente como no treino
    X_pred <- X_pred[, feature_names, drop = FALSE]

    dpred <- xgb.DMatrix(X_pred)

    pred_log <- predict(bst, dpred)

    pred_prazo <- expm1(pred_log)  

    base_pred$prazo_scr = round(pred_prazo)
    
    lista_bancos[[nome_banco]] <- rbind(base_pred,base_train)
}

In [None]:
base_prazo_filled = bind_rows(lista_bancos)

In [None]:
base_prazo_filled %>% write_csv(.,'base_prazo_filled.csv')

### Taxa

In [2]:
base_prazo_filled = read_csv('base_prazo_filled.csv')

[1mRows: [22m[34m162385[39m [1mColumns: [22m[34m39[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m   (5): IF_adj, RootDocumentNumber, uf, flag_operacao, document
[32mdbl[39m  (30): flag_desembolsou_stone, desembolso_scr, limite_stone, prazo_scr, ...
[33mlgl[39m   (2): IsAcquirerActive, Default_Month_Prior
[34mdate[39m  (2): month_year, reference_month

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [3]:
colnames(base_prazo_filled)

In [4]:
base_prazo_filled = base_prazo_filled %>% mutate(flag_filled = case_when(!is.na(taxa_scr) ~ 0, TRUE ~ 1))

In [5]:
selic_raw <- get_series(c(selic = 1178), start_date = "2024-01-01")

df_selic <- selic_raw %>%
  mutate(month_year = floor_date(date, "month")) %>%
  group_by(month_year) %>%
  filter(date == max(date)) %>% # Pega a Selic vigente no fim do mês
  ungroup() %>%
  select(month_year, selic_fim_mes = selic) 
  


In [6]:
df_selic <- df_selic %>%
  mutate(
    selic_decimal_ano = selic_fim_mes / 100,
    selic_mensal_decimal = (1 + selic_decimal_ano)^(1/12) - 1
  )

In [7]:
df_selic = df_selic %>% select(month_year,selic_mensal_decimal)

In [8]:
base_prazo_filled = base_prazo_filled %>% left_join(.,df_selic,by=c("month_year"))  

In [9]:
base_prazo_filled = base_prazo_filled %>%
  mutate(
    grupo_taxa = case_when(
      is.na(taxa_scr) ~ NA_character_,
      taxa_scr <= 0.035 ~ '1',
      taxa_scr > 0.035 & taxa_scr <= 0.05 ~ '2',
      taxa_scr > 0.05 & taxa_scr <= 0.065 ~ '3',
      taxa_scr > 0.065 ~ '4'      
    )
  )

base_prazo_filled <- base_prazo_filled %>%
  mutate(
    label_taxa = as.integer(grupo_taxa) - 1L)

base_prazo_filled$grupo_taxa <- factor(base_prazo_filled$grupo_taxa, levels = as.character(1:7))

base_prazo_filled$label_taxa <- ifelse(
  is.na(base_prazo_filled$grupo_taxa),
  NA_integer_,
  as.integer(base_prazo_filled$grupo_taxa) - 1L
)


In [10]:
bancos <- c('Banco Inter', 'Banco do Brasil', 'Bradesco', 'C6 Bank', 
            'Caixa', 'Cooperativas', 'Itaú', 'Mercado Pago', 
            'Nubank', 'Pagseguro', 'Santander')

lista_bancos_tx <- vector("list", length(bancos))
names(lista_bancos_tx) <- bancos 

for (nome_banco in bancos){

    print(nome_banco)
    base_train_tx = base_prazo_filled %>% filter(IF_adj == nome_banco,!is.na(taxa_scr)) 
    base_pred_tx = base_prazo_filled %>% filter(IF_adj == nome_banco,is.na(taxa_scr))

    idx_tx <- sample(seq_len(nrow(base_train_tx)), size = floor(0.8*nrow(base_train_tx)))
    train_df_tx <- base_train_tx[idx_tx, ]
    valid_df_tx <- base_train_tx[-idx_tx, ]
    
    fml_tx <- ~ FullDebt_3M + factor(month_year) + Cont_Socios + razao_debt_tpv + desembolso_scr + factor(label_prazo) + selic_mensal_decimal 
    
    X_train_tx <- sparse.model.matrix(fml_tx, data = train_df_tx)[, -1]
    X_valid_tx <- sparse.model.matrix(fml_tx, data = valid_df_tx)[, -1]
    
    y_train_tx <- train_df_tx$label_taxa
    y_valid_tx <- valid_df_tx$label_taxa
    
    dtrain_tx <- xgb.DMatrix(data = X_train_tx, label = y_train_tx)
    dvalid_tx <- xgb.DMatrix(data = X_valid_tx, label = y_valid_tx)

    params_tx <- list(
    objective = "multi:softmax",
    eval_metric = "merror",
    max_depth = 4,
    min_child_weight = 3,
    eta = 0.8,
    subsample = 0.8,
    colsample_bytree = 0.8,
    num_class = 4,
    seed = 1234)
    
    bst_tx <- xgb.train(
    params = params_tx,
    data = dtrain_tx,
    nrounds = 800,
    watchlist = list(train = dtrain_tx, valid = dvalid_tx),
    early_stopping_rounds = 10,
    verbose = 2)
    
    pred_test <- predict(bst_tx, dvalid_tx)
    print(as.data.frame(cbind(pred_test,y_valid_tx)) %>% mutate(acerto = if_else(pred_test == y_valid_tx,1,0)) %>% summarise(acertos = sum(acerto)/n()))
    
    feature_names_tx <- colnames(X_train_tx)
    factor_vars_tx <- c("month_year")

    # 1) garante factor no treino
    for (v in factor_vars_tx) {
      train_df_tx[[v]] <- as.factor(train_df_tx[[v]])
    }

    # 2) captura níveis
    train_levels_tx <- lapply(train_df_tx[, factor_vars_tx, drop = FALSE], levels)
    
    for (v in names(train_levels_tx)) {
      base_pred_tx[[v]] <- factor(base_pred_tx[[v]], levels = train_levels_tx[[v]])
    }

    base_pred_tx <- base_pred_tx %>% 
      tidyr::drop_na(FullDebt_3M, desembolso_scr, month_year, 
        Cont_Socios, razao_debt_tpv
      )

    X_pred_tx <- sparse.model.matrix(fml_tx, data = base_pred_tx)[, -1]

    # Adiciona colunas faltantes (que existiam no treino mas não apareceram no novo)
    missing_cols <- setdiff(feature_names_tx, colnames(X_pred_tx))
    if (length(missing_cols) > 0) {
      add_mat <- Matrix::Matrix(0, nrow = nrow(X_pred_tx), ncol = length(missing_cols), sparse = TRUE)
      colnames(add_mat) <- missing_cols
      X_pred_tx <- cbind(X_pred_tx, add_mat)
    }

    # Remove colunas extras (apareceram no novo mas não existem no treino)
    extra_cols <- setdiff(colnames(X_pred_tx), feature_names_tx)
    if (length(extra_cols) > 0) {
      X_pred_tx <- X_pred_tx[, setdiff(colnames(X_pred_tx), extra_cols), drop = FALSE]
    }

    # Reordena exatamente como no treino
    X_pred_tx <- X_pred_tx[, feature_names_tx, drop = FALSE]

    dpred_tx <- xgb.DMatrix(X_pred_tx)

    pred_tx <- predict(bst_tx, dpred_tx)

    base_pred_tx$taxa_scr = round(pred_tx)
    
    lista_bancos_tx[[nome_banco]] <- rbind(base_pred_tx,base_train_tx)
    
}

[1] "Banco Inter"




Multiple eval metrics are present. Will use valid_merror for early stopping.
Will train until valid_merror hasn't improved in 10 rounds.

[1]	train-merror:0.320423	valid-merror:0.535211 
[2]	train-merror:0.267606	valid-merror:0.619718 
[3]	train-merror:0.242958	valid-merror:0.619718 
[4]	train-merror:0.211268	valid-merror:0.605634 
[5]	train-merror:0.176056	valid-merror:0.619718 
[6]	train-merror:0.154930	valid-merror:0.619718 
[7]	train-merror:0.154930	valid-merror:0.619718 
[8]	train-merror:0.137324	valid-merror:0.619718 
[9]	train-merror:0.140845	valid-merror:0.619718 
[10]	train-merror:0.126761	valid-merror:0.619718 
Stopping. Best iteration:
[11]	train-merror:0.098592	valid-merror:0.619718

[11]	train-merror:0.098592	valid-merror:0.619718 
    acertos
1 0.4647887
[1] "Banco do Brasil"




Multiple eval metrics are present. Will use valid_merror for early stopping.
Will train until valid_merror hasn't improved in 10 rounds.

[1]	train-merror:0.352507	valid-merror:0.388235 
[2]	train-merror:0.317109	valid-merror:0.376471 
[3]	train-merror:0.297935	valid-merror:0.388235 
[4]	train-merror:0.261062	valid-merror:0.382353 
[5]	train-merror:0.255162	valid-merror:0.388235 
[6]	train-merror:0.240413	valid-merror:0.382353 
[7]	train-merror:0.221239	valid-merror:0.382353 
[8]	train-merror:0.216814	valid-merror:0.370588 
[9]	train-merror:0.203540	valid-merror:0.388235 
[10]	train-merror:0.185841	valid-merror:0.382353 
[11]	train-merror:0.172566	valid-merror:0.382353 
[12]	train-merror:0.159292	valid-merror:0.382353 
[13]	train-merror:0.154867	valid-merror:0.376471 
[14]	train-merror:0.134218	valid-merror:0.370588 
[15]	train-merror:0.135693	valid-merror:0.364706 
[16]	train-merror:0.129794	valid-merror:0.382353 
[17]	train-merror:0.115044	valid-merror:0.394118 
[18]	train-merror:0.1



Multiple eval metrics are present. Will use valid_merror for early stopping.
Will train until valid_merror hasn't improved in 10 rounds.

[1]	train-merror:0.252652	valid-merror:0.292308 
[2]	train-merror:0.222758	valid-merror:0.273077 
[3]	train-merror:0.221794	valid-merror:0.261538 
[4]	train-merror:0.215043	valid-merror:0.257692 
[5]	train-merror:0.206365	valid-merror:0.261538 
[6]	train-merror:0.196721	valid-merror:0.257692 
[7]	train-merror:0.189007	valid-merror:0.257692 
[8]	train-merror:0.184185	valid-merror:0.257692 
[9]	train-merror:0.177435	valid-merror:0.261538 
[10]	train-merror:0.172613	valid-merror:0.261538 
[11]	train-merror:0.171649	valid-merror:0.269231 
[12]	train-merror:0.169720	valid-merror:0.269231 
[13]	train-merror:0.165863	valid-merror:0.276923 
Stopping. Best iteration:
[14]	train-merror:0.149470	valid-merror:0.284615

[14]	train-merror:0.149470	valid-merror:0.284615 
    acertos
1 0.7423077
[1] "C6 Bank"




Multiple eval metrics are present. Will use valid_merror for early stopping.
Will train until valid_merror hasn't improved in 10 rounds.

[1]	train-merror:0.321285	valid-merror:0.523810 
[2]	train-merror:0.277108	valid-merror:0.507937 
[3]	train-merror:0.220884	valid-merror:0.539683 
[4]	train-merror:0.224900	valid-merror:0.571429 
[5]	train-merror:0.208835	valid-merror:0.539683 
[6]	train-merror:0.168675	valid-merror:0.539683 
[7]	train-merror:0.136546	valid-merror:0.523810 
[8]	train-merror:0.128514	valid-merror:0.523810 
[9]	train-merror:0.124498	valid-merror:0.492063 
[10]	train-merror:0.112450	valid-merror:0.523810 
[11]	train-merror:0.092369	valid-merror:0.507937 
[12]	train-merror:0.080321	valid-merror:0.476190 
[13]	train-merror:0.080321	valid-merror:0.507937 
[14]	train-merror:0.060241	valid-merror:0.507937 
[15]	train-merror:0.064257	valid-merror:0.507937 
[16]	train-merror:0.052209	valid-merror:0.507937 
[17]	train-merror:0.044177	valid-merror:0.492063 
[18]	train-merror:0.0



Multiple eval metrics are present. Will use valid_merror for early stopping.
Will train until valid_merror hasn't improved in 10 rounds.

[1]	train-merror:0.138921	valid-merror:0.192661 
[2]	train-merror:0.123995	valid-merror:0.192661 
[3]	train-merror:0.118255	valid-merror:0.188073 
[4]	train-merror:0.115959	valid-merror:0.192661 
[5]	train-merror:0.107922	valid-merror:0.188073 
[6]	train-merror:0.103330	valid-merror:0.188073 
[7]	train-merror:0.097589	valid-merror:0.183486 
[8]	train-merror:0.095293	valid-merror:0.188073 
[9]	train-merror:0.084960	valid-merror:0.188073 
[10]	train-merror:0.084960	valid-merror:0.188073 
[11]	train-merror:0.082664	valid-merror:0.192661 
[12]	train-merror:0.079219	valid-merror:0.192661 
[13]	train-merror:0.076923	valid-merror:0.197248 
[14]	train-merror:0.065442	valid-merror:0.211009 
[15]	train-merror:0.057405	valid-merror:0.211009 
[16]	train-merror:0.060850	valid-merror:0.211009 
Stopping. Best iteration:
[17]	train-merror:0.050517	valid-merror:0.206



Multiple eval metrics are present. Will use valid_merror for early stopping.
Will train until valid_merror hasn't improved in 10 rounds.

[1]	train-merror:0.353468	valid-merror:0.384615 
[2]	train-merror:0.350419	valid-merror:0.374714 
[3]	train-merror:0.348514	valid-merror:0.374714 
[4]	train-merror:0.343559	valid-merror:0.377761 
[5]	train-merror:0.341654	valid-merror:0.380807 
[6]	train-merror:0.341654	valid-merror:0.379284 
[7]	train-merror:0.335747	valid-merror:0.384615 
[8]	train-merror:0.333079	valid-merror:0.385377 
[9]	train-merror:0.333079	valid-merror:0.389947 
[10]	train-merror:0.329649	valid-merror:0.386900 
[11]	train-merror:0.326791	valid-merror:0.387662 
Stopping. Best iteration:
[12]	train-merror:0.322980	valid-merror:0.387662

[12]	train-merror:0.322980	valid-merror:0.387662 
    acertos
1 0.6252856
[1] "Itaú"




Multiple eval metrics are present. Will use valid_merror for early stopping.
Will train until valid_merror hasn't improved in 10 rounds.

[1]	train-merror:0.230275	valid-merror:0.271062 
[2]	train-merror:0.205505	valid-merror:0.267399 
[3]	train-merror:0.183486	valid-merror:0.256410 
[4]	train-merror:0.172477	valid-merror:0.245421 
[5]	train-merror:0.158716	valid-merror:0.260073 
[6]	train-merror:0.149541	valid-merror:0.238095 
[7]	train-merror:0.144037	valid-merror:0.238095 
[8]	train-merror:0.139450	valid-merror:0.241758 
[9]	train-merror:0.129358	valid-merror:0.238095 
[10]	train-merror:0.120183	valid-merror:0.256410 
[11]	train-merror:0.112844	valid-merror:0.241758 
[12]	train-merror:0.111009	valid-merror:0.260073 
[13]	train-merror:0.103670	valid-merror:0.249084 
[14]	train-merror:0.102752	valid-merror:0.245421 
[15]	train-merror:0.087156	valid-merror:0.256410 
Stopping. Best iteration:
[16]	train-merror:0.080734	valid-merror:0.252747

[16]	train-merror:0.080734	valid-merror:0.252



Multiple eval metrics are present. Will use valid_merror for early stopping.
Will train until valid_merror hasn't improved in 10 rounds.

[1]	train-merror:0.482759	valid-merror:0.702703 
[2]	train-merror:0.365517	valid-merror:0.810811 
[3]	train-merror:0.331034	valid-merror:0.837838 
[4]	train-merror:0.296552	valid-merror:0.810811 
[5]	train-merror:0.241379	valid-merror:0.810811 
[6]	train-merror:0.241379	valid-merror:0.783784 
[7]	train-merror:0.213793	valid-merror:0.810811 
[8]	train-merror:0.213793	valid-merror:0.810811 
[9]	train-merror:0.220690	valid-merror:0.810811 
[10]	train-merror:0.172414	valid-merror:0.810811 
Stopping. Best iteration:
[11]	train-merror:0.165517	valid-merror:0.783784

[11]	train-merror:0.165517	valid-merror:0.783784 
    acertos
1 0.2972973
[1] "Nubank"




Multiple eval metrics are present. Will use valid_merror for early stopping.
Will train until valid_merror hasn't improved in 10 rounds.

[1]	train-merror:0.417485	valid-merror:0.428291 
[2]	train-merror:0.399804	valid-merror:0.406680 
[3]	train-merror:0.394892	valid-merror:0.418468 
[4]	train-merror:0.386542	valid-merror:0.414538 
[5]	train-merror:0.376473	valid-merror:0.416503 
[6]	train-merror:0.374754	valid-merror:0.410609 
[7]	train-merror:0.367878	valid-merror:0.416503 
[8]	train-merror:0.360511	valid-merror:0.413556 
[9]	train-merror:0.360511	valid-merror:0.413556 
[10]	train-merror:0.351424	valid-merror:0.407662 
[11]	train-merror:0.345530	valid-merror:0.405697 
[12]	train-merror:0.340128	valid-merror:0.407662 
[13]	train-merror:0.334971	valid-merror:0.410609 
[14]	train-merror:0.333743	valid-merror:0.421415 
[15]	train-merror:0.327849	valid-merror:0.422397 
[16]	train-merror:0.323428	valid-merror:0.417485 
[17]	train-merror:0.320727	valid-merror:0.431238 
[18]	train-merror:0.3



Multiple eval metrics are present. Will use valid_merror for early stopping.
Will train until valid_merror hasn't improved in 10 rounds.

[1]	train-merror:0.355422	valid-merror:0.595238 
[2]	train-merror:0.313253	valid-merror:0.785714 
[3]	train-merror:0.259036	valid-merror:0.809524 
[4]	train-merror:0.204819	valid-merror:0.809524 
[5]	train-merror:0.192771	valid-merror:0.809524 
[6]	train-merror:0.174699	valid-merror:0.738095 
[7]	train-merror:0.144578	valid-merror:0.738095 
[8]	train-merror:0.126506	valid-merror:0.785714 
[9]	train-merror:0.102410	valid-merror:0.761905 
[10]	train-merror:0.084337	valid-merror:0.785714 
Stopping. Best iteration:
[11]	train-merror:0.084337	valid-merror:0.857143

[11]	train-merror:0.084337	valid-merror:0.857143 
    acertos
1 0.4047619
[1] "Santander"




Multiple eval metrics are present. Will use valid_merror for early stopping.
Will train until valid_merror hasn't improved in 10 rounds.

[1]	train-merror:0.352291	valid-merror:0.440252 
[2]	train-merror:0.315956	valid-merror:0.421384 
[3]	train-merror:0.289100	valid-merror:0.459119 
[4]	train-merror:0.271722	valid-merror:0.427673 
[5]	train-merror:0.252765	valid-merror:0.408805 
[6]	train-merror:0.227488	valid-merror:0.446541 
[7]	train-merror:0.225908	valid-merror:0.440252 
[8]	train-merror:0.219589	valid-merror:0.396226 
[9]	train-merror:0.203791	valid-merror:0.396226 
[10]	train-merror:0.192733	valid-merror:0.421384 
[11]	train-merror:0.181675	valid-merror:0.421384 
[12]	train-merror:0.181675	valid-merror:0.452830 
[13]	train-merror:0.167457	valid-merror:0.465409 
[14]	train-merror:0.153239	valid-merror:0.440252 
[15]	train-merror:0.151659	valid-merror:0.446541 
[16]	train-merror:0.151659	valid-merror:0.465409 
[17]	train-merror:0.139021	valid-merror:0.440252 
Stopping. Best iterat

In [11]:
base_tx_filled = bind_rows(lista_bancos_tx)

In [12]:
base_tx_filled = base_tx_filled %>% mutate(grupo_tx_scr = case_when(is.na(label_taxa) ~ taxa_scr,
                                             !is.na(label_taxa) ~ label_taxa)) 

In [13]:
base_tx_filled_stone = base_tx_filled %>% filter(!is.na(limite_stone)) %>% mutate(grupo_tx_stone = case_when(taxa_stone > 0 & taxa_stone <= 0.0349 ~ 0,
                                                                                                             taxa_stone > 0.0349 & taxa_stone <= 0.0499 ~ 1,
                                                                                                             taxa_stone > 0.0499 & taxa_stone <= 0.0649 ~ 2,
                                                                                                             taxa_stone > 0.0649 ~ 3)) 

In [14]:
base_tx_filled_stone = base_tx_filled_stone %>%
  mutate(
    grupo_prazo_stone = case_when(
      is.na(prazo_stone) ~ NA_character_,
      prazo_stone <= 12 ~ '1',
      prazo_stone > 12 & prazo_stone <= 24 ~ '2',
      prazo_stone > 24 ~ '3'      
    )
  )

base_tx_filled_stone <- base_tx_filled_stone %>%
  mutate(
    label_prazo_stone = as.integer(grupo_prazo_stone) - 1L)

base_tx_filled_stone$grupo_prazo_stone <- factor(base_tx_filled_stone$grupo_prazo_stone, levels = as.character(1:3))

base_tx_filled_stone$label_prazo_stone <- ifelse(
  is.na(base_tx_filled_stone$grupo_prazo_stone),
  NA_integer_,
  as.integer(base_tx_filled_stone$grupo_prazo_stone) - 1L
)


In [16]:
# Base com os desembolsos das IFs de Alta Penetração de clientes que estavam ativos e cientes da oferta Stone

base_tx_filled_stone_ = base_tx_filled_stone %>% select('IF_adj','RootDocumentNumber','month_year','flag_desembolsou_stone','label_prazo','label_prazo_stone','desembolso_scr','limite_stone','grupo_tx_stone','prazo_scr','taxa_scr','taxa_stone','IsAcquirerActive','flag_interacao','flag_operacao','grupo_tx_scr','diferenca_prazo','diferenca_limite','flag_filled') %>% filter(flag_interacao == TRUE,flag_operacao=='automatica') %>% filter(IF_adj %in% c('Banco Inter','Pagseguro','Santander','C6 Bank','Nubank','Mercado Pago')) 

In [17]:
base_orig = base_tx_filled_stone_ %>% filter(flag_filled == 0)
base_tx_filled_stone__ = base_tx_filled_stone_ %>% filter(flag_filled == 1)

In [94]:
base_tx_filled_stone__  = base_tx_filled_stone__ %>% mutate(limite_stone_melhor = if_else(limite_stone > desembolso_scr,1,0),prazo_stone_melhor = if_else(label_prazo_stone > label_prazo,1,0),taxa_stone_melhor = if_else(grupo_tx_stone < grupo_tx_scr,1,0)) %>% mutate(qtd_stone_melhor = limite_stone_melhor+prazo_stone_melhor+taxa_stone_melhor)

In [146]:
base_tx_filled_stone__ %>% filter(qtd_stone_melhor == 1) %>% summarise(total_desembolso_scr = sum(desembolso_scr),n=n()
                                                                )

total_desembolso_scr,n
<dbl>,<int>
27454307,463


In [20]:
base_tx_filled_stone__ %>% filter(qtd_stone_melhor == 2) %>% summarise(total_desembolso_scr = sum(desembolso_scr))

total_desembolso_scr
<dbl>
1059552


In [21]:
base_tx_filled_stone__ %>% filter(qtd_stone_melhor == 3) %>% summarise(total_desembolso_scr = sum(desembolso_scr))

total_desembolso_scr
<dbl>
499529.5


In [143]:
base_tx_filled_stone__ %>% filter(qtd_stone_melhor == 0) %>% summarise(total_desembolso_scr = sum(desembolso_scr),n=n())

total_desembolso_scr,n
<dbl>,<int>
62762943,512


In [145]:
512+57

In [23]:
base_orig = base_orig %>% mutate(limite_stone_melhor = if_else(limite_stone > desembolso_scr,1,0),prazo_stone_melhor = if_else(label_prazo_stone > label_prazo,1,0),taxa_stone_melhor = if_else(grupo_tx_stone < grupo_tx_scr,1,0)) %>% mutate(qtd_stone_melhor = limite_stone_melhor+prazo_stone_melhor+taxa_stone_melhor)

In [24]:
base_orig %>% filter(qtd_stone_melhor == 1) %>% summarise(total_desembolso_scr = sum(desembolso_scr))

total_desembolso_scr
<dbl>
1983783


In [25]:
base_orig %>% filter(qtd_stone_melhor == 2) %>% summarise(total_desembolso_scr = sum(desembolso_scr))

total_desembolso_scr
<dbl>
957250.6


In [26]:
base_orig %>% filter(qtd_stone_melhor == 3) %>% summarise(total_desembolso_scr = sum(desembolso_scr))

total_desembolso_scr
<dbl>
94083.84


In [144]:
base_orig %>% filter(qtd_stone_melhor == 0) %>% summarise(total_desembolso_scr = sum(desembolso_scr),n=n())

total_desembolso_scr,n
<dbl>,<int>
3782102,57


In [45]:
# soma condição_melhor == 1
(27454307 + 1983783)/98593550.94

In [48]:
# soma condição_melhor == 2
(1059552 + 957250.6)/98593550.94

In [47]:
# soma condição_melhor == 3
(499529.5 + 94083.84)/98593550.94

In [46]:
# soma condição_melhor == 0
(62762943 + 3782102)/98593550.94

In [142]:
# total calibrado
29438090+2016802.6+593613.34+66545045

In [61]:
v1 = base_tx_filled_stone__ %>% filter(qtd_stone_melhor == 0) %>% mutate(razao_limite = abs(diferenca_limite)/desembolso_scr) %>% filter((razao_limite <= 0.1)) %>% summarise(soma_des = sum(desembolso_scr)) %>% pull(soma_des)
v2 = base_orig %>% filter(qtd_stone_melhor == 0) %>% mutate(razao_limite = abs(diferenca_limite)/desembolso_scr) %>% filter((razao_limite <= 0.1)) %>% summarise(soma_des = sum(desembolso_scr)) %>% pull(soma_des)

print(v1+v2)

[1] 4465327


In [64]:
v1 = base_tx_filled_stone__ %>% filter(qtd_stone_melhor == 0) %>% filter(grupo_tx_stone==2,grupo_tx_scr>=2) %>% summarise(soma_des = sum(desembolso_scr)) %>% pull(soma_des)
v2 = base_orig %>% filter(qtd_stone_melhor == 0) %>% filter(grupo_tx_stone==2,grupo_tx_scr>=2) %>% summarise(soma_des = sum(desembolso_scr)) %>% pull(soma_des)

print(v1+v2)

[1] 103589.9


In [None]:
v1 = base_tx_filled_stone__ %>% filter(qtd_stone_melhor == 0) %>% filter(grupo_tx_stone==2,grupo_tx_scr>=2) %>% summarise(soma_des = sum(desembolso_scr)) %>% pull(soma_des)
v2 = base_orig %>% filter(qtd_stone_melhor == 0) %>% filter(grupo_tx_stone==2,grupo_tx_scr>=2) %>% summarise(soma_des = sum(desembolso_scr)) %>% pull(soma_des)

print(v1+v2)

In [28]:
# Base com os desembolsos das IFs de Baixa Penetração de clientes que estavam ativos e cientes da oferta Stone

base_tx_filled_stone_baixapen = base_tx_filled_stone %>% select('IF_adj','RootDocumentNumber','month_year','label_prazo','flag_desembolsou_stone','desembolso_scr','label_prazo','label_prazo_stone','limite_stone','grupo_tx_stone','prazo_stone','prazo_scr','taxa_scr','taxa_stone','IsAcquirerActive','flag_interacao','flag_operacao','grupo_tx_scr','diferenca_prazo','diferenca_limite','flag_filled') %>% filter(flag_interacao == TRUE,flag_operacao=='automatica') %>% filter(IF_adj %in% c('Banco do Brasil','Bradesco','Caixa','Cooperativas','Itaú','Outros'))

In [29]:
base_orig_baixapen = base_tx_filled_stone_baixapen %>% filter(flag_filled == 0)
base_tx_filled_stone__baixapen = base_tx_filled_stone_baixapen %>% filter(flag_filled == 1)

In [121]:
base_tx_filled_stone__baixapen_ = base_tx_filled_stone__baixapen %>% mutate(limite_stone_melhor = if_else(limite_stone > desembolso_scr,1,0),prazo_stone_melhor = if_else(label_prazo_stone > label_prazo,1,0),taxa_stone_melhor = if_else(grupo_tx_stone < grupo_tx_scr,1,0)) %>% mutate(qtd_stone_melhor = limite_stone_melhor+prazo_stone_melhor+taxa_stone_melhor)

In [155]:
base_tx_filled_stone__baixapen_ %>% filter(qtd_stone_melhor == 1) %>% summarise(total_desembolso_scr = sum(desembolso_scr),n=n())

total_desembolso_scr,n
<dbl>,<int>
323105877,4531


In [154]:
base_tx_filled_stone__baixapen_ %>% filter(qtd_stone_melhor == 2) %>% summarise(total_desembolso_scr = sum(desembolso_scr),n=n())

total_desembolso_scr,n
<dbl>,<int>
11359033,689


In [153]:
base_tx_filled_stone__baixapen_ %>% filter(qtd_stone_melhor == 3) %>% summarise(total_desembolso_scr = sum(desembolso_scr),n=n())

total_desembolso_scr,n
<dbl>,<int>
592478.6,79


In [152]:
base_tx_filled_stone__baixapen_ %>% filter(qtd_stone_melhor == 0) %>% summarise(total_desembolso_scr = sum(desembolso_scr),n=n())

total_desembolso_scr,n
<dbl>,<int>
795363904,5646


In [151]:
5646+296

In [35]:
base_orig_baixapen = base_orig_baixapen %>% mutate(limite_stone_melhor = if_else(limite_stone > desembolso_scr,1,0),prazo_stone_melhor = if_else(label_prazo_stone > label_prazo,1,0),taxa_stone_melhor = if_else(taxa_stone < taxa_scr,1,0)) %>% mutate(qtd_stone_melhor = limite_stone_melhor+prazo_stone_melhor+taxa_stone_melhor)

In [None]:
base_orig_baixapen %>% filter(qtd_stone_melhor == 1) %>% summarise(total_desembolso_scr = sum(desembolso_scr),n=n())

In [None]:
base_orig_baixapen %>% filter(qtd_stone_melhor == 2) %>% summarise(total_desembolso_scr = sum(desembolso_scr),n=n())

In [None]:
base_orig_baixapen %>% filter(qtd_stone_melhor == 3) %>% summarise(total_desembolso_scr = sum(desembolso_scr),n=n())

In [149]:
base_orig_baixapen %>% filter(qtd_stone_melhor == 0) %>% summarise(total_desembolso_scr = sum(desembolso_scr),n=n())

total_desembolso_scr,n
<dbl>,<int>
41748389,296


In [50]:
# soma condição_melhor == 1
(323105877 + 23773735)/1213315230.6

In [51]:
# soma condição_melhor == 2
(11359033 + 8685907)/1213315230.6

In [52]:
# soma condição_melhor == 3
(592478.6 + 8685907)/1213315230.6

In [53]:
# soma condição_melhor == 0
(795363904 + 41748389)/1213315230.6

In [49]:
# total calibrado
(323105877 + 23773735)+(11359033 + 8685907)+(592478.6 + 8685907)+(795363904 + 41748389)

In [None]:
base_tx_filled_stone_ %>% filter(qtd_stone_melhor == 0) 

In [112]:
base_tx_filled_stone_ %>% filter(qtd_stone_melhor == 0) %>% summarise(soma_des=sum(desembolso_scr))

soma_des
<dbl>
66545045


In [118]:
base_tx_filled_stone_ %>% filter(qtd_stone_melhor == 0) %>% filter(abs(diferenca_limite)/desembolso_scr <= 0.1) %>% summarise(soma_des=sum(desembolso_scr))

soma_des
<dbl>
4465327


In [None]:
base_tx_filled_stone_ %>% filter(qtd_stone_melhor == 0) %>% filter(abs(diferenca_limite)/desembolso_scr <= 0.1) %>% summarise(soma_des=sum(desembolso_scr))

In [124]:
base_tx_filled_stone_ %>% filter(qtd_stone_melhor == 0) %>% filter(label_prazo_stone >= label_prazo) %>% summarise(soma_des=sum(desembolso_scr))

soma_des
<dbl>
2407209


In [148]:
base_tx_filled_stone_baixapen %>% filter(qtd_stone_melhor == 0) %>% filter(abs(diferenca_limite)/desembolso_scr <= 0.1) %>% summarise(soma_des=sum(desembolso_scr),n=n())

soma_des,n
<dbl>,<int>
45739135,408


In [97]:
base_tx_filled_stone_ %>% filter(qtd_stone_melhor == 0) %>% group_by(IF_adj) %>% summarise(soma_des=sum(desembolso_scr)) %>% arrange(desc(soma_des)) 

IF_adj,soma_des
<chr>,<dbl>
Santander,49596841.6
Banco Inter,6773105.4
Nubank,5913835.3
C6 Bank,2371084.2
Pagseguro,1608154.6
Mercado Pago,282023.9


In [78]:
base_tx_filled_stone_baixapen %>% group_by(IF_adj) %>% summarise(soma_des=sum(desembolso_scr)) %>% arrange(desc(soma_des))

IF_adj,soma_des
<chr>,<dbl>
Cooperativas,436290643
Itaú,239198185
Bradesco,218422253
Banco do Brasil,185179277
Caixa,126593218
