In [None]:
library(readr)
library(MASS)
library(dplyr)
library(xgboost)
library(Matrix)

# Seed R's random number generator so the sample()-based train/validation
# split further down is reproducible between runs.
set.seed(1234)

In [None]:
# Load the raw dataset from the working directory.
base <- read_csv("base_concorrencia_bq.csv")

In [None]:
# Inspect the column names of the loaded data.
colnames(base)

In [None]:
# Restrict the data to the modelling population and derive the features.
base <- base %>%
  # Keep Santander rows with prazo_scr of at most 60 and a non-missing
  # mediana_tpv_3m above 1. filter() also drops rows where a condition
  # evaluates to NA.
  filter(
    IF_adj == "Santander",
    prazo_scr <= 60,
    !is.na(mediana_tpv_3m),
    mediana_tpv_3m > 1
  ) %>%
  mutate(
    # First three characters of the main CNAE code.
    # NOTE(review): if cnae_fiscal_principal was parsed as a number, leading
    # zeros are already lost at this point - confirm the column type.
    grupo_cnae = substr(cnae_fiscal_principal, 1, 3),
    # Missing values become 0 (numeric columns) or FALSE (logical flag).
    std_dev_debt_3m = tidyr::replace_na(std_dev_debt_3m, 0),
    FullDebt_3M = tidyr::replace_na(FullDebt_3M, 0),
    IsAcquirerActive = tidyr::replace_na(IsAcquirerActive, FALSE)
  ) %>%
  # Squared terms; computed after the NA replacement above.
  mutate(
    FullDebt_3M_2 = FullDebt_3M^2,
    mediana_tpv_3m_2 = mediana_tpv_3m^2,
    desembolso_scr_2 = desembolso_scr^2
  )

In [None]:
# Rows with a known target (prazo_scr) form the modelling set.
base_train <- base %>%
  filter(!is.na(prazo_scr))

In [None]:
# Random 90/10 split: `idx` holds the row positions used for training and
# every remaining row goes to the validation set.
idx <- sample(
  seq_len(nrow(base_train)),
  size = floor(0.9 * nrow(base_train))
)
train_df <- base_train[idx, ]
valid_df <- base_train[-idx, ]

In [None]:
# One-sided formula describing the design matrix (there is no response; the
# target is supplied separately as the DMatrix label). Categorical inputs are
# wrapped in factor() and the monetary inputs enter on a log scale.
fml <- ~ factor(IsAcquirerActive) +
  factor(Default_Month_Prior) +
  log(std_dev_debt_3m + 1) +
  log(FullDebt_3M + 1) +
  log(FullDebt_3M_2 + 1) +
  log(mediana_tpv_3m) +
  log(mediana_tpv_3m_2) +
  log(desembolso_scr) +
  log(desembolso_scr_2) +
  factor(month_year) +
  n_if +
  factor(ano_inicio_atividade) +
  factor(uf) +
  Cont_Socios +
  factor(grupo_cnae)

In [None]:
# Build ONE design matrix over the stacked train + validation rows and slice
# it afterwards. Encoding each data frame separately lets every factor() term
# take its levels from that frame alone, so a level that is missing from one
# split (likely for grupo_cnae / uf / month_year with a 90/10 split) yields
# matrices with different columns and the model would see misaligned features.
model_data <- rbind(train_df, valid_df)
X_all <- sparse.model.matrix(fml, data = model_data)[, -1] # drop intercept

# model.frame() (used internally) drops rows containing NA under R's default
# na.action; that would silently misalign the rows with the labels taken
# from train_df / valid_df, so fail loudly instead.
stopifnot(nrow(X_all) == nrow(model_data))

n_train <- nrow(train_df)
X_train <- X_all[seq_len(n_train), , drop = FALSE]
X_valid <- X_all[n_train + seq_len(nrow(valid_df)), , drop = FALSE]

# Remove the temporaries so they do not linger in the session.
rm(model_data, X_all, n_train)

In [None]:
# Target vectors on the original scale.
y_train <- train_df[["prazo_scr"]]
y_valid <- valid_df[["prazo_scr"]]

In [None]:
# The model is fitted on log1p(target); predictions are mapped back with
# expm1() at evaluation time.
y_train_log <- log1p(y_train)
y_valid_log <- log1p(y_valid)

# Wrap features and labels in xgboost's own matrix container.
dtrain <- xgb.DMatrix(data = X_train, label = y_train_log)
dvalid <- xgb.DMatrix(data = X_valid, label = y_valid_log)

In [None]:
# Booster configuration passed to xgb.train().
params <- list(
  # Absolute-error loss, monitored with the matching MAE metric
  # (both on the log1p scale of the label).
  objective = "reg:absoluteerror",
  eval_metric = "mae",
  # Tree shape.
  max_depth = 5,
  min_child_weight = 5,
  # Learning rate.
  eta = 0.05,
  # Row / column subsampling.
  subsample = 0.8,
  colsample_bytree = 0.8
)

In [None]:
# Fit the booster: up to 800 rounds, with early stopping after 30 rounds
# without improvement on the monitored data.
# NOTE(review): `valid` drives early stopping and is also the set the MAE is
# reported on afterwards, so that figure is likely optimistic - consider a
# separate hold-out.
bst <- xgb.train(
  data = dtrain,
  params = params,
  watchlist = list(train = dtrain, valid = dvalid),
  nrounds = 800,
  early_stopping_rounds = 30,
  verbose = 2
)

In [None]:
# Predict on the log1p scale, then undo the transform.
pred_log <- predict(bst, newdata = dvalid)
pred <- expm1(pred_log)

# Mean absolute error on the original scale of the target.
mae <- mean(abs(y_valid - pred))
mae

# Value printed by a previous run (presumably the MAE above): 11.9717065181756

In [None]:
# Feature importance of the fitted booster; plot the 20 top-ranked features.
imp <- xgb.importance(model = bst)
xgb.plot.importance(importance_matrix = imp, top_n = 20)

In [None]:
# Bin edges: 0, 10, ..., 60.
breaks <- seq(from = 0, to = 60, by = 10)

# Left-closed bins [a, b); include.lowest = TRUE closes the last bin on the
# right so that a value equal to the top edge is still assigned to a bin.
bins <- cut(y_valid, breaks = breaks, right = FALSE, include.lowest = TRUE)

# Absolute and relative frequency of the observed target per bin.
tab <- table(bins)
tab_perc <- prop.table(tab)

# Summary table: bin label, count and percentage (one decimal place).
res <- data.frame(
  faixa = names(tab),
  n = as.integer(tab),
  perc = round(100 * as.numeric(tab_perc), 1)
)

In [None]:
# Count the validation observations per bin (bins with no observations do
# not appear in the result).
df_bins <- aggregate(x = y_valid, by = list(bins), FUN = length)
names(df_bins) <- c("faixa", "n")

# Share of each bin, in percent.
df_bins[["perc"]] <- 100 * df_bins[["n"]] / sum(df_bins[["n"]])

# Midpoint of interval labels such as those produced by cut().
#
# faixa: character vector (or factor) of labels like "[0,10)", "(10,20]" or
#        "[50,60]".
# Returns a numeric vector of the same length with (lower + upper) / 2.
#
# Every bracket character is stripped with gsub() before the two limits are
# split on the comma. A single sub() only removes the first match, which
# left "0,10)" / "60]" behind and made as.numeric() return NA.
get_mid <- function(faixa) {
  limits <- gsub("\\[|\\]|\\(|\\)", "", as.character(faixa))
  lower <- as.numeric(sub(",.*", "", limits))
  upper <- as.numeric(sub(".*,", "", limits))
  lower + (upper - lower) / 2
}


In [None]:
# Scatter of predicted against observed values (semi-transparent points).
plot(
  x = y_valid,
  y = pred,
  main = "Predito vs Real (com distribuição do prazo real)",
  xlab = "Prazo real (meses)",
  ylab = "Prazo previsto (meses)",
  pch = 16,
  col = rgb(0, 0, 0, 0.3)
)

# Identity line: points on it are predicted exactly.
abline(a = 0, b = 1, col = "red", lwd = 2)

# Vertical guides at the bin edges.
abline(v = breaks, col = "grey80", lty = 3)

# Annotate each non-empty bin with its share of observations, slightly above
# the largest prediction.
y_top <- max(pred, na.rm = TRUE) * 1.02

nonempty <- which(tab > 0)
if (length(nonempty) > 0) {
  text(
    x = (breaks[nonempty] + breaks[nonempty + 1]) / 2,
    y = y_top,
    labels = paste0(round(100 * tab[nonempty] / sum(tab), 1), "%"),
    cex = 0.8
  )
}
