## Import Libraries

In [1]:
import os
import warnings

import scipy as sp

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
import dateutil.relativedelta

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

import xgboost as xgb

from google.colab import drive

In [2]:
warnings.filterwarnings("ignore")
plt.style.use('ggplot')
%matplotlib inline

In [3]:
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
RANDOM_STATE = 42

## Load Data

In [5]:
DATA_PATH = "gdrive/MyDrive/EASY_MONEY_NUCLIO/data/"

In [6]:
#df = pd.read_pickle(DATA_PATH + "total_df_segmentado.pkl")

In [7]:
df = pd.read_csv(DATA_PATH + "total_df.csv", sep=";", index_col=0)

In [8]:
# corregir fechas
df.loc[df["entry_date"] == "2015-02-29", "entry_date"] = "2015-02-28"
df.loc[df["entry_date"] == "2019-02-29", "entry_date"] = "2019-02-28"

In [9]:
DATES = df["pk_partition"].unique().tolist()

In [10]:
df["pk_partition"] = pd.to_datetime(df["pk_partition"], format="%Y-%m-%d")
df["entry_date"] = pd.to_datetime(df["entry_date"], format="%Y-%m-%d")

In [11]:
PRODUCTS = [
    "short_term_deposit",	
    "loans",
    "mortgage",	
    "funds",
    "securities",
    "long_term_deposit",	
    "em_account_pp",	
    "credit_card",	
    "payroll",	
    "pension_plan",	
    "payroll_account",	
    "emc_account",	
    "debit_card",	
    "em_account_p",	
    "em_acount"	
]

PRICE = {
    "Finaciación": 60,
    "Ahorro/Inversion": 40,
    "Cuentas": 10
}

PRODUCTS_PRICED = {
    "short_term_deposit": "Ahorro/Inversion",	
    "loans": "Finaciación",
    "mortgage": "Finaciación",	
    "funds": "Ahorro/Inversion",
    "securities": "Ahorro/Inversion",
    "long_term_deposit": "Ahorro/Inversion",	
    "em_account_pp": "Cuentas",	
    "credit_card": "Finaciación",
    "pension_plan": "Ahorro/Inversion",	
    "payroll_account": "Cuentas",	
    "emc_account": "Cuentas",	
    "debit_card": "Finaciación",	
    "em_account_p": "Cuentas",	
    "em_acount": "Cuentas"	
}

# EDA

In [12]:
df.isna().sum()

pk_cid                      0
pk_partition                0
short_term_deposit          0
loans                       0
mortgage                    0
funds                       0
securities                  0
long_term_deposit           0
em_account_pp               0
credit_card                 0
payroll                    61
pension_plan               61
payroll_account             0
emc_account                 0
debit_card                  0
em_account_p                0
em_acount                   0
entry_date                  0
entry_channel          133033
active_customer             0
segment                133944
country_id                  0
region_code              2264
gender                     25
age                         0
deceased                    0
salary                1512103
dtype: int64

In [13]:
df[["pk_cid", "pk_partition"]].duplicated().astype(int).sum()

0

# NaN's

In [14]:
df["salary"] = df["salary"].fillna(-9999)

In [15]:
df["region_code"] = df["region_code"].fillna(-99)

In [16]:
df["entry_channel"] = df["entry_channel"].fillna("Unknown")

In [17]:
df["pension_plan"].value_counts(dropna=False)

0.0    5745061
1.0     217802
NaN         61
Name: pension_plan, dtype: int64

In [18]:
df["pension_plan"] = df["pension_plan"].fillna(0.0)

In [19]:
df["gender"].value_counts(dropna=False)

H      3087502
V      2875397
NaN         25
Name: gender, dtype: int64

In [20]:
df["gender"] = df["gender"].fillna(df["gender"].mode().values[0])

In [21]:
df.isna().sum()

pk_cid                     0
pk_partition               0
short_term_deposit         0
loans                      0
mortgage                   0
funds                      0
securities                 0
long_term_deposit          0
em_account_pp              0
credit_card                0
payroll                   61
pension_plan               0
payroll_account            0
emc_account                0
debit_card                 0
em_account_p               0
em_acount                  0
entry_date                 0
entry_channel              0
active_customer            0
segment               133944
country_id                 0
region_code                0
gender                     0
age                        0
deceased                   0
salary                     0
dtype: int64

# Population Analysis

In [22]:
for product in PRODUCTS:
  if product not in ["payroll", "em_account_pp"]:
    print(f"#########{product}########")
    print(df.loc[df[product] == 1, [product, "age", "salary"]].describe())
    print("Predominant gender: ", df.loc[df[product] == 1, "gender"].mode().values[0])
    print("#####################")
    print("")

#########short_term_deposit########
       short_term_deposit           age        salary
count             15394.0  15394.000000  1.539400e+04
mean                  1.0     46.836690  9.187965e+04
std                   0.0     13.849793  1.914814e+05
min                   1.0     20.000000 -9.999000e+03
25%                   1.0     37.000000 -9.999000e+03
50%                   1.0     44.000000  8.017053e+04
75%                   1.0     55.000000  1.269798e+05
max                   1.0     97.000000  9.833873e+06
Predominant gender:  V
#####################

#########loans########
       loans         age         salary
count  468.0  468.000000     468.000000
mean     1.0   33.478632   95258.656923
std      0.0    9.633630   61072.220811
min      1.0   21.000000   -9999.000000
25%      1.0   26.000000   60530.880000
50%      1.0   30.000000   78374.385000
75%      1.0   37.250000  145972.320000
max      1.0   60.000000  228200.220000
Predominant gender:  V
#####################

###

Podemos observar como algunos productos se podrían agrupar debido a que el perfil de edad de los clientes es parecido. La variable "salary" no es un buen indicativo del cliente (se extrayeron los datos a partir de encuestas) para hacer esta agrupación.

In [23]:
for product in PRODUCTS:
  if product != "payroll":
    _df = df.groupby("pk_cid").agg({product: "max"})
    print(f"#######{product}#######")
    print(_df[product].value_counts())
    print(_df[product].value_counts(normalize=True))
    print("########################")
    print("")

del _df

#######short_term_deposit#######
0    451049
1      5324
Name: short_term_deposit, dtype: int64
0    0.988334
1    0.011666
Name: short_term_deposit, dtype: float64
########################

#######loans#######
0    456329
1        44
Name: loans, dtype: int64
0    0.999904
1    0.000096
Name: loans, dtype: float64
########################

#######mortgage#######
0    456348
1        25
Name: mortgage, dtype: int64
0    0.999945
1    0.000055
Name: mortgage, dtype: float64
########################

#######funds#######
0    454707
1      1666
Name: funds, dtype: int64
0    0.996349
1    0.003651
Name: funds, dtype: float64
########################

#######securities#######
0    454284
1      2089
Name: securities, dtype: int64
0    0.995423
1    0.004577
Name: securities, dtype: float64
########################

#######long_term_deposit#######
0    447262
1      9111
Name: long_term_deposit, dtype: int64
0    0.980036
1    0.019964
Name: long_term_deposit, dtype: float64
###############

Podemos ver como la tenencia de los distintos productos no esta balanceada en la gran mayoría de los casos.

También se puede observar como el producto em_account_pp no tiene ninguna venta en todo el histórico disponible.

Un buen enfoque para atacar este problema sería agrupar los productos en distintas categorías en función de lo parecida que sea la población que contrata cada producto, como de balanceado este el target (cada producto) y por productos similares.

In [24]:
cuentas = ["emc_account", "em_account_p", "payroll_account"]
ahorro_inversion = ["short_term_deposit", "long_term_deposit", "pension_plan", "securities", "funds"]
financiacion = ["credit_card", "mortgage", "loans"]

In [25]:
df["cuentas"] = df[cuentas].max(axis=1)
df["ahorro_inversion"] = df[ahorro_inversion].max(axis=1)
df["financiacion"] = df[financiacion].max(axis=1)

for group in [cuentas, ahorro_inversion, financiacion]:
  df = df.drop(group, axis=1)

In [26]:
for product in ["cuentas", "ahorro_inversion", "financiacion", "em_acount", "debit_card"]:
    _df = df.groupby("pk_cid").agg({product: "max"})
    print(f"#######{product}#######")
    print(_df[product].value_counts())
    print(_df[product].value_counts(normalize=True))
    print("########################")
    print("")
  
del _df

#######cuentas#######
0    403699
1     52674
Name: cuentas, dtype: int64
0    0.884581
1    0.115419
Name: cuentas, dtype: float64
########################

#######ahorro_inversion#######
0.0    413955
1.0     42418
Name: ahorro_inversion, dtype: int64
0.0    0.907054
1.0    0.092946
Name: ahorro_inversion, dtype: float64
########################

#######financiacion#######
0    447810
1      8563
Name: financiacion, dtype: int64
0    0.981237
1    0.018763
Name: financiacion, dtype: float64
########################

#######em_acount#######
1    332728
0    123645
Name: em_acount, dtype: int64
1    0.72907
0    0.27093
Name: em_acount, dtype: float64
########################

#######debit_card#######
0    392837
1     63536
Name: debit_card, dtype: int64
0    0.860781
1    0.139219
Name: debit_card, dtype: float64
########################



Podemos ver como todos los nuevos productos estan balanceados excepto los productos de "financiacion". Para esta categoría de productos tendremos que realizar técnicas de rebalanceo.

In [27]:
df.head()

Unnamed: 0,pk_cid,pk_partition,em_account_pp,payroll,debit_card,em_acount,entry_date,entry_channel,active_customer,segment,country_id,region_code,gender,age,deceased,salary,cuentas,ahorro_inversion,financiacion
0,1375586,2018-01-28,0,0.0,0,1,2018-01-12,KHL,1.0,02 - PARTICULARES,ES,29.0,H,35,N,87218.1,0,0.0,0
1,1050611,2018-01-28,0,0.0,0,1,2015-08-10,KHE,0.0,03 - UNIVERSITARIO,ES,13.0,V,23,N,35548.74,0,0.0,0
2,1050612,2018-01-28,0,0.0,0,1,2015-08-10,KHE,0.0,03 - UNIVERSITARIO,ES,13.0,V,23,N,122179.11,0,0.0,0
3,1050613,2018-01-28,0,0.0,0,0,2015-08-10,KHD,0.0,03 - UNIVERSITARIO,ES,50.0,H,22,N,119775.54,0,1.0,0
4,1050614,2018-01-28,0,0.0,0,1,2015-08-10,KHE,1.0,03 - UNIVERSITARIO,ES,50.0,V,23,N,-9999.0,0,0.0,0


# Preprocessing

In [28]:
df["deceased"].value_counts()

N    5961849
S       1075
Name: deceased, dtype: int64

No podemos recomendar productos a clientes que estan muertos, por tanto, filtramos estos clientes.

In [29]:
df = df[df["deceased"] != "S"]

In [30]:
df = df[(df["age"] >= 18) & (df["age"] <= 90)]

In [31]:
# cols_to_drop = ["entry_date", "segment", "deceased", "payroll", "gender", "ConProducto"]
cols_to_drop = ["segment", "deceased", "payroll", "em_account_pp"]
df = df.drop(cols_to_drop, axis=1)

In [32]:
print(df.shape)
print(df.columns)

(5921197, 15)
Index(['pk_cid', 'pk_partition', 'debit_card', 'em_acount', 'entry_date',
       'entry_channel', 'active_customer', 'country_id', 'region_code',
       'gender', 'age', 'salary', 'cuentas', 'ahorro_inversion',
       'financiacion'],
      dtype='object')


In [33]:
df["country_id"] = LabelEncoder().fit_transform(df["country_id"])

In [34]:
df.head()

Unnamed: 0,pk_cid,pk_partition,debit_card,em_acount,entry_date,entry_channel,active_customer,country_id,region_code,gender,age,salary,cuentas,ahorro_inversion,financiacion
0,1375586,2018-01-28,0,1,2018-01-12,KHL,1.0,15,29.0,H,35,87218.1,0,0.0,0
1,1050611,2018-01-28,0,1,2015-08-10,KHE,0.0,15,13.0,V,23,35548.74,0,0.0,0
2,1050612,2018-01-28,0,1,2015-08-10,KHE,0.0,15,13.0,V,23,122179.11,0,0.0,0
3,1050613,2018-01-28,0,0,2015-08-10,KHD,0.0,15,50.0,H,22,119775.54,0,1.0,0
4,1050614,2018-01-28,0,1,2015-08-10,KHE,1.0,15,50.0,V,23,-9999.0,0,0.0,0


In [35]:
df["entry_year"] = df["entry_date"].dt.year
df["entry_month"] = df["entry_date"].dt.month
df["entry_day"] = df["entry_date"].dt.day
df["entry_week"] = df["entry_date"].dt.week
df["entry_weekday"] = df["entry_date"].dt.weekday
df["entry_quarter"] = df["entry_date"].dt.quarter

df = df.drop("entry_date", axis=1)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5921197 entries, 0 to 5962923
Data columns (total 20 columns):
 #   Column            Dtype         
---  ------            -----         
 0   pk_cid            int64         
 1   pk_partition      datetime64[ns]
 2   debit_card        int64         
 3   em_acount         int64         
 4   entry_channel     object        
 5   active_customer   float64       
 6   country_id        int64         
 7   region_code       float64       
 8   gender            object        
 9   age               int64         
 10  salary            float64       
 11  cuentas           int64         
 12  ahorro_inversion  float64       
 13  financiacion      int64         
 14  entry_year        int64         
 15  entry_month       int64         
 16  entry_day         int64         
 17  entry_week        int64         
 18  entry_weekday     int64         
 19  entry_quarter     int64         
dtypes: datetime64[ns](1), float64(4), int64(13), o

In [37]:
def set_others(df, col, n):
    """
    Converts to Categorical column and sets Others for the categories outside the top n
    :param df: DataFrame object
    :param col: column name
    :param n: top X
    :return 
    """
    top_categories = df[col].value_counts().head(n)
    top_categories_list = top_categories.index.tolist()
    top_categories_list.append("Others")

    df[col] = pd.Categorical(df[col], categories=top_categories_list)
    return df[col].fillna("Others")

In [38]:
df["entry_channel"].value_counts().head(8)

KHE        3113575
KFC         873049
KHQ         590224
KAT         405973
KHK         229826
KHM         176333
Unknown     132171
KHN         108157
Name: entry_channel, dtype: int64

In [39]:
df["entry_channel"] = set_others(df, "entry_channel", 8)

In [40]:
def one_hot_encoder(data, col):
    _dummy_dataset = pd.get_dummies(data[col], prefix=col)
    data = pd.concat([data, _dummy_dataset], axis=1)
    del data[col]
    del _dummy_dataset

    return data

In [41]:
df = one_hot_encoder(df, "entry_channel")

In [42]:
# region_code frequency encoding
region_counter = df["region_code"].value_counts()
df["region_code"] = df["region_code"].replace(region_counter)

In [43]:
df["gender"] = (df["gender"] == "H").astype(int)

# Dataset Resampling

Queremos predecir las compras de los productos para el mes de Junio del 2019. Para poder implementar esto correctamente tendremos que coger los datos para "train/test" desde Enero del 2018 hasta Abril del 2019 y nos guardaremos Mayo del 2019 para validación.

Una vez convertidos los datos para que el modelo los entienda, tendremos que preparar el conjunto de datos para que cada row represente a un cliente. 

Para realizar esto, se extraerá la fecha mínima de compra de cada cliente, le restaremos un mes a esa fecha y extraeremos la row correspondiente que contenga dicha fecha para cada cliente. Esto se realiza para conseguir la información de aquellos clientes a mes m1, ya que son los clientes que van a comprar el mes siguiente.

Con tal de que el modelo no aprenda a predecir mejor las compras en aquellos meses donde hay más predominancia de compras vs. no compras, realizaremos un rebalanceo mes a mes. El objetivo es conseguir un ratio de 90-10 de la clase negativa vs. la clase positiva.

In [44]:
TARGETS = [
         "em_acount",
         "debit_card",
         "cuentas",
         "ahorro_inversion",
         "financiacion"
]

In [45]:
def get_compras(df, target):
  compras = df[(df[target] == 1) & (df["pk_partition"] != "2018-01-28")].groupby("pk_cid")["pk_partition"].min().reset_index().set_index("pk_cid")
  compras["mes_m1"] = compras["pk_partition"].dt.date - pd.DateOffset(months=1)
  del compras["pk_partition"]
  compras.columns = ["pk_partition"]
  compras["compra"] = 1

  return compras

def clean_compras(df_compras, df):
  df_compras["pk_cid"] = df_compras.index
  c_cid_partition_tupled = df_compras[["pk_cid", "pk_partition"]].apply(tuple, axis=1)
  df_cid_partition_tupled = df[["pk_cid", "pk_partition"]].apply(tuple, axis=1)

  df_compras = df_compras[c_cid_partition_tupled.isin(df_cid_partition_tupled.tolist())]
  del df_compras["pk_cid"]

  return df_compras

def clean_no_compras(df_compras, df_no_compras):
  intersection = set(df_compras.index).intersection(df_no_compras.index.tolist())
  df_no_compras = df_no_compras.drop(intersection, axis=0)

  return df_no_compras

def get_month_subset(df_compras, df_no_compras, month):
  compras = df_compras[df_compras["pk_partition"] == month]
  no_compras = df_no_compras[df_no_compras["pk_partition"] <= month] 
  # miramos que los clientes que estan en compras no esten en no compras
  no_compras_cleaned = no_compras[~ no_compras.index.isin(compras.index.tolist())]

  return compras, no_compras_cleaned

Recorreremos cada target individualmente entre las fechas 2018-02 y 2019-04. Hay que tener en cuenta que estamos recorriendo para mes m1.

In [46]:
datasets = {}
for target in TARGETS:
  compras = get_compras(df, target)
  compras = clean_compras(compras, df)
  pool_no_compras = df[df["em_acount"] == 0].groupby("pk_cid")["pk_partition"].min().reset_index().set_index("pk_cid")
  pool_no_compras = clean_no_compras(compras, pool_no_compras)
  pool_no_compras["compra"] = 0
  dataset_target = pd.DataFrame(columns=["pk_partition"])
  for month in DATES[1:-1]:
    compras_month, no_compras_clean = get_month_subset(compras, pool_no_compras, month)
    sample_size = int(9 * compras_month.shape[0])
    if sample_size <= no_compras_clean.shape[0]:
      no_compras_sample = no_compras_clean.sample(sample_size)
      pool_no_compras = pool_no_compras.drop(no_compras_sample.index, axis=0)
      dataset_month = compras_month.append(no_compras_sample)
    else:
      compras_month_sample = compras_month.sample(int(1/9 * no_compras_clean.shape[0]))
      pool_no_compras = pool_no_compras.drop(no_compras_clean.index, axis=0)
      dataset_month = compras_month_sample.append(no_compras_clean)
    
    dataset_target = dataset_target.append(dataset_month)
  
  datasets[target] = dataset_target


(966, 2)
(8694, 2)
(473, 2)
(4257, 2)
(430, 2)
(3870, 2)
(2238, 2)
(3870, 2)
(337, 2)
(3870, 2)
(3435, 2)
(30915, 2)
(1811, 2)
(16299, 2)
(3114, 2)
(28026, 2)
(2327, 2)
(28026, 2)
(3463, 2)
(28026, 2)
(1564, 2)
(28026, 2)
(1510, 2)
(28026, 2)
(1777, 2)
(28026, 2)
(1363, 2)
(28026, 2)
(1285, 2)
(28026, 2)
(3931, 2)
(28026, 2)
(2492, 2)
(28026, 2)
(1860, 2)
(28026, 2)
(1841, 2)
(28026, 2)
(1928, 2)
(28026, 2)
(1936, 2)
(17424, 2)
(2484, 2)
(22356, 2)
(3386, 2)
(30474, 2)
(2862, 2)
(25758, 2)
(2436, 2)
(25758, 2)
(2129, 2)
(25758, 2)
(2469, 2)
(25758, 2)
(2518, 2)
(25758, 2)
(2084, 2)
(25758, 2)
(2002, 2)
(25758, 2)
(1617, 2)
(25758, 2)
(1369, 2)
(25758, 2)
(1374, 2)
(25758, 2)
(1284, 2)
(25758, 2)
(1455, 2)
(25758, 2)
(1444, 2)
(12996, 2)
(1095, 2)
(9855, 2)
(1319, 2)
(11871, 2)
(2030, 2)
(18270, 2)
(2229, 2)
(20061, 2)
(1746, 2)
(15714, 2)
(1748, 2)
(15732, 2)
(1680, 2)
(15120, 2)
(1861, 2)
(15120, 2)
(2257, 2)
(15120, 2)
(2216, 2)
(15120, 2)
(1424, 2)
(15120, 2)
(1332, 2)
(15120, 2)
(1

In [47]:
test = datasets["em_acount"]

In [48]:
pd.merge(test, df, left_on=[test.index, "pk_partition"], right_on=["pk_cid", "pk_partition"], how="left")

Unnamed: 0,pk_partition,compra,pk_cid,debit_card,em_acount,active_customer,country_id,region_code,gender,age,salary,cuentas,ahorro_inversion,financiacion,entry_year,entry_month,entry_day,entry_week,entry_weekday,entry_quarter,entry_channel_KHE,entry_channel_KFC,entry_channel_KHQ,entry_channel_KAT,entry_channel_KHK,entry_channel_KHM,entry_channel_Unknown,entry_channel_KHN,entry_channel_Others
0,2018-02-28,1.0,39997,0,0,1.0,15,288968.0,0,62,201575.01,0,1.0,0,2018,1,2,1,1,1,0,0,0,1,0,0,0,0,0
1,2018-02-28,1.0,55712,0,0,1.0,15,1162770.0,1,59,183352.59,0,1.0,0,2017,12,18,51,0,4,0,0,0,0,0,0,0,0,1
2,2018-02-28,1.0,97322,1,0,1.0,15,1162770.0,0,38,385518.33,1,1.0,1,2015,11,23,48,0,4,0,0,0,0,0,0,0,0,1
3,2018-02-28,1.0,138411,1,0,1.0,15,1162770.0,0,48,89954.76,0,0.0,1,2016,2,18,7,3,1,0,0,0,1,0,0,0,0,0
4,2018-02-28,1.0,177044,0,0,1.0,15,1162770.0,1,59,143059.50,0,1.0,0,2017,9,12,37,1,3,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149281,2019-04-28,0.0,1548193,0,0,0.0,15,30978.0,1,87,-9999.00,0,0.0,0,2019,4,29,18,0,2,0,0,0,0,0,0,1,0,0
149282,2019-04-28,0.0,1548198,0,0,0.0,15,1162770.0,0,50,-9999.00,0,0.0,0,2019,4,29,18,0,2,0,0,0,0,0,0,1,0,0
149283,2019-04-28,0.0,1548204,0,0,0.0,15,598170.0,0,54,-9999.00,0,0.0,0,2019,4,29,18,0,2,0,0,0,0,0,0,1,0,0
149284,2019-04-28,0.0,1548206,0,0,0.0,15,1162770.0,1,40,-9999.00,0,0.0,0,2019,4,29,18,0,2,0,0,0,0,0,0,1,0,0


In [49]:
# cruzamos los pk_cid y pk_partition con df para sacar los datos de los clientes en las particiones deseadas
for target, dataset in datasets.items():
  dataset = dataset.reset_index()
  dataset.columns = ["pk_cid", "pk_partition", "compra"]
  print(dataset.shape)
  datasets[target] = pd.merge(dataset, df, on=["pk_cid", "pk_partition"], how="left")
  # creamos copia de lista de targets y eliminamos el target actual de la lista
  # borramos los demás targets del dataset resampleado para mantener un único target
  datasets[target] = datasets[target].drop(TARGETS, axis=1)
  print(datasets[target].shape)
  print("")


(149286, 3)
(149286, 24)

(166518, 3)
(166518, 24)

(156236, 3)
(156236, 24)

(156076, 3)
(156076, 24)

(49960, 3)
(49960, 24)



In [50]:
# guardar los datasets en pkl para no tener que calcularlos cada vez
for target, dataset in datasets.items():
  file_name = f"{target}_df.pkl"
  dataset.to_pickle(f"/content/gdrive/MyDrive/EASY_MONEY_NUCLIO/recomendacion/{file_name}")

# Load Pickles

In [51]:
# cargamos los pickles
FILES = os.listdir("/content/gdrive/MyDrive/EASY_MONEY_NUCLIO/recomendacion/")
DATASETS = {}
for file_name in FILES:
  target, _ = file_name.split("_df.pkl")
  DATASETS[target] = pd.read_pickle(f"/content/gdrive/MyDrive/EASY_MONEY_NUCLIO/recomendacion/{file_name}")

In [58]:
# miramos que el target este bien balanceado (en el total)
for target, dataset in DATASETS.items():
  print(target)
  try:
    print(dataset["compra"].value_counts(dropna=False))
    print("")
  except:
    continue

em_acount
0.0    134362
1.0     14924
Name: compra, dtype: int64

financiacion
0.0    44964
1.0     4996
Name: compra, dtype: int64

debit_card
0.0    149871
1.0     16647
Name: compra, dtype: int64

cuentas
0.0    140616
1.0     15620
Name: compra, dtype: int64

ahorro_inversion
0.0    140471
1.0     15605
Name: compra, dtype: int64



In [None]:
# validaremos con las los clientes de abril (compras de mayo)

In [None]:
# mirar correlación variables

# Modelling

In [None]:
model = MultiOutputClassifier(xgb.XGBClassifier(
    max_depth=4,
    min_samples_leaf = 200,
    random_state=RANDOM_STATE
))

In [None]:
model.fit(X_train, y_train)

In [None]:
y_train_pred = dt.predict(X_train)
y_train_score = dt.predict_proba(X_train)[:,1]

#Predict the response for test dataset
y_pred = dt.predict(X_test)
y_test_score = dt.predict_proba(X_test)[:,1]

In [None]:
# probar meter la columna del indicador del cluster de la primera segmentación