In [234]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [2]:
path = './'

df_est = pd.read_csv(path + 'Input1_clientes_estructura.csv', sep=';')
df_venta = pd.read_csv(path + 'Input2_clientes_venta.csv', sep=';')

<h1>Exploracion</h1>

In [3]:
df_est.head()

Unnamed: 0,Cliente,Regional2,Gerencia2,SubCanal2,Categoria,Nevera
0,1,Regional 1,Gerencia_1,Subcanal_1,Categoria_1,0
1,2,Regional 1,Gerencia_1,Subcanal_1,Categoria_1,0
2,3,Regional 1,Gerencia_1,Subcanal_1,Categoria_1,0
3,4,Regional 1,Gerencia_1,Subcanal_1,Categoria_1,1
4,5,Regional 1,Gerencia_1,Subcanal_1,Categoria_2,1


In [4]:
df_venta.head()

Unnamed: 0,Año,Mes,Cliente,SegmentoPrecio2,Marca2,Cupo2,CapacidadEnvase2,Volumen,disc,nr
0,2019,5,10,SegmentoPrecio_1,Marca_1,Cupo_1,CapacidadEnvase_12,0.112229,-30.590603,900.328567
1,2019,5,10,SegmentoPrecio_1,Marca_2,Cupo_2,CapacidadEnvase_10,0.021734,0.0,149.184463
2,2019,5,10,SegmentoPrecio_2,Marca_3,Cupo_2,CapacidadEnvase_10,0.043469,0.0,359.625828
3,2019,5,10,SegmentoPrecio_1,Marca_1,Cupo_2,CapacidadEnvase_16,0.026345,-31.065261,134.748399
4,2019,5,10,SegmentoPrecio_1,Marca_4,Cupo_2,CapacidadEnvase_10,0.086938,0.0,496.901005


In [5]:
df_venta.isnull().sum()

Año                 0
Mes                 0
Cliente             0
SegmentoPrecio2     0
Marca2              0
Cupo2               0
CapacidadEnvase2    0
Volumen             0
disc                0
nr                  0
dtype: int64

In [6]:
df_venta['Año'].unique()
df_venta.groupby('Año')['Mes'].unique()

Año
2019    [5, 6, 7, 9, 8, 10, 11, 12]
2020    [1, 2, 3, 5, 6, 7, 8, 9, 4]
Name: Mes, dtype: object

<h1>FEATURE ENGINEERING</h1>

In [7]:
# vamos a realizar una copia para dejar la data original tal cual esta
df_new_venta = df_venta.copy()
df_new_venta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1406116 entries, 0 to 1406115
Data columns (total 10 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Año               1406116 non-null  int64  
 1   Mes               1406116 non-null  int64  
 2   Cliente           1406116 non-null  int64  
 3   SegmentoPrecio2   1406116 non-null  object 
 4   Marca2            1406116 non-null  object 
 5   Cupo2             1406116 non-null  object 
 6   CapacidadEnvase2  1406116 non-null  object 
 7   Volumen           1406116 non-null  float64
 8   disc              1406116 non-null  float64
 9   nr                1406116 non-null  float64
dtypes: float64(3), int64(3), object(4)
memory usage: 107.3+ MB


In [8]:
# vamos a unir las tres columnas de Marca2, Cupo2, CapacidadEnvase2 an una Sola
# de esta manera al realizar el filtro nos quedara mas facil
def createProduct(x):
    result = x['Marca2'] + '-' + x['Cupo2'] + '-' + x['CapacidadEnvase2']
    return result
df_new_venta['producto'] = df_new_venta.apply(createProduct, axis=1)
df_new_venta.drop(['Marca2', 'Cupo2', 'CapacidadEnvase2'], axis=1, inplace=True)
df_new_venta

Unnamed: 0,Año,Mes,Cliente,SegmentoPrecio2,Volumen,disc,nr,producto
0,2019,5,10,SegmentoPrecio_1,0.112229,-30.590603,900.328567,Marca_1-Cupo_1-CapacidadEnvase_12
1,2019,5,10,SegmentoPrecio_1,0.021734,0.000000,149.184463,Marca_2-Cupo_2-CapacidadEnvase_10
2,2019,5,10,SegmentoPrecio_2,0.043469,0.000000,359.625828,Marca_3-Cupo_2-CapacidadEnvase_10
3,2019,5,10,SegmentoPrecio_1,0.026345,-31.065261,134.748399,Marca_1-Cupo_2-CapacidadEnvase_16
4,2019,5,10,SegmentoPrecio_1,0.086938,0.000000,496.901005,Marca_4-Cupo_2-CapacidadEnvase_10
...,...,...,...,...,...,...,...,...
1406111,2020,8,20577,SegmentoPrecio_3,0.039122,0.000000,544.150314,Marca_39-Cupo_2-CapacidadEnvase_10
1406112,2020,7,20580,SegmentoPrecio_1,0.000000,0.000000,0.000000,Marca_38-Cupo_2-CapacidadEnvase_10
1406113,2020,7,20580,SegmentoPrecio_3,0.058683,0.000000,759.218996,Marca_39-Cupo_2-CapacidadEnvase_10
1406114,2020,9,20580,SegmentoPrecio_1,0.000000,0.000000,0.000000,Marca_38-Cupo_2-CapacidadEnvase_10


In [9]:
# de nuevo vamos a realizar una copia para mantener df_new_venta tal cual
# ya que nos puede servir para mas adelante
df_ventas_2 = df_new_venta.drop(['SegmentoPrecio2', 'Volumen', 'disc', 'nr'], axis=1)
df_ventas_2 = df_ventas_2.merge(df_est, on=['Cliente'])
df_ventas_2.head()

Unnamed: 0,Año,Mes,Cliente,producto,Regional2,Gerencia2,SubCanal2,Categoria,Nevera
0,2019,5,10,Marca_1-Cupo_1-CapacidadEnvase_12,Regional 1,Gerencia_3,Subcanal_1,Categoria_2,1
1,2019,5,10,Marca_2-Cupo_2-CapacidadEnvase_10,Regional 1,Gerencia_3,Subcanal_1,Categoria_2,1
2,2019,5,10,Marca_3-Cupo_2-CapacidadEnvase_10,Regional 1,Gerencia_3,Subcanal_1,Categoria_2,1
3,2019,5,10,Marca_1-Cupo_2-CapacidadEnvase_16,Regional 1,Gerencia_3,Subcanal_1,Categoria_2,1
4,2019,5,10,Marca_4-Cupo_2-CapacidadEnvase_10,Regional 1,Gerencia_3,Subcanal_1,Categoria_2,1


<h1>RandomForest</h1>


In [10]:
# Aqui vamos a realizar un nuevo feature Engineering donde cambiaremos a 0 y 1 si
# la marca a predecir fue comprada o no

def marca_uno(x):
  if str(x) == 'Marca_20-Cupo_3-CapacidadEnvase_9':
    return 1
  else:
    return 0

def marca_dos(x):
  if str(x) == 'Marca_16-Cupo_2-CapacidadEnvase_10':
    return 1
  else:
    return 0

def marca_tres(x):
  if str(x) == 'Marca_9-Cupo_3-CapacidadEnvase_12':
    return 1
  else:
    return 0

def marca_Inno_uno(x):
  if str(x) == 'Marca_38-Cupo_2-CapacidadEnvase_10':
    return 1
  else:
    return 0

def marca_Inno_dos(x):
  if str(x) == 'Marca_39-Cupo_2-CapacidadEnvase_10':
    return 1
  else:
    return 0

In [11]:
df_ventas_2['Marca1'] = df_ventas_2['producto'].apply(marca_uno)
df_ventas_2['Marca2'] = df_ventas_2['producto'].apply(marca_dos)
df_ventas_2['Marca3'] = df_ventas_2['producto'].apply(marca_tres)
df_ventas_2['MarcaInno1'] = df_ventas_2['producto'].apply(marca_Inno_uno)
df_ventas_2['MarcaInno2'] = df_ventas_2['producto'].apply(marca_Inno_dos)

In [12]:
aux = df_ventas_2[['Año', 'Mes', 'Cliente', 'Marca1', 'Marca2', 'Marca3', 'MarcaInno1', 'MarcaInno2']]
aux_group = aux.groupby(['Año', 'Mes', 'Cliente']).sum()
result = aux_group.reset_index()
result

Unnamed: 0,Año,Mes,Cliente,Marca1,Marca2,Marca3,MarcaInno1,MarcaInno2
0,2019,5,9,0,0,0,0,0
1,2019,5,10,0,0,0,0,0
2,2019,5,11,0,0,0,0,0
3,2019,5,12,0,1,0,0,0
4,2019,5,13,0,0,0,0,0
...,...,...,...,...,...,...,...,...
204631,2020,9,20411,0,0,0,1,1
204632,2020,9,20414,0,0,0,1,1
204633,2020,9,20434,0,0,0,1,1
204634,2020,9,20456,0,0,0,1,1


In [13]:
# este proceso se tiene que realizar para pasarlo por el RandomForest sin problemas
cliente_info = df_est.copy()

cliente_info['Regional2'] = cliente_info['Regional2'].astype('category')
cliente_info['Gerencia2'] = cliente_info['Gerencia2'].astype('category')
cliente_info['SubCanal2'] = cliente_info['SubCanal2'].astype('category')
cliente_info['Categoria'] = cliente_info['Categoria'].astype('category')

cliente_info['Regional2'] = cliente_info.Regional2.cat.codes
cliente_info['Gerencia2'] = cliente_info.Gerencia2.cat.codes
cliente_info['SubCanal2'] = cliente_info.SubCanal2.cat.codes
cliente_info['Categoria'] = cliente_info.Categoria.cat.codes

primer producto

In [14]:
primer_producto = result[['Año', 'Mes', 'Cliente', 'Marca1']]

In [15]:
primer_producto = primer_producto.merge(cliente_info, on='Cliente')
primer_producto

Unnamed: 0,Año,Mes,Cliente,Marca1,Regional2,Gerencia2,SubCanal2,Categoria,Nevera
0,2019,5,9,0,0,3,22,2,1
1,2019,6,9,0,0,3,22,2,1
2,2019,7,9,0,0,3,22,2,1
3,2019,8,9,0,0,3,22,2,1
4,2019,9,9,0,0,3,22,2,1
...,...,...,...,...,...,...,...,...,...
204631,2020,9,20186,0,0,10,29,0,0
204632,2020,9,20309,0,0,1,22,2,0
204633,2020,9,20312,0,0,7,27,1,0
204634,2020,9,20434,0,0,1,24,2,0


In [16]:
# primer Marca a predecir
y = primer_producto['Marca1']
x = primer_producto[['Año', 'Mes', 'Cliente', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera']]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [18]:
clf1 = RandomForestClassifier(max_depth=2, random_state=42)
clf1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [19]:
answer = clf1.predict(X_test)
ans = clf1.predict_proba(X_test)
print(ans)
print(clf1.classes_)

[[0.992006   0.007994  ]
 [0.98616139 0.01383861]
 [0.97807069 0.02192931]
 ...
 [0.94371954 0.05628046]
 [0.98602662 0.01397338]
 [0.98382524 0.01617476]]
[0 1]


In [20]:
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.9736121970289289
(array([0., 1.]), array([0., 1.]), array([1, 0]))
0.5


segundo producto



In [21]:
segundo_producto = result[['Año', 'Mes', 'Cliente', 'Marca2']]
segundo_producto = segundo_producto.merge(cliente_info, on='Cliente')

# separacion de data y separacion entre train y test data
y = segundo_producto['Marca2']
x = segundo_producto[['Año', 'Mes', 'Cliente', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera']]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Modelo
clf2 = RandomForestClassifier(max_depth=2, random_state=42)
clf2.fit(X_train, y_train)

# Medidas
answer = clf2.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.9521354573885848
(array([0., 1.]), array([0., 1.]), array([1, 0]))
0.5


tercer producto

In [22]:
tercer_producto = result[['Año', 'Mes', 'Cliente', 'Marca3']]
tercer_producto = tercer_producto.merge(cliente_info, on='Cliente')

# separacion de data y separacion entre train y test data
y = tercer_producto['Marca3']
x = tercer_producto[['Año', 'Mes', 'Cliente', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera']]

# Modelo
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
clf3 = RandomForestClassifier(max_depth=2, random_state=42)
clf3.fit(X_train, y_train)

# Medidas
answer = clf3.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.7238809616888194
(array([0., 1.]), array([0., 1.]), array([1, 0]))
0.5


cuarto producto

In [23]:
cuarto_producto = result[['Año', 'Mes', 'Cliente', 'MarcaInno1']]
cuarto_producto = cuarto_producto.merge(cliente_info, on='Cliente')

# separacion de data y separacion entre train y test data
y = cuarto_producto['MarcaInno1']
x = cuarto_producto[['Año', 'Mes', 'Cliente', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera']]

# Modelo
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
clf4 = RandomForestClassifier(max_depth=2, random_state=42)
clf4.fit(X_train, y_train)

# Medidas
answer = clf4.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.9568999218139171
(array([0., 1.]), array([0., 1.]), array([1, 0]))
0.5


quinto producto

In [24]:
quinto_producto = result[['Año', 'Mes', 'Cliente', 'MarcaInno2']]
quinto_producto = quinto_producto.merge(cliente_info, on='Cliente')

# separacion de data y separacion entre train y test data
y = quinto_producto['MarcaInno2']
x = quinto_producto[['Año', 'Mes', 'Cliente', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera']]

# Modelo
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
clf5 = RandomForestClassifier(max_depth=2, random_state=42)
clf5.fit(X_train, y_train)

answer = clf5.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.9568999218139171
(array([0., 1.]), array([0., 1.]), array([1, 0]))
0.5


LLENANDO TEST

In [25]:
df_test = pd.read_csv(path + 'Input3_clientes_test.csv', sep=';')

In [26]:
df_test

Unnamed: 0,Cliente,Marca1,Marca2,Marca3,Marca_Inno1,Marca_Inno2
0,10,,,,,
1,12,,,,,
2,14,,,,,
3,15,,,,,
4,18,,,,,
...,...,...,...,...,...,...
9297,20186,,,,,
9298,20261,,,,,
9299,20309,,,,,
9300,20360,,,,,


In [27]:
test = df_test[['Cliente']]
test = test.merge(df_est, on='Cliente')
test['Año'] = 2020
test['Mes'] = 10
test = test[['Año', 'Mes', 'Cliente', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera']]
test

Unnamed: 0,Año,Mes,Cliente,Regional2,Gerencia2,SubCanal2,Categoria,Nevera
0,2020,10,10,Regional 1,Gerencia_3,Subcanal_1,Categoria_2,1
1,2020,10,12,Regional 1,Gerencia_3,Subcanal_4,Categoria_4,1
2,2020,10,14,Regional 1,Gerencia_3,Subcanal_1,Categoria_2,1
3,2020,10,15,Regional 1,Gerencia_2,Subcanal_3,Categoria_2,1
4,2020,10,18,Regional 1,Gerencia_3,Subcanal_1,Categoria_5,1
...,...,...,...,...,...,...,...,...
9297,2020,10,20186,Regional 1,Gerencia_9,Subcanal_7,Categoria_1,0
9298,2020,10,20261,Regional 1,Gerencia_10,Subcanal_1,Categoria_2,0
9299,2020,10,20309,Regional 1,Gerencia_10,Subcanal_3,Categoria_3,0
9300,2020,10,20360,Regional 1,Gerencia_10,Subcanal_8,Categoria_1,0


In [28]:
test['Regional2'] = test['Regional2'].astype('category')
test['Gerencia2'] = test['Gerencia2'].astype('category')
test['SubCanal2'] = test['SubCanal2'].astype('category')
test['Categoria'] = test['Categoria'].astype('category')

test['Regional2'] = test.Regional2.cat.codes
test['Gerencia2'] = test.Gerencia2.cat.codes
test['SubCanal2'] = test.SubCanal2.cat.codes
test['Categoria'] = test.Categoria.cat.codes

In [29]:
prob1 = clf1.predict_proba(test)[:, 1]
prob2 = clf2.predict_proba(test)[:, 1]
prob3 = clf3.predict_proba(test)[:, 1]
prob4 = clf4.predict_proba(test)[:, 1]
prob5 = clf5.predict_proba(test)[:, 1]

In [30]:
test['Marca1'] = prob1
test['Marca2'] = prob2
test['Marca3'] = prob3
test['Marca_Inno1'] = prob4
test['Marca_Inno2'] = prob5

In [31]:
final = test[['Cliente', 'Marca1', 'Marca2', 'Marca3', 'Marca_Inno1', 'Marca_Inno2']]
final

Unnamed: 0,Cliente,Marca1,Marca2,Marca3,Marca_Inno1,Marca_Inno2
0,10,0.055393,0.050081,0.256060,0.084214,0.084214
1,12,0.053019,0.080051,0.302099,0.089676,0.089676
2,14,0.055393,0.050081,0.256060,0.084214,0.084214
3,15,0.052316,0.048440,0.251948,0.084214,0.084214
4,18,0.056569,0.081615,0.306210,0.089874,0.089874
...,...,...,...,...,...,...
9297,20186,0.084881,0.025497,0.211585,0.216928,0.216928
9298,20261,0.090971,0.032706,0.220766,0.210166,0.210166
9299,20309,0.089158,0.031441,0.263293,0.220889,0.220889
9300,20360,0.088400,0.026031,0.216274,0.210332,0.210332


In [32]:
df_test.columns

Index(['Cliente', 'Marca1', 'Marca2', 'Marca3', 'Marca_Inno1', 'Marca_Inno2'], dtype='object')

In [33]:
final.columns = df_test.columns
final

Unnamed: 0,Cliente,Marca1,Marca2,Marca3,Marca_Inno1,Marca_Inno2
0,10,0.055393,0.050081,0.256060,0.084214,0.084214
1,12,0.053019,0.080051,0.302099,0.089676,0.089676
2,14,0.055393,0.050081,0.256060,0.084214,0.084214
3,15,0.052316,0.048440,0.251948,0.084214,0.084214
4,18,0.056569,0.081615,0.306210,0.089874,0.089874
...,...,...,...,...,...,...
9297,20186,0.084881,0.025497,0.211585,0.216928,0.216928
9298,20261,0.090971,0.032706,0.220766,0.210166,0.210166
9299,20309,0.089158,0.031441,0.263293,0.220889,0.220889
9300,20360,0.088400,0.026031,0.216274,0.210332,0.210332


In [34]:
final.columns

Index(['Cliente', 'Marca1', 'Marca2', 'Marca3', 'Marca_Inno1', 'Marca_Inno2'], dtype='object')

In [35]:
final.to_csv('input4_clientes_test.csv', sep=',', encoding='utf-8', index=False)

In [36]:
prueba = pd.read_csv(path + 'input4_clientes_test.csv', sep=',')
prueba

Unnamed: 0,Cliente,Marca1,Marca2,Marca3,Marca_Inno1,Marca_Inno2
0,10,0.055393,0.050081,0.256060,0.084214,0.084214
1,12,0.053019,0.080051,0.302099,0.089676,0.089676
2,14,0.055393,0.050081,0.256060,0.084214,0.084214
3,15,0.052316,0.048440,0.251948,0.084214,0.084214
4,18,0.056569,0.081615,0.306210,0.089874,0.089874
...,...,...,...,...,...,...
9297,20186,0.084881,0.025497,0.211585,0.216928,0.216928
9298,20261,0.090971,0.032706,0.220766,0.210166,0.210166
9299,20309,0.089158,0.031441,0.263293,0.220889,0.220889
9300,20360,0.088400,0.026031,0.216274,0.210332,0.210332


<h1>XGBOOST</h1>

producto uno

In [37]:
y = primer_producto['Marca1']
x = primer_producto[['Año', 'Mes', 'Cliente', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera']]

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [38]:
xgb1 = XGBClassifier(max_depth=9, n_estimators=500, learning_rate=0.9, random_state=42)
xgb1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.9, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [39]:
answer = xgb1.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.971706411258796
(array([0.        , 0.00971191, 1.        ]), array([0.        , 0.28611111, 1.        ]), array([2, 1, 0]))
0.6381996029355997


producto dos

In [None]:
y = segundo_producto['Marca2']
x = segundo_producto[['Año', 'Mes', 'Cliente', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera']]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [40]:


# Modelo
xgb2 = XGBClassifier(max_depth=9, n_estimators=500, learning_rate=0.9, random_state=42)
xgb2.fit(X_train, y_train)

# Medidas
answer = xgb2.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.9601739640344019
(array([0.        , 0.01442172, 1.        ]), array([0.        , 0.45482389, 1.        ]), array([2, 1, 0]))
0.7202010849555408


producto tres

In [41]:
y = tercer_producto['Marca3']
x = tercer_producto[['Año', 'Mes', 'Cliente', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera']]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Modelo
xgb3 = XGBClassifier(max_depth=9, n_estimators=500, learning_rate=0.9, random_state=42)
xgb3.fit(X_train, y_train)

# Medidas
answer = xgb3.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.8036796325254105
(array([0.        , 0.11874304, 1.        ]), array([0.        , 0.60030086, 1.        ]), array([2, 1, 0]))
0.7407789099432296


product cuatro

In [42]:
y = cuarto_producto['MarcaInno1']
x = cuarto_producto[['Año', 'Mes', 'Cliente', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera']]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Modelo
xgb4 = XGBClassifier(max_depth=9, n_estimators=500, learning_rate=0.9, random_state=42)
xgb4.fit(X_train, y_train)

# Medidas
answer = xgb4.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.9653782251759186
(array([0.        , 0.01521806, 1.        ]), array([0.       , 0.5345805, 1.       ]), array([2, 1, 0]))
0.7596812207332803


producto cinco

In [43]:
y = quinto_producto['MarcaInno2']
x = quinto_producto[['Año', 'Mes', 'Cliente', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera']]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Modelo
xgb5 = XGBClassifier(max_depth=9, n_estimators=500, learning_rate=0.9, random_state=42)
xgb5.fit(X_train, y_train)

# Medidas
answer = xgb5.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.9653782251759186
(array([0.        , 0.01521806, 1.        ]), array([0.       , 0.5345805, 1.       ]), array([2, 1, 0]))
0.7596812207332803


LLENAR TEST

In [44]:
test = df_test[['Cliente']]
test = test.merge(df_est, on='Cliente')
test['Año'] = 2020
test['Mes'] = 10
test = test[['Año', 'Mes', 'Cliente', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera']]

test['Regional2'] = test['Regional2'].astype('category')
test['Gerencia2'] = test['Gerencia2'].astype('category')
test['SubCanal2'] = test['SubCanal2'].astype('category')
test['Categoria'] = test['Categoria'].astype('category')

test['Regional2'] = test.Regional2.cat.codes
test['Gerencia2'] = test.Gerencia2.cat.codes
test['SubCanal2'] = test.SubCanal2.cat.codes
test['Categoria'] = test.Categoria.cat.codes

In [45]:
prob1 = xgb1.predict_proba(test)[:, 1]
prob2 = xgb2.predict_proba(test)[:, 1]
prob3 = xgb3.predict_proba(test)[:, 1]
prob4 = xgb4.predict_proba(test)[:, 1]
prob5 = xgb5.predict_proba(test)[:, 1]

In [46]:

test['Marca1'] = prob1
test['Marca2'] = prob2
test['Marca3'] = prob3
test['Marca_Inno1'] = prob4
test['Marca_Inno2'] = prob5

In [47]:
final = test[['Cliente', 'Marca1', 'Marca2', 'Marca3', 'Marca_Inno1', 'Marca_Inno2']]
final.columns = df_test.columns
final.to_csv('input4_clientes_test.csv', sep=',', encoding='utf-8', index=False)

Nueva Estrategia

In [48]:
df_new_venta = df_venta.copy()
def createProduct(x):
    result = x['Marca2'] + '-' + x['Cupo2'] + '-' + x['CapacidadEnvase2']
    return result
df_new_venta['producto'] = df_new_venta.apply(createProduct, axis=1)
df_new_venta

Unnamed: 0,Año,Mes,Cliente,SegmentoPrecio2,Marca2,Cupo2,CapacidadEnvase2,Volumen,disc,nr,producto
0,2019,5,10,SegmentoPrecio_1,Marca_1,Cupo_1,CapacidadEnvase_12,0.112229,-30.590603,900.328567,Marca_1-Cupo_1-CapacidadEnvase_12
1,2019,5,10,SegmentoPrecio_1,Marca_2,Cupo_2,CapacidadEnvase_10,0.021734,0.000000,149.184463,Marca_2-Cupo_2-CapacidadEnvase_10
2,2019,5,10,SegmentoPrecio_2,Marca_3,Cupo_2,CapacidadEnvase_10,0.043469,0.000000,359.625828,Marca_3-Cupo_2-CapacidadEnvase_10
3,2019,5,10,SegmentoPrecio_1,Marca_1,Cupo_2,CapacidadEnvase_16,0.026345,-31.065261,134.748399,Marca_1-Cupo_2-CapacidadEnvase_16
4,2019,5,10,SegmentoPrecio_1,Marca_4,Cupo_2,CapacidadEnvase_10,0.086938,0.000000,496.901005,Marca_4-Cupo_2-CapacidadEnvase_10
...,...,...,...,...,...,...,...,...,...,...,...
1406111,2020,8,20577,SegmentoPrecio_3,Marca_39,Cupo_2,CapacidadEnvase_10,0.039122,0.000000,544.150314,Marca_39-Cupo_2-CapacidadEnvase_10
1406112,2020,7,20580,SegmentoPrecio_1,Marca_38,Cupo_2,CapacidadEnvase_10,0.000000,0.000000,0.000000,Marca_38-Cupo_2-CapacidadEnvase_10
1406113,2020,7,20580,SegmentoPrecio_3,Marca_39,Cupo_2,CapacidadEnvase_10,0.058683,0.000000,759.218996,Marca_39-Cupo_2-CapacidadEnvase_10
1406114,2020,9,20580,SegmentoPrecio_1,Marca_38,Cupo_2,CapacidadEnvase_10,0.000000,0.000000,0.000000,Marca_38-Cupo_2-CapacidadEnvase_10


In [49]:
df_new_venta['SegmentoPrecio2'] = df_new_venta['SegmentoPrecio2'].astype('category')
df_new_venta['Marca2'] = df_new_venta['Marca2'].astype('category')
df_new_venta['Cupo2'] = df_new_venta['Cupo2'].astype('category')
df_new_venta['CapacidadEnvase2'] = df_new_venta['CapacidadEnvase2'].astype('category')


df_new_venta['SegmentoPrecio2'] = df_new_venta.SegmentoPrecio2.cat.codes
df_new_venta['Marca2'] = df_new_venta.Marca2.cat.codes
df_new_venta['Cupo2'] = df_new_venta.Cupo2.cat.codes
df_new_venta['CapacidadEnvase2'] = df_new_venta.CapacidadEnvase2.cat.codes

df_new_venta

Unnamed: 0,Año,Mes,Cliente,SegmentoPrecio2,Marca2,Cupo2,CapacidadEnvase2,Volumen,disc,nr,producto
0,2019,5,10,0,0,0,3,0.112229,-30.590603,900.328567,Marca_1-Cupo_1-CapacidadEnvase_12
1,2019,5,10,0,11,1,1,0.021734,0.000000,149.184463,Marca_2-Cupo_2-CapacidadEnvase_10
2,2019,5,10,1,22,1,1,0.043469,0.000000,359.625828,Marca_3-Cupo_2-CapacidadEnvase_10
3,2019,5,10,0,0,1,7,0.026345,-31.065261,134.748399,Marca_1-Cupo_2-CapacidadEnvase_16
4,2019,5,10,0,33,1,1,0.086938,0.000000,496.901005,Marca_4-Cupo_2-CapacidadEnvase_10
...,...,...,...,...,...,...,...,...,...,...,...
1406111,2020,8,20577,2,32,1,1,0.039122,0.000000,544.150314,Marca_39-Cupo_2-CapacidadEnvase_10
1406112,2020,7,20580,0,31,1,1,0.000000,0.000000,0.000000,Marca_38-Cupo_2-CapacidadEnvase_10
1406113,2020,7,20580,2,32,1,1,0.058683,0.000000,759.218996,Marca_39-Cupo_2-CapacidadEnvase_10
1406114,2020,9,20580,0,31,1,1,0.000000,0.000000,0.000000,Marca_38-Cupo_2-CapacidadEnvase_10


In [53]:
df_2019 = df_new_venta[df_new_venta['Año'] == 2019]
df_2020 = df_new_venta[df_new_venta['Año'] == 2020]

In [54]:
df_2019.head()

Unnamed: 0,Año,Mes,Cliente,SegmentoPrecio2,Marca2,Cupo2,CapacidadEnvase2,Volumen,disc,nr,producto
0,2019,5,10,0,0,0,3,0.112229,-30.590603,900.328567,Marca_1-Cupo_1-CapacidadEnvase_12
1,2019,5,10,0,11,1,1,0.021734,0.0,149.184463,Marca_2-Cupo_2-CapacidadEnvase_10
2,2019,5,10,1,22,1,1,0.043469,0.0,359.625828,Marca_3-Cupo_2-CapacidadEnvase_10
3,2019,5,10,0,0,1,7,0.026345,-31.065261,134.748399,Marca_1-Cupo_2-CapacidadEnvase_16
4,2019,5,10,0,33,1,1,0.086938,0.0,496.901005,Marca_4-Cupo_2-CapacidadEnvase_10


In [55]:
df_2019.drop('Año', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [56]:
df_2019.head()

Unnamed: 0,Mes,Cliente,SegmentoPrecio2,Marca2,Cupo2,CapacidadEnvase2,Volumen,disc,nr,producto
0,5,10,0,0,0,3,0.112229,-30.590603,900.328567,Marca_1-Cupo_1-CapacidadEnvase_12
1,5,10,0,11,1,1,0.021734,0.0,149.184463,Marca_2-Cupo_2-CapacidadEnvase_10
2,5,10,1,22,1,1,0.043469,0.0,359.625828,Marca_3-Cupo_2-CapacidadEnvase_10
3,5,10,0,0,1,7,0.026345,-31.065261,134.748399,Marca_1-Cupo_2-CapacidadEnvase_16
4,5,10,0,33,1,1,0.086938,0.0,496.901005,Marca_4-Cupo_2-CapacidadEnvase_10


In [64]:
unique_mes = list(df_2019['Mes'].unique())
unique_cliente = list(df_2019['Cliente'].unique())

list_dict = []
for mes in unique_mes:
  for cliente in unique_cliente:
    aux = df_2019[(df_2019['Mes'] == mes) & (df_2019['Cliente'] == cliente)]
    if len(aux) > 0:
      dicty = {'Año': 2019, 'Mes': mes, 'Cliente': cliente}
      Segmento = aux['SegmentoPrecio2'].value_counts()
      Segmento_index = list(Segmento.index)
      Segmento_values = np.array(list(Segmento.values))
      best_segmento = np.argmax(Segmento_values)
      best_index = Segmento_index[best_segmento]
      dicty['SegmentoPrecio2'] = best_index

      marca = aux['Marca2'].value_counts()
      marca_index = list(marca.index)
      marca_values = np.array(list(marca.values))
      best_marca = np.argmax(marca_values)
      best_index = marca_index[best_marca]
      dicty['Marca2'] = best_index


      cupo = aux['Cupo2'].value_counts()
      cupo_index = list(cupo.index)
      cupo_values = np.array(list(cupo.values))
      best_cupo = np.argmax(cupo_values)
      best_index = cupo_index[best_cupo]
      dicty['Cupo2'] = best_index

      list_dict.append(dicty)

In [65]:
df_new_2019 = pd.DataFrame(list_dict)
df_new_2019.head()

Unnamed: 0,Año,Mes,Cliente,SegmentoPrecio2,Marca2,Cupo2
0,2019,5,10,0,0,1
1,2019,5,12,1,36,1
2,2019,5,14,0,0,1
3,2019,5,15,0,34,1
4,2019,5,18,0,0,1


In [66]:
df_new_2019.isnull().sum()

Año                0
Mes                0
Cliente            0
SegmentoPrecio2    0
Marca2             0
Cupo2              0
dtype: int64

In [67]:
unique_mes = list(df_2020['Mes'].unique())
unique_cliente = list(df_2020['Cliente'].unique())

list_dict = []
for mes in unique_mes:
  for cliente in unique_cliente:
    aux = df_2020[(df_2020['Mes'] == mes) & (df_2020['Cliente'] == cliente)]
    if len(aux) > 0:
      dicty = {'Año': 2020, 'Mes': mes, 'Cliente': cliente}
      Segmento = aux['SegmentoPrecio2'].value_counts()
      Segmento_index = list(Segmento.index)
      Segmento_values = np.array(list(Segmento.values))
      best_segmento = np.argmax(Segmento_values)
      best_index = Segmento_index[best_segmento]
      dicty['SegmentoPrecio2'] = best_index

      marca = aux['Marca2'].value_counts()
      marca_index = list(marca.index)
      marca_values = np.array(list(marca.values))
      best_marca = np.argmax(marca_values)
      best_index = marca_index[best_marca]
      dicty['Marca2'] = best_index


      cupo = aux['Cupo2'].value_counts()
      cupo_index = list(cupo.index)
      cupo_values = np.array(list(cupo.values))
      best_cupo = np.argmax(cupo_values)
      best_index = cupo_index[best_cupo]
      dicty['Cupo2'] = best_index

      list_dict.append(dicty)

In [68]:
df_new_2020 = pd.DataFrame(list_dict)
df_new_2020.head()

Unnamed: 0,Año,Mes,Cliente,SegmentoPrecio2,Marca2,Cupo2
0,2020,1,10,0,0,1
1,2020,1,12,0,33,1
2,2020,1,14,0,0,1
3,2020,1,15,0,34,1
4,2020,1,18,0,0,1


In [70]:
df_new_venta_2 = pd.concat([df_new_2019, df_new_2020], ignore_index=True)

In [71]:
df_new_venta_2.head()

Unnamed: 0,Año,Mes,Cliente,SegmentoPrecio2,Marca2,Cupo2
0,2019,5,10,0,0,1
1,2019,5,12,1,36,1
2,2019,5,14,0,0,1
3,2019,5,15,0,34,1
4,2019,5,18,0,0,1


In [78]:
unique_cliente = list(df_new_venta_2['Cliente'].unique())

list_dict = []
for cliente in unique_cliente:
  cliente_df = df_new_venta_2[df_new_venta_2['Cliente'] == cliente]
  dicty = {'Cliente': cliente}
  marcas = cliente_df['Marca2'].value_counts()
  marcas_index = list(marcas.index)
  marcas_values = list(marcas.values)

  union = list(zip(marcas_index, marcas_values))
  union_sort = sorted(union, key=lambda x: x[1], reverse=True)
  for i in range(3):
    if i < len(union_sort):
      first = union_sort[i]
    dicty['fav{}'.format(i + 1)] = first[0]

  list_dict.append(dicty)


In [79]:
df_fav = pd.DataFrame(list_dict)
df_fav.head()

Unnamed: 0,Cliente,fav1,fav2,fav3
0,10,0,34,33
1,12,3,35,36
2,14,0,34,34
3,15,34,0,33
4,18,0,34,34


In [80]:
cliente_info = cliente_info.merge(df_fav, on='Cliente')
cliente_info.head()

Unnamed: 0,Cliente,Regional2,Gerencia2,SubCanal2,Categoria,Nevera,fav1,fav2,fav3
0,1,0,0,0,0,0,33,31,11
1,2,0,0,0,0,0,33,1,36
2,3,0,0,0,0,0,33,31,31
3,4,0,0,0,0,1,11,31,6
4,5,0,0,0,1,1,33,0,12


In [81]:
result.head()

Unnamed: 0,Año,Mes,Cliente,Marca1,Marca2,Marca3,MarcaInno1,MarcaInno2
0,2019,5,9,0,0,0,0,0
1,2019,5,10,0,0,0,0,0
2,2019,5,11,0,0,0,0,0
3,2019,5,12,0,1,0,0,0
4,2019,5,13,0,0,0,0,0


In [110]:
full_union = result.merge(cliente_info, on='Cliente')
full_union.head()

Unnamed: 0,Año,Mes,Cliente,Marca1,Marca2,Marca3,MarcaInno1,MarcaInno2,Regional2,Gerencia2,SubCanal2,Categoria,Nevera,fav1,fav2,fav3
0,2019,5,9,0,0,0,0,0,0,3,22,2,1,0,34,35
1,2019,6,9,0,0,0,0,0,0,3,22,2,1,0,34,35
2,2019,7,9,0,0,0,0,0,0,3,22,2,1,0,34,35
3,2019,8,9,0,0,0,0,0,0,3,22,2,1,0,34,35
4,2019,9,9,0,0,0,0,0,0,3,22,2,1,0,34,35


primer producto

In [148]:
primer_prod = full_union[['Año', 'Mes', 'Cliente', 'Marca1', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera', 'fav1', 'fav2', 'fav3']]
primer_prod.head()

Unnamed: 0,Año,Mes,Cliente,Marca1,Regional2,Gerencia2,SubCanal2,Categoria,Nevera,fav1,fav2,fav3
0,2019,5,9,0,0,3,22,2,1,0,34,35
1,2019,6,9,0,0,3,22,2,1,0,34,35
2,2019,7,9,0,0,3,22,2,1,0,34,35
3,2019,8,9,0,0,3,22,2,1,0,34,35
4,2019,9,9,0,0,3,22,2,1,0,34,35


In [330]:
y = primer_prod['Marca1']
x = primer_prod[['Año', 'Mes', 'Cliente', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera', 'fav1', 'fav2', 'fav3']]

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [331]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.quniform ('gamma', 0, 1, 0.01),
        'reg_alpha' : hp.quniform('reg_alpha', 0,10,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0, 1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': hp.quniform('n_estimators', 50, 200, 10),
        'learning_rate': hp.quniform('learning_rate', 0.1, 1, 0.01),
        'seed': 0
    }

In [332]:
def objective(space):
    XGB=XGBClassifier(n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'],
                      reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']), learning_rate=space['learning_rate'],
                      colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    XGB.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = XGB.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [333]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 50,
                        trials = trials)

SCORE:
0.9732701329163409
SCORE:
0.9736121970289289
SCORE:
0.9732701329163409
SCORE:
0.9732701329163409
SCORE:
0.9732701329163409
SCORE:
0.9741985926505082
SCORE:
0.9732701329163409
SCORE:
0.9733434323690383
SCORE:
0.9732701329163409
SCORE:
0.9732701329163409
SCORE:
0.9739542611415168
SCORE:
0.9732701329163409
SCORE:
0.9732701329163409
SCORE:
0.9739542611415168
SCORE:
0.9732701329163409
SCORE:
0.9732701329163409
SCORE:
0.9732701329163409
SCORE:
0.9732701329163409
SCORE:
0.9732701329163409
SCORE:
0.9732701329163409
SCORE:
0.9735633307271305
SCORE:
0.9739786942924159
SCORE:
0.9732701329163409
SCORE:
0.9732701329163409
SCORE:
0.9740031274433151
SCORE:
0.9737832290852229
SCORE:
0.9732701329163409
SCORE:
0.9738076622361219
SCORE:
0.9732701329163409
SCORE:
0.9732701329163409
SCORE:
0.9739053948397185
SCORE:
0.9732701329163409
SCORE:
0.9737832290852229
SCORE:
0.9736121970289289
SCORE:
0.9742474589523065
SCORE:
0.9732701329163409
SCORE:
0.9732456997654417
SCORE:
0.9740031274433151
SCORE:
0.973

In [334]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.3667630798061273, 'gamma': 0.26, 'learning_rate': 0.8300000000000001, 'max_depth': 17.0, 'min_child_weight': 1.0, 'n_estimators': 90.0, 'reg_alpha': 2.0, 'reg_lambda': 0.902408278776433}


In [335]:
xgb1 = XGBClassifier(colsample_bytree=0.3667630798061273, gamma=0.26, learning_rate=0.8300000000000001, max_depth=17 , min_child_weight=1, n_estimators=90, reg_alpha=2, reg_lambda=0.902408278776433)
xgb1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3667630798061273,
              gamma=0.26, learning_rate=0.8300000000000001, max_delta_step=0,
              max_depth=17, min_child_weight=1, missing=None, n_estimators=90,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=2, reg_lambda=0.902408278776433,
              scale_pos_weight=1, seed=None, silent=None, subsample=1,
              verbosity=1)

In [336]:
answer = xgb1.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.9736610633307271
(array([0.        , 0.00557313, 1.        ]), array([0.        , 0.21755027, 1.        ]), array([2, 1, 0]))
0.6059885728699147


segundo prod

In [297]:
segundo_prod = full_union[['Año', 'Mes', 'Cliente', 'Marca2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera', 'fav1', 'fav2', 'fav3']]

y = segundo_prod['Marca2']
x = segundo_prod[['Año', 'Mes','Cliente', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera', 'fav1', 'fav2', 'fav3']]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [298]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 30,
                        trials = trials)
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

SCORE:
0.9568510555121188
SCORE:
0.9574863174354965
SCORE:
0.9566067240031274
SCORE:
0.956337959343237
SCORE:
0.9540656763096169
SCORE:
0.9562402267396404
SCORE:
0.956875488663018
SCORE:
0.9575596168881939
SCORE:
0.9564112587959344
SCORE:
0.9542855746677091
SCORE:
0.95653342455043
SCORE:
0.9572175527756059
SCORE:
0.9566067240031274
SCORE:
0.9548231039874903
SCORE:
0.9565578577013292
SCORE:
0.956191360437842
SCORE:
0.9553362001563722
SCORE:
0.9566311571540266
SCORE:
0.9540656763096169
SCORE:
0.9569732212666145
SCORE:
0.9564112587959344
SCORE:
0.9570709538702111
SCORE:
0.9591966379984362
SCORE:
0.9595387021110242
SCORE:
0.9590011727912432
SCORE:
0.958976739640344
SCORE:
0.9586102423768569
SCORE:
0.9587568412822518
SCORE:
0.9586102423768569
SCORE:
0.9556293979671618
100%|██████████| 30/30 [03:15<00:00,  6.53s/it, best loss: -0.9595387021110242]
The best hyperparameters are :  

{'colsample_bytree': 0.3216478669732097, 'gamma': 0.01, 'learning_rate': 1.0, 'max_depth': 16.0, 'min_child_weig

In [299]:
xgb2 = XGBClassifier(colsample_bytree=0.3216478669732097, gamma=0.01, learning_rate=0.1, max_depth=16 , min_child_weight=6, n_estimators=200, reg_alpha=0, reg_lambda=0.13284107259706027)
xgb2.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3216478669732097,
              gamma=0.01, learning_rate=0.1, max_delta_step=0, max_depth=16,
              min_child_weight=6, missing=None, n_estimators=200, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=0.13284107259706027, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

In [300]:
answer = xgb2.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.9614933541829555
(array([0.        , 0.00489142, 1.        ]), array([0.        , 0.26329787, 1.        ]), array([2, 1, 0]))
0.6292032283234602


producto tres

In [301]:
tercero_prod = full_union[['Año', 'Mes', 'Cliente', 'Marca3', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera', 'fav1', 'fav2', 'fav3']]

y = tercero_prod['Marca3']
x = tercero_prod[['Año', 'Mes','Cliente', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera', 'fav1', 'fav2', 'fav3']]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)



In [302]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 30,
                        trials = trials)
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

SCORE:
0.7549599296325255
SCORE:
0.7653440187646599
SCORE:
0.7601886239249414
SCORE:
0.7624853401094606
SCORE:
0.7609460516028147
SCORE:
0.7689112587959344
SCORE:
0.752809812353401
SCORE:
0.7716966379984362
SCORE:
0.7676896012509773
SCORE:
0.7780003909304144
SCORE:
0.7555218921032056
SCORE:
0.7588203674745895
SCORE:
0.7652951524628616
SCORE:
0.75994429241595
SCORE:
0.7586737685691947
SCORE:
0.7663946442533229
SCORE:
0.7627785379202502
SCORE:
0.7660281469898358
SCORE:
0.7553752931978108
SCORE:
0.7613858483189992
SCORE:
0.7762412040656763
SCORE:
0.7772673964034402
SCORE:
0.758551602814699
SCORE:
0.7399091086786552
SCORE:
0.7763145035183737
SCORE:
0.7766565676309617
SCORE:
0.7581851055512119
SCORE:
0.7713545738858483
SCORE:
0.7739444878811571
SCORE:
0.7627052384675528
100%|██████████| 30/30 [04:35<00:00,  9.19s/it, best loss: -0.7780003909304144]
The best hyperparameters are :  

{'colsample_bytree': 0.8074219629301445, 'gamma': 0.64, 'learning_rate': 0.55, 'max_depth': 18.0, 'min_child_w

In [303]:
xgb3 = XGBClassifier(colsample_bytree=0.8074219629301445, gamma=0.64, learning_rate=0.55, max_depth=18 , min_child_weight=4, n_estimators=110, reg_alpha=1, reg_lambda=0.1643095431794116)
xgb3.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8074219629301445,
              gamma=0.64, learning_rate=0.55, max_delta_step=0, max_depth=18,
              min_child_weight=4, missing=None, n_estimators=110, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=1, reg_lambda=0.1643095431794116, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

In [304]:
answer = xgb3.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.81022771696638
(array([0.        , 0.10805793, 1.        ]), array([0.        , 0.59610792, 1.        ]), array([2, 1, 0]))
0.7440249944447848


producto cuatro

In [305]:
cuarto_prod = full_union[['Año', 'Mes', 'Cliente', 'MarcaInno1', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera', 'fav1', 'fav2', 'fav3']]

y = cuarto_prod['MarcaInno1']
x = cuarto_prod[['Año', 'Mes','Cliente', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera', 'fav1', 'fav2', 'fav3']]

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)



In [306]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 30,
                        trials = trials)
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

SCORE:
0.9708023846755277
SCORE:
0.9705824863174355
SCORE:
0.9707535183737295
SCORE:
0.9707290852228303
SCORE:
0.9702892885066459
SCORE:
0.9570953870211102
SCORE:
0.9570953870211102
SCORE:
0.9638878029710711
SCORE:
0.9570953870211102
SCORE:
0.9593921032056294
SCORE:
0.9570953870211102
SCORE:
0.9646696637998436
SCORE:
0.9709734167318217
SCORE:
0.9704358874120407
SCORE:
0.9703137216575449
SCORE:
0.9706557857701329
SCORE:
0.9593921032056294
SCORE:
0.965964620797498
SCORE:
0.9707046520719312
SCORE:
0.9705580531665363
SCORE:
0.9703625879593433
SCORE:
0.970851250977326
SCORE:
0.9709245504300235
SCORE:
0.9709978498827209
SCORE:
0.9570953870211102
SCORE:
0.9593921032056294
SCORE:
0.9709001172791243
SCORE:
0.9570953870211102
SCORE:
0.970851250977326
SCORE:
0.9710711493354183
100%|██████████| 30/30 [02:39<00:00,  5.31s/it, best loss: -0.9710711493354183]
The best hyperparameters are :  

{'colsample_bytree': 0.637032980303668, 'gamma': 0.88, 'learning_rate': 0.67, 'max_depth': 7.0, 'min_child_we

In [307]:
xgb4 = XGBClassifier(colsample_bytree=0.637032980303668, gamma=0.88, learning_rate=0.67, max_depth=7 , min_child_weight=3, n_estimators=190, reg_alpha=4, reg_lambda=0.4519111386667546)
xgb4.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.637032980303668,
              gamma=0.88, learning_rate=0.67, max_delta_step=0, max_depth=7,
              min_child_weight=3, missing=None, n_estimators=190, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=4, reg_lambda=0.4519111386667546, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

In [308]:
answer = xgb4.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.9706313526192337
(array([0.        , 0.00827121, 1.        ]), array([0. , 0.5, 1. ]), array([2, 1, 0]))
0.7458643929337282


producto cinco

In [309]:
cinco_prod = full_union[['Año', 'Mes', 'Cliente', 'MarcaInno2', 'Regional2', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera', 'fav1', 'fav2', 'fav3']]

y = cinco_prod['MarcaInno2']
x = cinco_prod[['Año', 'Mes','Cliente', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera', 'fav1', 'fav2', 'fav3']]

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [310]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 30,
                        trials = trials)
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

SCORE:
0.9570953870211102
SCORE:
0.9570953870211102
SCORE:
0.9709734167318217
SCORE:
0.9593921032056294
SCORE:
0.9708268178264269
SCORE:
0.9707779515246286
SCORE:
0.9647918295543393
SCORE:
0.9709978498827209
SCORE:
0.9570953870211102
SCORE:
0.9570953870211102
SCORE:
0.9709489835809226
SCORE:
0.9593921032056294
SCORE:
0.9570953870211102
SCORE:
0.9570953870211102
SCORE:
0.9570953870211102
SCORE:
0.9593921032056294
SCORE:
0.970851250977326
SCORE:
0.9570953870211102
SCORE:
0.970851250977326
SCORE:
0.9570953870211102
SCORE:
0.9711200156372166
SCORE:
0.9709978498827209
SCORE:
0.9708756841282252
SCORE:
0.9707046520719312
SCORE:
0.9709001172791243
SCORE:
0.9709734167318217
SCORE:
0.9570953870211102
SCORE:
0.970851250977326
SCORE:
0.9707779515246286
SCORE:
0.9709734167318217
100%|██████████| 30/30 [02:13<00:00,  4.46s/it, best loss: -0.9711200156372166]
The best hyperparameters are :  

{'colsample_bytree': 0.9500298028057517, 'gamma': 0.99, 'learning_rate': 0.6, 'max_depth': 3.0, 'min_child_we

In [311]:
xgb5 = XGBClassifier(colsample_bytree=0.9500298028057517, gamma=0.99, learning_rate=0.6, max_depth=3 , min_child_weight=4, n_estimators=80, reg_alpha=10, reg_lambda=0.5106728652198309)
xgb5.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9500298028057517,
              gamma=0.99, learning_rate=0.6, max_delta_step=0, max_depth=3,
              min_child_weight=4, missing=None, n_estimators=80, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=10, reg_lambda=0.5106728652198309, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

In [312]:
answer = xgb5.predict(X_test)
acc_score = accuracy_score(y_test, answer)
print(acc_score)
roc = roc_curve(y_test, answer)
print(roc)
auc_score = auc(roc[0], roc[1])
print(auc_score)

0.9708023846755277
(array([0.        , 0.00479935, 1.        ]), array([0.        , 0.42653759, 1.        ]), array([2, 1, 0]))
0.7108691194747213


LLENAR

In [327]:
test = df_test[['Cliente']]
test = test.merge(cliente_info, on='Cliente')
test['Año'] = 2020
test['Mes'] = 10
test = test[['Año', 'Mes', 'Cliente', 'Gerencia2', 'SubCanal2', 'Categoria', 'Nevera', 'fav1', 'fav2', 'fav3']]

In [328]:
prob1 = xgb1.predict_proba(test)[:, 1]
prob2 = xgb2.predict_proba(test)[:, 1]
prob3 = xgb3.predict_proba(test)[:, 1]
prob4 = xgb4.predict_proba(test)[:, 1]
prob5 = xgb5.predict_proba(test)[:, 1]

test['Marca1'] = prob1
test['Marca2'] = prob2
test['Marca3'] = prob3
test['Marca_Inno1'] = prob4
test['Marca_Inno2'] = prob5

In [329]:
final = test[['Cliente', 'Marca1', 'Marca2', 'Marca3', 'Marca_Inno1', 'Marca_Inno2']]
final.columns = df_test.columns
final.to_csv('input4_clientes_test.csv', sep=',', encoding='utf-8', index=False)