In [1]:
import pandas as pd
import json

In [2]:
# Load the clustered training and validation splits from CSV.
train_csv, val_csv = 'train_data_cluster.csv', 'val_data_cluster.csv'
df_train = pd.read_csv(train_csv)
df_val = pd.read_csv(val_csv)

In [3]:
# Rename `installments` to `product`, the column name the rest of the
# notebook groups and pivots on.
column_renames = {'installments': 'product'}
df_train = df_train.rename(columns=column_renames)
df_val = df_val.rename(columns=column_renames)

In [4]:
# Preview the first five training rows to check the loaded schema.
df_train.head(n=5)

Unnamed: 0,id,vintage,internal_feature_0,internal_feature_1,internal_feature_2,internal_feature_3,internal_feature_4,internal_feature_5,internal_feature_6,internal_feature_7,...,bureau_feature_17,bureau_feature_18,bureau_feature_19,bureau_feature_20,bureau_feature_21,product,is_loan,is_default,relevance_score,cluster
0,10751409,2021-03-01,WEB,0.2,0.897597,L5,REGISTERED_PRIVATE,DEBT,0.039971,0.034312,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,12,1,0,2,6
1,9208477,2020-02-01,serasa-ecred,0.115813,0.592476,L8,REGISTERED_PRIVATE,DEBT,0.064072,0.046405,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,12,1,0,2,1
2,10219088,2020-11-01,FinanZero,0.106667,0.626959,L8,REGISTERED_PRIVATE,OTHERS,0.139974,0.037224,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,18,1,1,1,1
3,11082451,2021-06-01,serasa-ecred,0.133333,0.92372,L3,REGISTERED_PRIVATE,DEBT,0.049971,0.001717,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,12,1,0,2,1
4,9097185,2020-01-01,WEB,0.133333,0.317659,L11,REGISTERED_PRIVATE,DEBT,0.199976,0.122558,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,18,1,0,2,6


In [5]:
def group_risk(x):
    """Map a risk level label to its coarse risk group.

    'L1'-'L4' -> 'A', 'L5'-'L7' -> 'B', 'L8'-'L10' -> 'C'.
    Any other value (for example 'L11', or a missing value) falls
    through to 'D'.
    """
    tiers = (
        ('A', ('L1', 'L2', 'L3', 'L4')),
        ('B', ('L5', 'L6', 'L7')),
        ('C', ('L8', 'L9', 'L10')),
    )
    for label, levels in tiers:
        if x in levels:
            return label
    # Catch-all bucket for everything not listed above.
    return 'D'
    
# Derive the coarse risk group from `internal_feature_3` on both splits.
for frame in (df_train, df_val):
    frame['group'] = frame['internal_feature_3'].apply(group_risk)

In [6]:
# Build the segment key "<risk group><cluster id>", e.g. 'B' and 6 -> 'B6'.
# Column-wise string concatenation replaces the row-wise `apply` whose
# `x[0]` / `x[1]` lookups treated integer keys as positions on a
# label-indexed row; pandas deprecates that fallback (FutureWarning in 2.1+).
for frame in (df_train, df_val):
    frame['combination'] = frame['group'].astype(str) + frame['cluster'].astype(str)

In [7]:
# Preview the training rows now that `group` and `combination` exist.
df_train.head(n=5)

Unnamed: 0,id,vintage,internal_feature_0,internal_feature_1,internal_feature_2,internal_feature_3,internal_feature_4,internal_feature_5,internal_feature_6,internal_feature_7,...,bureau_feature_19,bureau_feature_20,bureau_feature_21,product,is_loan,is_default,relevance_score,cluster,group,combination
0,10751409,2021-03-01,WEB,0.2,0.897597,L5,REGISTERED_PRIVATE,DEBT,0.039971,0.034312,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,12,1,0,2,6,B,B6
1,9208477,2020-02-01,serasa-ecred,0.115813,0.592476,L8,REGISTERED_PRIVATE,DEBT,0.064072,0.046405,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,12,1,0,2,1,C,C1
2,10219088,2020-11-01,FinanZero,0.106667,0.626959,L8,REGISTERED_PRIVATE,OTHERS,0.139974,0.037224,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,18,1,1,1,1,C,C1
3,11082451,2021-06-01,serasa-ecred,0.133333,0.92372,L3,REGISTERED_PRIVATE,DEBT,0.049971,0.001717,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,12,1,0,2,1,A,A1
4,9097185,2020-01-01,WEB,0.133333,0.317659,L11,REGISTERED_PRIVATE,DEBT,0.199976,0.122558,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,18,1,0,2,6,D,D6


In [8]:
# Row count per combination, listed alphabetically by combination label.
# The index and the count column are named explicitly so the
# ('index', 'combination') layout is the same on every pandas version:
# since pandas 2.0 `value_counts().reset_index()` yields
# ('combination', 'count'), so `sort_values('index')` raised a KeyError there.
(
    df_train['combination']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='combination')
    .sort_values('index')
)

Unnamed: 0,index,combination
8,A0,1449
7,A1,1732
14,A2,780
6,A3,2010
12,A4,958
9,A5,1377
2,A6,2970
15,B0,706
1,B1,3645
10,B2,1049


In [9]:
def adjust_combination(x):
    """Collapse selected risk-group-D combinations into one 'D99' bucket.

    'D0', 'D2', 'D3', 'D4', 'D5' and 'D6' become 'D99'; every other value
    is returned unchanged.
    """
    # NOTE(review): 'D1' is not in this list, so it stays a separate
    # combination - presumably on purpose; confirm.
    merged_d_combinations = ('D0', 'D2', 'D3', 'D4', 'D5', 'D6')
    return 'D99' if x in merged_d_combinations else x

# Apply the D-bucket merge to the combination key of both splits.
for frame in (df_train, df_val):
    frame['combination'] = frame['combination'].apply(adjust_combination)

In [10]:
# Preview the training rows after the combination adjustment.
df_train.head(n=5)

Unnamed: 0,id,vintage,internal_feature_0,internal_feature_1,internal_feature_2,internal_feature_3,internal_feature_4,internal_feature_5,internal_feature_6,internal_feature_7,...,bureau_feature_19,bureau_feature_20,bureau_feature_21,product,is_loan,is_default,relevance_score,cluster,group,combination
0,10751409,2021-03-01,WEB,0.2,0.897597,L5,REGISTERED_PRIVATE,DEBT,0.039971,0.034312,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,12,1,0,2,6,B,B6
1,9208477,2020-02-01,serasa-ecred,0.115813,0.592476,L8,REGISTERED_PRIVATE,DEBT,0.064072,0.046405,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,12,1,0,2,1,C,C1
2,10219088,2020-11-01,FinanZero,0.106667,0.626959,L8,REGISTERED_PRIVATE,OTHERS,0.139974,0.037224,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,18,1,1,1,1,C,C1
3,11082451,2021-06-01,serasa-ecred,0.133333,0.92372,L3,REGISTERED_PRIVATE,DEBT,0.049971,0.001717,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,12,1,0,2,1,A,A1
4,9097185,2020-01-01,WEB,0.133333,0.317659,L11,REGISTERED_PRIVATE,DEBT,0.199976,0.122558,...,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,NAO_CONSTAM_OCORRENCIAS,18,1,0,2,6,D,D99


In [11]:
# Persist both splits, with the adjusted combination key, without the index.
for frame, csv_path in (
    (df_train, 'train_data_adjusted_combination.csv'),
    (df_val, 'val_data_adjusted_combination.csv'),
):
    frame.to_csv(csv_path, index=False)

Current Method - static product offering

In [12]:
# Baseline ranking: every combination is offered the same fixed product order.
static_product_order = [12, 18, 24, 30, 36]

# Build all rows first and construct the frame once. Growing a DataFrame with
# `.loc` inside nested loops re-scans the whole frame for every cell written.
# dtype=object keeps the products as plain Python ints.
order_by_combination_current = pd.DataFrame(
    [
        [combination, *static_product_order]
        for combination in df_train['combination'].unique()
    ],
    columns=['combination', 'product_1', 'product_2', 'product_3', 'product_4', 'product_5'],
    dtype=object,
)

In [13]:
# Nested mapping {combination: {product_<rank>: product}}, built from the
# table sorted by combination label.
current_sorted = order_by_combination_current.sort_values('combination')
methodCurrent = current_sorted.set_index('combination').to_dict(orient='index')

In [14]:
# Write the baseline ranking to disk as JSON.
with open('methodCurrentResults.json', mode='w') as out_file:
    json.dump(methodCurrent, out_file)

Proposed Method 1 - disregard selection frequency (explore high-performing products)

In [15]:
# Proposed Method 1: within each combination, rank products by their mean
# `relevance_score`, regardless of how often each product appears.
top_n = 5  # number of ranked slots: product_1 .. product_5

score_by_combination_1 = []  # (combination, per-product score table) pairs
ranking_rows_1 = []

for combination in df_train['combination'].unique():
    product_scores = (
        df_train[df_train['combination'] == combination]
        .groupby('product')['relevance_score']
        .mean()
        .sort_values(ascending=False)
        .reset_index()
    )
    score_by_combination_1.append((combination, product_scores))

    # Take the best `top_n` products. A combination with fewer products is
    # padded with None instead of failing on a missing row label.
    top_products = product_scores['product'].head(top_n).tolist()
    top_products += [None] * (top_n - len(top_products))
    ranking_rows_1.append([combination, *top_products])

# Build the ranking table once rather than growing it cell by cell.
# dtype=object keeps products as plain Python ints and None as None.
order_by_combination_1 = pd.DataFrame(
    ranking_rows_1,
    columns=['combination', 'product_1', 'product_2', 'product_3', 'product_4', 'product_5'],
    dtype=object,
)

In [16]:
# Nested mapping {combination: {product_<rank>: product}}, built from the
# Method 1 table sorted by combination label.
method1_sorted = order_by_combination_1.sort_values('combination')
method1 = method1_sorted.set_index('combination').to_dict(orient='index')

In [17]:
# Write the Method 1 ranking to disk as JSON.
with open('method1Results.json', mode='w') as out_file:
    json.dump(method1, out_file)

Proposed Method 2 - consider selection frequency (exploit frequently selected products)

In [18]:
# Proposed Method 2: within each combination, rank products by a score that
# also reflects how many ids have a row for the product.
top_n = 5  # number of ranked slots: product_1 .. product_5

score_by_combination_2 = []  # (combination, per-product score table) pairs
ranking_rows_2 = []

for combination in df_train['combination'].unique():
    # id x product matrix of summed relevance scores. Missing (id, product)
    # pairs are filled with -1 and everything is shifted by +1, so a missing
    # pair contributes 0 and an observed pair contributes its score + 1.
    # The column mean therefore averages over every id in the combination.
    product_scores = (
        df_train[df_train['combination'] == combination]
        .pivot_table(index='id', columns='product', values='relevance_score', aggfunc='sum')
        .fillna(-1)
        .add(1)
        .mean()
        .sort_values(ascending=False)
        .reset_index()
        .rename(columns={0: 'relevance_score'})
    )
    score_by_combination_2.append((combination, product_scores))

    # Take the best `top_n` products. A combination with fewer products is
    # padded with None instead of failing on a missing row label.
    top_products = product_scores['product'].head(top_n).tolist()
    top_products += [None] * (top_n - len(top_products))
    ranking_rows_2.append([combination, *top_products])

# Build the ranking table once rather than growing it cell by cell.
# dtype=object keeps products as plain Python ints and None as None.
order_by_combination_2 = pd.DataFrame(
    ranking_rows_2,
    columns=['combination', 'product_1', 'product_2', 'product_3', 'product_4', 'product_5'],
    dtype=object,
)

In [19]:
# Nested mapping {combination: {product_<rank>: product}}, built from the
# Method 2 table sorted by combination label.
method2_sorted = order_by_combination_2.sort_values('combination')
method2 = method2_sorted.set_index('combination').to_dict(orient='index')

In [20]:
# Write the Method 2 ranking to disk as JSON.
with open('method2Results.json', mode='w') as out_file:
    json.dump(method2, out_file)