# In [None]:
import sys
import pickle
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder

# Parameters used when scoring and ranking candidate pairs.
_BATCH_SIZE = 200                    # bank users scored per model call
_TOP_K = 100                         # candidates kept per bank user
_NO_MATCH_RTK = 0.                   # placeholder candidate id appended for every bank user
_NO_MATCH_SCORE = 0.8                # fixed score assigned to the placeholder candidate
_NOT_RUB_CURRENCY_CODES = [50, 60]   # currency_rk values flagged as 'not_rub'


def _add_time_parts(frame, ts_col):
    """Parse ``ts_col`` to datetime in place and add hour/dow/date/hour_minute columns."""
    frame[ts_col] = pd.to_datetime(frame[ts_col])
    stamps = frame[ts_col].dt
    frame['hour'] = stamps.hour
    frame['dow'] = stamps.dayofweek
    frame['date'] = stamps.date
    frame['hour_minute'] = stamps.strftime('%H:%M')
    return frame


def _minutes_per_day(frame, suffix):
    """Per user and day of week: mean number of distinct HH:MM values per calendar day."""
    per_day = frame.groupby(['user_id', 'date', 'dow']).agg({'hour_minute': ['nunique']})
    per_day.columns = ['nunique']
    table = pd.pivot_table(per_day, index='user_id', columns='dow',
                           values='nunique', aggfunc='mean').fillna(0)
    table.columns = ['avg_uniq_minutes_per_day_' + str(col) + suffix for col in table.columns]
    return table


def _avg_days_between(frame, key_col, suffix):
    """Per user and ``key_col`` value: mean day gap between a row and the previous one.

    "Previous" means the preceding row of the same (user_id, key_col) group in
    the frame's current row order; the frame is not sorted here.
    NOTE(review): gaps are only meaningful if the input is already in
    chronological order -- confirm against the data files.
    The input frame is left untouched; the work happens on a narrow copy.
    """
    visits = frame[['user_id', key_col, 'date']].copy()
    visits['date'] = pd.to_datetime(visits['date'])
    visits['date_shift'] = visits.groupby(['user_id', key_col])['date'].shift(1)
    visits['date_diff'] = (visits['date'] - visits['date_shift']).dt.days
    # The first row of every group has no predecessor and is dropped by dropna().
    visits = visits.drop_duplicates().dropna()
    table = pd.pivot_table(visits, index='user_id', columns=key_col,
                           values='date_diff', aggfunc='mean').fillna(0)
    table.columns = [str(col) + '_avg_date_diff' + suffix for col in table.columns]
    return table


def _share_by_bucket(frame, bucket_col, value_col, prefix):
    """Per user: fraction of non-null ``value_col`` rows falling in each ``bucket_col`` value."""
    counts = pd.pivot_table(frame, index='user_id', columns=bucket_col,
                            values=value_col, aggfunc='count').fillna(0)
    shares = counts.div(counts.sum(axis=1), axis=0)
    shares.columns = [prefix + str(col) for col in shares.columns]
    return shares


def _top_candidates(scored, bank_ids):
    """Return ``[bank, rtk_list]`` rows holding the best-scored candidates per bank id.

    ``scored`` has columns bank / rtk / predicts. A placeholder candidate
    (``_NO_MATCH_RTK`` with score ``_NO_MATCH_SCORE``) is appended for every
    bank id before ranking, so it outranks any real candidate scored below it.
    NOTE(review): presumably the placeholder is the "no match" answer expected
    by the consumer of the output file -- confirm.
    """
    placeholder = pd.DataFrame(bank_ids, columns=['bank'])
    placeholder['rtk'] = _NO_MATCH_RTK
    placeholder['predicts'] = _NO_MATCH_SCORE

    ranked = pd.concat((scored, placeholder))
    ranked = ranked.sort_values(by=['bank', 'predicts'], ascending=False).reset_index(drop=True)
    ranked = ranked.pivot_table(index='bank', values='rtk', aggfunc=list)
    ranked['rtk'] = ranked['rtk'].apply(lambda ids: ids[:_TOP_K])
    ranked['bank'] = ranked.index
    return ranked[['bank', 'rtk']].values


def main():
    """Build features, score every bank/clickstream user pair and save the ranking.

    Command line: ``<data_dir> <output_path>``. ``data_dir`` must contain
    ``transactions.csv`` and ``clickstream.csv``. ``tr_vectors.csv``,
    ``cl_vectors.csv`` and ``data_fusion_matching_6.cbm`` are read from the
    current working directory. The result, an object array of
    ``[bank_id, candidate_list]`` rows, is written with ``np.savez``.

    Raises:
        SystemExit: if the number of command-line arguments is not two.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        raise SystemExit('usage: <script> <data_dir> <output_path>')
    data, output_path = cli_args

    # ---- bank transaction features (indexed by transactions user_id) ----
    transactions = pd.read_csv(f'{data}/transactions.csv')
    _add_time_parts(transactions, 'transaction_dttm')
    transactions['month'] = transactions['transaction_dttm'].dt.month

    # 1 if the user has at least one transaction in a flagged currency, else 0.
    transactions['not_rub'] = (
        transactions['currency_rk'].isin(_NOT_RUB_CURRENCY_CODES).astype(int)
    )
    foreign_currency = transactions.groupby('user_id')['not_rub'].max().to_frame()

    transactions_minutes = _minutes_per_day(transactions, '_tr')

    # Monthly sum/mean/max/min of the amount, then averaged over the user's months.
    monthly = transactions.groupby(['user_id', 'month']).agg(
        {'transaction_amt': ['sum', 'mean', 'max', 'min']}
    )
    transactions_agg = monthly.groupby(['user_id']).agg('mean')
    transactions_agg.columns = [
        '_'.join(col).strip() + '_by_month' for col in transactions_agg.columns.values
    ]

    transactions_calc = _avg_days_between(transactions, 'mcc_code', '_tr')
    tr_h = _share_by_bucket(transactions, 'hour', 'transaction_amt', 'prc_h_tr_')
    tr_dow = _share_by_bucket(transactions, 'dow', 'transaction_amt', 'prc_dow_tr_')

    # Sum / mean / count of the amount per mcc_code; columns are (aggfunc, value, mcc_code).
    bankclient_embed = transactions.pivot_table(
        index='user_id',
        values=['transaction_amt'],
        columns=['mcc_code'],
        aggfunc=['sum', 'mean', 'count'],
    ).fillna(0)
    bankclient_embed.columns = [f'tr-{col[0]}-{col[2]}' for col in bankclient_embed.columns]

    # ---- clickstream features (indexed by clickstream user_id) ----
    clickstream = pd.read_csv(f'{data}/clickstream.csv')
    _add_time_parts(clickstream, 'timestamp')

    clickstream_unq_device = clickstream.groupby(['user_id']).agg({'new_uid': ['nunique']})
    clickstream_unq_device.columns = ['cnt_unq_device_cl']

    clickstream_minutes = _minutes_per_day(clickstream, '_cl')
    clickstream_calc = _avg_days_between(clickstream, 'cat_id', '_cl')
    cl_h = _share_by_bucket(clickstream, 'hour', 'timestamp', 'prc_h_cl_')
    cl_dow = _share_by_bucket(clickstream, 'dow', 'timestamp', 'prc_dow_cl_')

    # Event count per cat_id; columns are (aggfunc, value, cat_id).
    clickstream_embed = clickstream.pivot_table(
        index='user_id',
        values=['timestamp'],
        columns=['cat_id'],
        aggfunc=['count'],
    ).fillna(0)
    clickstream_embed.columns = [f'cl-{col[0]}-{col[2]}' for col in clickstream_embed.columns]

    # The raw frames are no longer referenced, so this really releases them.
    del transactions, clickstream

    # Precomputed per-user vectors, read from the working directory.
    tr_vectors_df = pd.read_csv('tr_vectors.csv').set_index('user_id')
    cl_vectors_df = pd.read_csv('cl_vectors.csv').set_index('user_id')

    list_of_rtk = list(clickstream_embed.index.unique())
    list_of_bank = list(bankclient_embed.index.unique())

    # Every bank user is paired with every clickstream user (expanded per batch).
    submission = pd.DataFrame(list_of_bank, columns=['bank'])
    submission['rtk'] = submission['bank'].apply(lambda x: list_of_rtk)

    model = CatBoostClassifier()
    model.load_model('data_fusion_matching_6.cbm',  format='cbm')
    full_list_of_features = model.feature_names_

    # (table, join key) in merge order; the order is kept fixed because pandas
    # suffixes overlapping column names according to it.
    feature_tables = [
        (bankclient_embed, 'bank'),
        (clickstream_embed, 'rtk'),
        (tr_dow, 'bank'),
        (tr_h, 'bank'),
        (transactions_agg, 'bank'),
        (transactions_minutes, 'bank'),
        (transactions_calc, 'bank'),
        (foreign_currency, 'bank'),
        (cl_dow, 'rtk'),
        (cl_h, 'rtk'),
        (clickstream_minutes, 'rtk'),
        # clickstream_calc is indexed by clickstream user ids, so it is joined on
        # 'rtk' like every other clickstream table (it was joined on 'bank').
        # NOTE(review): confirm the model was trained with the same join key.
        (clickstream_calc, 'rtk'),
        (clickstream_unq_device, 'rtk'),
        (tr_vectors_df, 'bank'),
        (cl_vectors_df, 'rtk'),
    ]

    submission_ready = []
    for start in range(0, len(list_of_bank), _BATCH_SIZE):
        bank_ids = list_of_bank[start:start + _BATCH_SIZE]
        # submission rows are in list_of_bank order, so a positional slice
        # selects exactly this batch.
        pairs = submission.iloc[start:start + _BATCH_SIZE].explode('rtk')
        for table, key in feature_tables:
            pairs = pairs.merge(table, how='left', left_on=key, right_index=True)
        pairs = pairs.fillna(0)

        # Model features absent from the merged frame are supplied as zeros.
        features = pairs.reindex(columns=full_list_of_features, fill_value=0)
        scored = pairs[['bank', 'rtk']].copy()
        scored['predicts'] = model.predict_proba(features)[:, 1]

        submission_ready.extend(_top_candidates(scored, bank_ids))

    submission_final = np.array(submission_ready, dtype=object)

    print(submission_final.shape)
    np.savez(output_path, submission_final)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()