In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

import feather
import warnings
import time
import sys
import datetime
import re
import gc

from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', 500)

from scipy.stats import mode
from scipy import stats
from sklearn import preprocessing


from IPython.core.display import display, HTML

In [2]:
# Load the transactions table from the feather file named 'all_transactions'
# (the path is relative to the notebook's working directory).
all_transactions = feather.read_dataframe('all_transactions')

In [3]:
# Purchase amount multiplied by the boolean mask `installments == 0`:
# rows with any other installments value are multiplied by 0.
all_transactions['non_installments_purchases'] = all_transactions['purchase_amount'].mul(
    all_transactions['installments'].eq(0)
)

In [4]:
# Remove the 'cat2_dum_nan' column so the 'cat<digit>' column selections below do not
# pick it up. Rebinding the name (instead of `inplace=True`) together with
# errors='ignore' keeps this cell re-runnable: the in-place form raised KeyError on a
# second execution because the column was already gone.
all_transactions = all_transactions.drop(['cat2_dum_nan'], axis=1, errors='ignore')

In [5]:
def modeplus(x):
    """Return the first field (the mode) of the result of `scipy.stats.mode(x)`."""
    mode_result = mode(x)
    return mode_result[0]
def non_zero_mean(x):
    """Return the mean of the elements that `get_non_zero` keeps from `x`."""
    kept = get_non_zero(x)
    return kept.mean()
def non_zero_percent(x):
    """Return the fraction (0..1, not a percentage) of elements of `x` that are truthy
    after a cast to bool."""
    as_flags = x.astype(bool)
    return as_flags.mean()
def count_non_zeros(x):
    """Return how many elements of `x` are truthy after a cast to bool.

    The function is passed to `groupby(...).agg(...)` in this notebook, where its
    name becomes part of the resulting column labels
    (e.g. `installments_count_non_zeros`), so it must stay a named `def`.
    """
    as_flags = x.astype(bool)
    return as_flags.sum()
def active_days(x):
    """Return the number of distinct values in `x` divided by its value range.

    The range is computed as `x.max() - x.min()`. The original used `Series.ptp()`,
    which was deprecated in pandas 0.24 and removed in pandas 1.0; max - min gives
    the same number on every pandas version.

    Parameters
    ----------
    x : pandas.Series
        Numeric values (needs `nunique`, `max` and `min`).

    Returns
    -------
    The ratio `nunique / (max - min)`. A zero range is not special-cased: the
    division is left to numpy, exactly as before.
    """
    value_range = x.max() - x.min()
    return x.nunique() / value_range

def top_quarter_quantile(x):
    """Return the 0.25 quantile of `x`, taking the lower neighbouring value instead
    of interpolating.

    NOTE(review): despite "top" in the name this is the first quartile — confirm
    the naming is intended.
    """
    return x.quantile(0.25, interpolation='lower')
def bottom_quarter_quantile(x):
    """Return the 0.75 quantile of `x`, taking the higher neighbouring value instead
    of interpolating.

    NOTE(review): despite "bottom" in the name this is the third quartile — confirm
    the naming is intended.
    """
    return x.quantile(0.75, interpolation='higher')

def table_ends(x):
    """Encode which of two marker values occur in `x` as a small integer.

    Adds 1 when some element equals -15 and 2 when some element equals 0, so the
    result is one of 0, 1, 2 or 3.
    """
    has_minus_fifteen = (x == -15).max()
    has_zero = (x == 0).max()
    return np.sum([has_minus_fifteen, has_zero * 2])


In [6]:
def get_non_zero(x: pd.core.series.Series) -> pd.core.series.Series:
    """Return the non-zero elements of `x`, labelled by their position in `x`.

    The original relied on `Series.nonzero()`, which was deprecated in pandas 0.24
    and removed in pandas 1.0. The positions are now taken from the underlying
    array, which works on old and new pandas alike.

    Parameters
    ----------
    x : pandas.Series
        Input values; its own index is discarded.

    Returns
    -------
    pandas.Series
        The elements of `x` that are non-zero, in original order, indexed by their
        0-based position in `x`.
    """
    by_position = x.reset_index(drop=True)
    non_zero_positions = x.values.nonzero()[0]
    # `.iloc` makes the positional selection explicit.
    return by_position.iloc[non_zero_positions]

def count_unique(x: pd.core.series.Series) -> pd.core.series.Series:
    """Count the occurrences of each distinct value in `x`, in reversed order.

    `np.unique` reports the counts in the order of its sorted unique values; the
    counts are returned reversed. Note that the returned object is the numpy array
    produced by `np.unique`, not a Series, regardless of the annotation.
    """
    _, counts = np.unique(x, return_counts=True)
    return counts[::-1]

In [7]:
"""payments per categories"""
# Select every column whose name starts with 'cat<digit>' and scale each row by that
# row's purchase amount (positional multiplication through `.values`).
# The pattern is written as a raw string: '\d' inside a plain literal is an invalid
# escape sequence, which newer Python versions warn about. `filter` applies the
# regex with search semantics, hence the explicit '^' anchor to keep the
# start-of-name matching of the original `re.match`.
cats_purchases = all_transactions.filter(regex=r'^cat\d').mul(
    all_transactions.purchase_amount.values, axis=0
)

def reformat(x):
    """Split a category column name into its 'cat<digit>' prefix and a one-character code.

    The name has to start with 'cat' plus one digit, followed by an underscore,
    at least one further character, another underscore and the code character
    (illustrative shape: 'cat1_dum_A' gives ('cat1', 'A')).

    The pattern is now a raw string (the plain literal contained the invalid
    escape sequence backslash-d) and is evaluated once instead of twice.

    Returns
    -------
    tuple of (prefix, code)

    Raises
    ------
    AttributeError
        When `x` does not match the pattern (unchanged from the original).
    """
    return re.match(r'(cat\d)_.+_(.)', x).groups()
# Relabel every column as '<prefix>_<code>_purchases' using the pair returned by
# `reformat`, then attach the card identifier of each transaction row.
cats_purchases.columns = [
    f'{prefix}_{code}_purchases'
    for prefix, code in map(reformat, cats_purchases.columns.values)
]
cats_purchases['card_id'] = all_transactions['card_id']

In [8]:
"""installment per categories"""
# Select every column whose name starts with 'cat<digit>' and scale each row by that
# row's installments value (positional multiplication through `.values`).
# The pattern is written as a raw string: '\d' inside a plain literal is an invalid
# escape sequence, which newer Python versions warn about. `filter` applies the
# regex with search semantics, hence the explicit '^' anchor to keep the
# start-of-name matching of the original `re.match`.
cats_installment = all_transactions.filter(regex=r'^cat\d').mul(
    all_transactions.installments.values, axis=0
)

def reformat(x):
    """Split a category column name into its 'cat<digit>' prefix and a one-character code.

    This repeats the helper of the same name defined in the purchases cell above;
    it is kept here so that this cell does not depend on that one having run.

    The name has to start with 'cat' plus one digit, followed by an underscore,
    at least one further character, another underscore and the code character
    (illustrative shape: 'cat1_dum_A' gives ('cat1', 'A')).

    The pattern is now a raw string (the plain literal contained the invalid
    escape sequence backslash-d) and is evaluated once instead of twice.

    Returns
    -------
    tuple of (prefix, code)

    Raises
    ------
    AttributeError
        When `x` does not match the pattern (unchanged from the original).
    """
    return re.match(r'(cat\d)_.+_(.)', x).groups()
# Relabel every column as '<prefix>_<code>_installments' using the pair returned by
# `reformat`, then attach the card identifier of each transaction row.
cats_installment.columns = [
    f'{prefix}_{code}_installments'
    for prefix, code in map(reformat, cats_installment.columns.values)
]
cats_installment['card_id'] = all_transactions['card_id']

In [9]:
"""from end to finnish payments"""
# One indicator column per distinct `payment_from_end_to_begging` value, with each
# row scaled by that row's purchase amount (positional, through `.values`).
from_end_to_finnish_payments = pd.get_dummies(
    all_transactions['payment_from_end_to_begging']
).mul(all_transactions['purchase_amount'].values, axis=0)

# Label the columns '<value>_month_payments' and attach the card identifier.
from_end_to_finnish_payments.columns = [
    f'{label}_month_payments' for label in from_end_to_finnish_payments.columns
]
from_end_to_finnish_payments['card_id'] = all_transactions['card_id']

In [10]:
"""from end to finnish installments"""
# One indicator column per distinct `payment_from_end_to_begging` value, with each
# row scaled by that row's installments value (positional, through `.values`).
from_end_to_finnish_installments = pd.get_dummies(
    all_transactions['payment_from_end_to_begging']
).mul(all_transactions['installments'].values, axis=0)

# Label the columns '<value>_month_installments' and attach the card identifier.
from_end_to_finnish_installments.columns = [
    f'{label}_month_installments' for label in from_end_to_finnish_installments.columns
]
from_end_to_finnish_installments['card_id'] = all_transactions['card_id']

In [11]:
# Per-card totals: summed purchase amount, summed installments, and the number of
# rows with a non-zero installments value. These serve as divisors further down.
summed_values = (
    all_transactions
    .groupby(['card_id'])
    .agg({'purchase_amount': 'sum', 'installments': ['sum', count_non_zeros]})
)
# Collapse the two-level (column, statistic) labels into 'column_statistic'.
summed_values.columns = ['_'.join(level_pair) for level_pair in summed_values.columns]

In [12]:
# Drop the reference to the raw transactions frame; the cells below do not use it.
# After this line the cells above cannot be re-run without reloading the data first.
del all_transactions

In [13]:
# Aggregation spec for the per-category purchase columns: sum, mean and std for
# every column except the grouping key `card_id`.
aggP = {
    column: ['sum', 'mean', 'std']
    for column in cats_purchases.columns
    if column != 'card_id'
}

In [14]:
# Aggregate the per-category purchase columns per card according to `aggP`.
cats_purchases_agged = cats_purchases.groupby(by=['card_id']).aggregate(aggP)

In [15]:
# Aggregation spec for the per-category installment columns: sum, mean, std and the
# non-zero count for every column except the grouping key `card_id`.
aggI = {
    column: ['sum', 'mean', 'std', count_non_zeros]
    for column in cats_installment.columns
    if column != 'card_id'
}

In [16]:
# Aggregate the per-category installment columns per card according to `aggI`.
cats_installments_agged = cats_installment.groupby(by=['card_id']).aggregate(aggI)

In [None]:
# Collapse the two-level (column, statistic) labels into 'column_statistic'.
# Only tuple labels are joined: the same flattening is repeated in a later cell, and
# calling '_'.join on an already-flat string would put '_' between its characters
# and break the '_sum' / '_zeros' name lookups further down.
cats_installments_agged.columns = [
    '_'.join(label) if isinstance(label, tuple) else label
    for label in cats_installments_agged.columns.values
]
cats_purchases_agged.columns = [
    '_'.join(label) if isinstance(label, tuple) else label
    for label in cats_purchases_agged.columns.values
]

In [17]:
# Aggregation specs for the per-month frames; the grouping key `card_id` is left out.
# Installments additionally get the non-zero count.
aggIm = {
    column: ['sum', 'mean', 'std', count_non_zeros]
    for column in from_end_to_finnish_installments.columns
    if column != 'card_id'
}
aggPm = {
    column: ['sum', 'mean', 'std']
    for column in from_end_to_finnish_payments.columns
    if column != 'card_id'
}

In [18]:
# Per-card aggregates of the per-month payment and installment columns.
fetfp = from_end_to_finnish_payments.groupby(by=['card_id']).aggregate(aggPm)
fetfi = from_end_to_finnish_installments.groupby(by=['card_id']).aggregate(aggIm)

# Collapse the two-level (column, statistic) labels into 'column_statistic'.
fetfp.columns = ['_'.join(level_pair) for level_pair in fetfp.columns]
fetfi.columns = ['_'.join(level_pair) for level_pair in fetfi.columns]

In [33]:
# Collapse the two-level (column, statistic) labels into 'column_statistic'.
# Only tuple labels are joined: an identical flattening cell appears earlier in the
# notebook, and calling '_'.join on an already-flat string would put '_' between its
# characters and break the '_sum' / '_zeros' name lookups further down.
cats_installments_agged.columns = [
    '_'.join(label) if isinstance(label, tuple) else label
    for label in cats_installments_agged.columns.values
]
cats_purchases_agged.columns = [
    '_'.join(label) if isinstance(label, tuple) else label
    for label in cats_purchases_agged.columns.values
]

In [34]:
# Preview the first rows of the per-card, per-month payment aggregates.
fetfp.head()

Unnamed: 0_level_0,0_month_payments_sum,0_month_payments_mean,0_month_payments_std,1_month_payments_sum,1_month_payments_mean,1_month_payments_std,2_month_payments_sum,2_month_payments_mean,2_month_payments_std,3_month_payments_sum,3_month_payments_mean,3_month_payments_std,4_month_payments_sum,4_month_payments_mean,4_month_payments_std,5_month_payments_sum,5_month_payments_mean,5_month_payments_std,6_month_payments_sum,6_month_payments_mean,6_month_payments_std,7_month_payments_sum,7_month_payments_mean,7_month_payments_std,8_month_payments_sum,8_month_payments_mean,8_month_payments_std,9_month_payments_sum,9_month_payments_mean,9_month_payments_std,10_month_payments_sum,10_month_payments_mean,10_month_payments_std,11_month_payments_sum,11_month_payments_mean,11_month_payments_std,12_month_payments_sum,12_month_payments_mean,12_month_payments_std,13_month_payments_sum,13_month_payments_mean,13_month_payments_std,14_month_payments_sum,14_month_payments_mean,14_month_payments_std,15_month_payments_sum,15_month_payments_mean,15_month_payments_std
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
C_ID_00007093c1,46.71476,0.309369,2.679171,0.0,0.0,0.0,297.496623,1.970176,6.747591,243.139529,1.610196,6.073657,300.201996,1.988093,6.807766,149.31421,0.988836,4.908215,405.769653,2.687216,7.574094,120.537067,0.798259,4.33302,445.971732,2.953455,8.604544,357.615352,2.368314,7.162694,471.313234,3.12128,8.285932,265.747517,1.759917,6.703376,294.913115,1.953067,6.695223,242.87371,1.608435,6.063382,106.655856,0.70633,4.312605,0.0,0.0,0.0
C_ID_0001238066,219.052161,1.470149,5.82355,412.447285,2.768103,7.767903,571.05992,3.832617,8.784594,644.18107,4.323363,9.53703,895.571226,6.010545,10.518479,545.733015,3.662638,8.60958,262.010594,1.75846,6.256089,46.972165,0.315249,2.713308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C_ID_0001506ef0,45.526464,0.669507,3.874646,159.532238,2.346062,7.984553,178.232151,2.621061,7.999497,429.287817,6.313056,10.681622,174.084533,2.560067,7.738458,81.278011,1.195265,5.738222,56.639771,0.832938,4.912572,57.826866,0.850395,5.039277,60.343807,0.887409,5.293081,67.966476,0.999507,4.687078,22.981717,0.337966,2.786943,0.0,0.0,0.0,184.552923,2.714014,7.488172,115.426701,1.697451,6.071005,45.828196,0.673944,3.900456,0.0,0.0,0.0
C_ID_0001793786,280.954648,1.137468,5.708537,646.749151,2.618418,9.111897,545.020247,2.20656,7.597485,1080.611318,4.374945,11.071592,911.561478,3.690532,9.54233,863.340701,3.495306,9.853186,1050.652491,4.253654,10.896715,140.917055,0.570514,4.562504,792.879604,3.210039,8.876783,471.539323,1.909066,7.695134,191.474695,0.775201,5.10861,64.14296,0.259688,2.971508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C_ID_000183fdda,75.200389,0.485164,3.485466,188.891828,1.218657,5.244866,683.610376,4.41039,9.567334,764.188093,4.930246,11.129533,650.611966,4.197497,9.5088,236.402372,1.525177,6.466138,598.22637,3.859525,8.856074,520.849843,3.360322,8.309708,177.731598,1.146655,5.779884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Put the per-category and per-month aggregates side by side, keyed on card_id.
# A left join keeps every card of the per-category frame.
purchases_CatsAndEts = cats_purchases_agged.merge(fetfp, how='left', on='card_id')
installments_CatsAndEts = cats_installments_agged.merge(fetfi, how='left', on='card_id')

In [39]:
# Disabled checkpoint: uncommenting these lines would write the two merged frames
# to the feather files 'Purch_CatsAndEts' and 'Inst_CatsAndEts'.
#feather.write_dataframe(purchases_CatsAndEts,'Purch_CatsAndEts')
#feather.write_dataframe(installments_CatsAndEts,'Inst_CatsAndEts')

In [50]:
# Names of the columns to normalise below: every label containing '_sum', and for
# installments also every label containing '_zeros' (the non-zero counts).
purch_sum = [name for name in purchases_CatsAndEts.columns if '_sum' in name]
inst_sum = [name for name in installments_CatsAndEts.columns if '_sum' in name]
inst_nzeros = [name for name in installments_CatsAndEts.columns if '_zeros' in name]

In [54]:
# Preview the per-card totals that are used as divisors in the cells below.
summed_values.head()

Unnamed: 0_level_0,purchase_amount_sum,installments_sum,installments_count_non_zeros
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C_ID_00007093c1,3748.264355,194,151
C_ID_0001238066,3597.027435,244,145
C_ID_0001506ef0,1679.507671,1,1
C_ID_0001793786,7039.843671,5,5
C_ID_000183fdda,3895.712835,285,150


In [56]:
# Turn every '_sum' purchase column into a share of the card's total purchase amount.
# Both operands are plain arrays, so rows are paired by position: the two frames must
# list the cards in the same order. Re-running this cell divides a second time.
card_purchase_totals = summed_values['purchase_amount_sum'].values
for col in purch_sum:
    purchases_CatsAndEts[col] = purchases_CatsAndEts[col].values / card_purchase_totals

In [57]:
# Turn every '_sum' installment column into a share of the card's total installments.
# The divisor is a plain array, so rows are paired by position: the two frames must
# list the cards in the same order. Re-running this cell divides a second time.
# NOTE(review): a card whose installments total is 0 yields inf/NaN here — confirm
# whether that can occur in the data.
card_installment_totals = summed_values['installments_sum'].values
for col in inst_sum:
    installments_CatsAndEts[col] = installments_CatsAndEts[col].div(card_installment_totals)

In [58]:
# Turn every '_zeros' column (non-zero counts) into a share of the card's total
# number of non-zero installment rows.
# The divisor is a plain array, so rows are paired by position: the two frames must
# list the cards in the same order. Re-running this cell divides a second time.
card_non_zero_counts = summed_values['installments_count_non_zeros'].values
for col in inst_nzeros:
    installments_CatsAndEts[col] = installments_CatsAndEts[col].div(card_non_zero_counts)

In [59]:
# Combine the normalised purchase and installment features per card (left join on
# card_id keeps every card of the purchase frame).
purch_and_inst_formatted = purchases_CatsAndEts.merge(
    installments_CatsAndEts, how='left', on='card_id'
)

In [61]:
# Replace every missing value in the feature table with 0.
purch_and_inst_formatted = purch_and_inst_formatted.fillna(0)

In [62]:
# Persist the final feature table to the feather file 'PandIformatted'.
# NOTE(review): the frame appears to be indexed by card_id at this point — confirm
# that the feather writer in use keeps the index, otherwise reset it before writing.
feather.write_dataframe(purch_and_inst_formatted,'PandIformatted')