In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

import feather
import warnings
import time
import sys
import datetime
import re
import gc

from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Let pandas display up to 500 columns of a frame before truncating.
pd.set_option('display.max_columns', 500)

from scipy.stats import mode
from scipy import stats
from sklearn import preprocessing


from IPython.core.display import display, HTML

In [2]:
# Load the transactions frame from the feather file named 'all_transactions'
# (relative path, resolved against the notebook's working directory).
# NOTE(review): the later cells read 'card_id', 'purchase_date', 'authorized_flag' and
# 'payment_from_end_to_begging' from this frame — confirm the file provides them.
all_transactions = feather.read_dataframe('all_transactions')

In [27]:
# Boolean flag: True for rows whose authorized_flag equals 0.
all_transactions['authorized_no'] = all_transactions['authorized_flag'].eq(0)

In [28]:
# Boolean flag: True for rows whose payment_from_end_to_begging equals 0.
# NOTE(review): presumably 0 denotes the most recent month — confirm against the cell
# that builds payment_from_end_to_begging.
all_transactions['last_months_purchase'] = all_transactions['payment_from_end_to_begging'].eq(0)

In [29]:
# One row per (card_id, payment_from_end_to_begging) pair with:
#   purchase_date        - number of non-null purchase dates in the group
#   authorized_no        - sum of the authorized_no flag
#   authorized_flag      - sum of authorized_flag
#   last_months_purchase - sum of the last_months_purchase flag
purch_per_months = (
    all_transactions
    .groupby(['card_id', 'payment_from_end_to_begging'])
    .agg({
        'purchase_date': 'count',
        'authorized_no': 'sum',
        'authorized_flag': 'sum',
        'last_months_purchase': 'sum',
    })
)

In [34]:
# Collapse the per-group rows to one row per card:
#   purchase_date        - sum of the per-group purchase counts
#   last_months_purchase - largest per-group value of the last_months_purchase total
last_months_purchase_percent = (
    purch_per_months
    .groupby(['card_id'])
    .agg({'purchase_date': 'sum', 'last_months_purchase': 'max'})
)

In [42]:
# Ratio of the card's last_months_purchase value to its total purchase count.
last_months_purchase_percent['last_month_percent'] = (
    last_months_purchase_percent['last_months_purchase']
    .div(last_months_purchase_percent['purchase_date'])
)

In [44]:
# The total purchase count was only needed as the denominator of the ratio; remove it in place.
last_months_purchase_percent.drop('purchase_date', axis=1, inplace=True)


Unnamed: 0_level_0,last_months_purchase,last_month_percent
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1
C_ID_00007093c1,2.0,0.013245
C_ID_0001238066,9.0,0.060403
C_ID_0001506ef0,2.0,0.029412
C_ID_0001793786,10.0,0.040486
C_ID_000183fdda,3.0,0.019355
C_ID_00024e244b,2.0,0.028571
C_ID_0002709b5a,1.0,0.013158
C_ID_00027503e2,4.0,0.095238
C_ID_000298032a,1.0,0.032258
C_ID_0002ba3c2e,1.0,0.013699


In [55]:
# Move the index back into the frame as a regular column so it can serve as a merge key.
last_months_purchase_percent = last_months_purchase_percent.reset_index()

In [38]:
# Per-card summary statistics over the per-group rows of purch_per_months.
# The three count columns share the same five statistics; purchase_date additionally
# gets the overall sum, and last_months_purchase only its maximum.
per_month_stats = ['max', 'mean', 'min', 'std', 'skew']
purchases_description = (
    purch_per_months
    .groupby(['card_id'])
    .agg({
        'purchase_date': per_month_stats + ['sum'],
        'authorized_no': list(per_month_stats),
        'authorized_flag': list(per_month_stats),
        'last_months_purchase': 'max',
    })
)

In [46]:
# Maps each aggregated source column to the prefix used for its flattened feature names.
col_dict = dict(
    purchase_date='all_purchases',
    authorized_no='non_author_purchases',
    authorized_flag='author_purchases',
    last_months_purchase='last_months_purchase',
)

In [49]:
# Flatten each two-part column label (source column, statistic) into a single
# '<prefix>_<statistic>' name, where the prefix comes from col_dict.
purchases_description.columns = [
    f'{col_dict[label[0]]}_{label[1]}'
    for label in purchases_description.columns
]

In [50]:
# Preview the first five rows of the flattened per-card statistics.
purchases_description.head(n=5)

Unnamed: 0_level_0,all_purchases_max,all_purchases_mean,all_purchases_min,all_purchases_std,all_purchases_skew,all_purchases_sum,non_author_purchases_max,non_author_purchases_mean,non_author_purchases_min,non_author_purchases_std,non_author_purchases_skew,author_purchases_max,author_purchases_mean,author_purchases_min,author_purchases_std,author_purchases_skew,last_months_purchase_max
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
C_ID_00007093c1,19,10.785714,2,5.176553,-0.117048,151,6.0,2.5,0.0,1.82925,0.351881,14,8.285714,2,3.851644,0.019424,2.0
C_ID_0001238066,37,18.625,2,11.147549,0.125129,149,2.0,0.375,0.0,0.744024,1.95103,35,18.25,2,10.620062,-0.013716,9.0
C_ID_0001506ef0,18,4.857143,1,4.435125,2.196911,68,2.0,0.285714,0.0,0.61125,2.165284,17,4.571429,1,4.237457,2.166954,2.0
C_ID_0001793786,38,20.583333,2,13.069453,-0.088327,247,8.0,2.25,0.0,2.70101,0.878176,35,18.333333,2,11.324417,-0.118318,10.0
C_ID_000183fdda,28,17.222222,3,10.449615,-0.263454,155,6.0,0.777778,0.0,1.986063,2.854399,28,16.444444,0,11.314936,-0.369485,3.0


In [53]:
# Express every per-card max/mean/min statistic as a share of the card's total purchase
# count ('all_purchases_sum'), stored in a sibling '<name>_percent' column.
# The pattern is anchored to the end of the column name so that re-running this cell does
# not also match the '*_percent' columns it created earlier (which would otherwise yield
# '*_percent_percent' columns). On a first run the selected columns are unchanged.
# The list is built before the loop because the loop adds columns to the frame.
stat_cols = [col for col in purchases_description.columns if re.search(r'_(max|mean|min)$', col)]
for stat_col in stat_cols:
    purchases_description[stat_col + '_percent'] = (
        purchases_description[stat_col] / purchases_description['all_purchases_sum']
    )

In [56]:
# Move the index back into the frame as a regular column so it can serve as a merge key.
purchases_description = purchases_description.reset_index()

In [58]:
# Left-join the last-month columns onto the per-card statistics by card_id, keeping
# every row of purchases_description.
# NOTE(review): in the displayed sample the joined 'last_months_purchase' and
# 'last_month_percent' columns equal 'last_months_purchase_max' and
# 'last_months_purchase_max_percent' — check whether both pairs are needed downstream.
purchases_description = purchases_description.merge(
    last_months_purchase_percent,
    how='left',
    on='card_id',
)

In [59]:
# Preview the first five rows after the merge.
purchases_description.head(n=5)

Unnamed: 0,card_id,all_purchases_max,all_purchases_mean,all_purchases_min,all_purchases_std,all_purchases_skew,all_purchases_sum,non_author_purchases_max,non_author_purchases_mean,non_author_purchases_min,non_author_purchases_std,non_author_purchases_skew,author_purchases_max,author_purchases_mean,author_purchases_min,author_purchases_std,author_purchases_skew,last_months_purchase_max,all_purchases_max_percent,all_purchases_mean_percent,all_purchases_min_percent,non_author_purchases_max_percent,non_author_purchases_mean_percent,non_author_purchases_min_percent,author_purchases_max_percent,author_purchases_mean_percent,author_purchases_min_percent,last_months_purchase_max_percent,last_months_purchase,last_month_percent
0,C_ID_00007093c1,19,10.785714,2,5.176553,-0.117048,151,6.0,2.5,0.0,1.82925,0.351881,14,8.285714,2,3.851644,0.019424,2.0,0.125828,0.071429,0.013245,0.039735,0.016556,0.0,0.092715,0.054872,0.013245,0.013245,2.0,0.013245
1,C_ID_0001238066,37,18.625,2,11.147549,0.125129,149,2.0,0.375,0.0,0.744024,1.95103,35,18.25,2,10.620062,-0.013716,9.0,0.248322,0.125,0.013423,0.013423,0.002517,0.0,0.234899,0.122483,0.013423,0.060403,9.0,0.060403
2,C_ID_0001506ef0,18,4.857143,1,4.435125,2.196911,68,2.0,0.285714,0.0,0.61125,2.165284,17,4.571429,1,4.237457,2.166954,2.0,0.264706,0.071429,0.014706,0.029412,0.004202,0.0,0.25,0.067227,0.014706,0.029412,2.0,0.029412
3,C_ID_0001793786,38,20.583333,2,13.069453,-0.088327,247,8.0,2.25,0.0,2.70101,0.878176,35,18.333333,2,11.324417,-0.118318,10.0,0.153846,0.083333,0.008097,0.032389,0.009109,0.0,0.1417,0.074224,0.008097,0.040486,10.0,0.040486
4,C_ID_000183fdda,28,17.222222,3,10.449615,-0.263454,155,6.0,0.777778,0.0,1.986063,2.854399,28,16.444444,0,11.314936,-0.369485,3.0,0.180645,0.111111,0.019355,0.03871,0.005018,0.0,0.180645,0.106093,0.0,0.019355,3.0,0.019355


In [71]:
# Sanity check on one sample id: length of the tail of the id that starts at the first
# character in 1-9 or a-z (for this sample, everything after 'C_ID_000').
# Inside a character class '|' is a literal pipe rather than alternation, so it is left
# out of the pattern; the unused capture groups are dropped as well.
sample_card_id_length = len(re.search(r'[1-9a-z].*', 'C_ID_000183fdda').group(0))
sample_card_id_length

7

In [79]:
# Length of each card_id's tail, starting at the first character in 1-9 or a-z
# (e.g. 'C_ID_000183fdda' -> '183fdda' -> 7).
# Vectorised with .str.extract instead of a row-wise apply. Ids that contain no such
# character get 0 rather than raising AttributeError on a failed match.
# The pattern omits '|', which inside a character class is a literal pipe, not alternation.
# NOTE: the column keeps its original spelling ('lenght') so that readers of the saved
# file keep working.
purchases_description['card_id_lenght'] = (
    purchases_description['card_id']
    .str.extract(r'([1-9a-z].*)', expand=False)
    .str.len()
    .fillna(0)
    .astype('int64')
)

In [81]:
# Persist the per-card feature frame to the feather file 'all_purchases_and_last_purch'
# (relative path, resolved against the notebook's working directory).
feather.write_dataframe(purchases_description,'all_purchases_and_last_purch')