In [18]:
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
import arrow
from collections import Counter, defaultdict, namedtuple
from itertools import chain
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.base import TransformerMixin, BaseEstimator

In [19]:
class Selector(BaseEstimator, TransformerMixin):

    """
    Pick a subset of the columns of data frame X for further use within a pipeline.

    The columns can be given in two ways:

    * at construction time, ``Selector(col=['card_id', 'city_id'])`` -- the form
      to use inside an sklearn Pipeline, which normally calls ``transform(X)``
      without any extra keyword arguments;
    * per call, ``transform(X, col=[...])`` -- the original calling convention,
      still supported. A per-call ``col`` overrides the constructor value.

    ``col`` may be a single column name or any iterable of column names.
    """

    def __init__(self, col=None):
        # kept exactly as passed: sklearn's get_params/set_params/clone rely on
        # constructor arguments being stored unmodified under the same name
        self.col = col

    def fit(self, X=None, y=None, **fit_params):
        """
        Nothing is learned; returns self.

        Accepts (X, y) so the selector works with fit_transform and Pipeline.fit,
        which pass the target as a second positional argument.
        """
        return self

    def transform(self, X, **transform_params):
        """
        Return X restricted to the selected columns, as a data frame.

        Raises KeyError when no columns were given (neither to the constructor
        nor as the ``col`` keyword) or when a requested column is missing from X.
        """
        col = transform_params.get('col', getattr(self, 'col', None))
        if col is None:
            raise KeyError("col: no columns given; pass col=[...] to Selector() or to transform()")
        if isinstance(col, str):
            # a bare string would otherwise be iterated character by character
            col = [col]
        return X[list(col)]
    
class AuthorizationFeatures(BaseEstimator, TransformerMixin):

    """
    Has this card ever had a transaction that was not authorised?

    transform() returns a data frame with card_id as index and a single
    integer column called ever_declined (1 = at least one transaction of the
    card has an authorized_flag containing the letter 'n', case-insensitive;
    0 = none does).
    """

    def fit(self, X=None, y=None, **fit_params):
        """
        Nothing is learned; returns self. Accepts (X, y) so the transformer
        can be used with fit_transform / Pipeline.fit when a target is given.
        """
        return self

    def transform(self, X, **transform_params):
        """
        Build the per-card ever_declined flag from X's card_id and
        authorized_flag columns.

        authorized_flag is expected to hold strings; a missing flag is treated
        as "not declined".
        """
        # Flag each row first, then reduce per card. This avoids concatenating
        # every flag string of a card into one long string just to search it.
        declined = X['authorized_flag'].str.lower().str.contains('n', regex=False, na=False)
        per_card = declined.groupby(X['card_id']).any()
        return per_card.astype('int64').to_frame(name='ever_declined')
        
class CategoricalFeatures(BaseEstimator, TransformerMixin):

    """
    Create per-card count features from categorical columns.

    Every column of X other than card_id is one-hot encoded (new columns are
    named cat_<column>_<value>), and the indicator columns are then summed per
    card_id. The result is indexed by card_id and each cell holds how many
    rows of that card had that value.

    NOTE(review): nothing is remembered in fit(), so the set of output columns
    depends on the values present in the frame passed to transform(). Two
    different frames (e.g. train and test) can therefore produce different
    columns -- confirm this is acceptable for downstream models.
    """

    def fit(self, X=None, y=None, **fit_params):
        """
        Nothing is learned; returns self. Accepts (X, y) so the transformer
        can be used with fit_transform / Pipeline.fit when a target is given.
        """
        return self

    def transform(self, X, **transform_params):
        """
        One-hot encode all non-card_id columns of X and sum them per card_id.
        """
        # card_id is the grouping key, not a feature
        cat_cols = [c for c in X.columns if c != 'card_id']
        dummies = pd.get_dummies(X, prefix={c: f'cat_{c}' for c in cat_cols}, columns=cat_cols)
        return dummies.groupby('card_id').sum()


class PurchaseDateFeatures(BaseEstimator, TransformerMixin):

    """
    Some possibly important features related to the purchase date.

    transform() turns the purchase_date column of X into a data frame with
    one row per input row and the columns month, weekday, mall_hrs, buss_hrs.
    """

    # Record type returned by get_date_features. Built once at class level:
    # creating a namedtuple class is costly and get_date_features runs per row.
    DateFeatures = namedtuple('DateFeatures', 'month weekday mall_hrs buss_hrs')

    # (first hour, last hour) of the day; both ends are inclusive, so e.g. any
    # time from 22:00 to 22:59 still counts as mall hours.
    # NOTE(review): confirm the inclusive upper bound is intended.
    MALL_HOURS = (10, 22)
    BUSINESS_HOURS = (8, 18)

    def fit(self, X, y=None, **fit_params):
        """
        Nothing is learned; returns self.
        """
        return self

    def get_date_features(self, dt):

        """
        Extract potentially useful features from a single purchase date.

        dt is anything arrow.get() can parse. Returns a DateFeatures
        namedtuple: month and weekday as reported by arrow, plus 0/1 flags
        telling whether the hour of day falls inside MALL_HOURS and
        BUSINESS_HOURS respectively.
        """

        _dt = arrow.get(dt)
        hr = _dt.hour

        mall_open, mall_close = self.MALL_HOURS
        buss_open, buss_close = self.BUSINESS_HOURS

        return self.DateFeatures(month=_dt.month,
                                 weekday=_dt.weekday(),
                                 mall_hrs=int(mall_open <= hr <= mall_close),
                                 buss_hrs=int(buss_open <= hr <= buss_close))

    def transform(self, X, **transform_params):
        """
        Compute the date features for every row of X['purchase_date'].

        The result keeps X's index, so it can be aligned with X (and its
        card_id) even when X is a filtered or re-ordered slice. The four
        columns are always present, including for an empty X.
        """
        features = X['purchase_date'].apply(self.get_date_features)
        return pd.DataFrame(features.tolist(),
                            index=features.index,
                            columns=list(self.DateFeatures._fields))
    
class Elo:

    """
    Holds the four CSV tables used in this notebook as data frames:

    hist_trans  -- historical_transactions.csv.gz
    new_trans   -- new_merchant_transactions.csv.gz
    merchants   -- merchants.csv.gz
    train       -- train.csv.gz

    plus CAT, a mapping from each categorical column name to that name with a
    '_cat' suffix.
    """

    def __init__(self, data_dir='data'):
        """
        Read all tables from data_dir (default: the 'data' directory relative
        to the working directory) and print the size of each one.
        """
        # local import: pathlib is not among the notebook's top-level imports
        from pathlib import Path

        self._data_dir = Path(data_dir)

        self.hist_trans = self._read('historical_transactions.csv.gz')
        n_cards = self.hist_trans['card_id'].nunique(dropna=False)
        print(f'historical transactions: {len(self.hist_trans):,} rows / {n_cards:,} cards')

        self.new_trans = self._read('new_merchant_transactions.csv.gz')
        print(f'new transactions: {len(self.new_trans):,} rows')

        self.merchants = self._read('merchants.csv.gz')
        print(f'merchants: {len(self.merchants):,} rows')

        self.train = self._read('train.csv.gz')
        print(f'train: {len(self.train):,} rows')

        # categorical columns
        self.CAT = {col: col + '_cat' for col in 'authorized_flag city_id category_1 category_2 category_3 merchant_category_id state_id subsector_id'.split()}

    def _read(self, file_name):
        """Read one CSV file from the data directory into a data frame."""
        return pd.read_csv(self._data_dir / file_name)

In [20]:
# Read all CSV tables once (the historical transactions alone are ~29M rows);
# later cells take their data from `elo`.
if __name__ == '__main__':
    
    elo = Elo()

historical transactions: 29,112,361 rows / 325,540 cards
new transactions: 1,963,031 rows
merchants: 334,696 rows
train: 201,917 rows


In [21]:
# Transformer that one-hot encodes the categorical columns and sums them per card_id.
cf = CategoricalFeatures()

In [22]:
# Take the first 50 rows of the historical transactions as a small working sample.
d = elo.hist_trans.iloc[:50,:]

# Selector learns nothing in fit(); the columns to keep are passed to transform().
s = Selector()
s.fit(d)
gg = s.transform(d, col=['card_id', 'city_id', 'state_id'])

In [23]:
# Preview the selected columns.
gg.head()

Unnamed: 0,card_id,city_id,state_id
0,C_ID_4e6213e9bc,88,16
1,C_ID_4e6213e9bc,88,16
2,C_ID_4e6213e9bc,88,16
3,C_ID_4e6213e9bc,88,16
4,C_ID_4e6213e9bc,88,16


In [24]:
# One row per card_id; each cat_<column>_<value> cell counts the sample rows with that value.
cf.fit(gg)
cf.transform(gg)

Unnamed: 0_level_0,cat_city_id_-1,cat_city_id_3,cat_city_id_69,cat_city_id_88,cat_city_id_233,cat_city_id_333,cat_state_id_-1,cat_state_id_9,cat_state_id_16
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
C_ID_4e6213e9bc,3,4,2,38,1,2,3,5,42


In [25]:
# Names of the generated feature columns (this re-runs the transform on the sample).
cf.transform(gg).columns

Index(['cat_city_id_-1', 'cat_city_id_3', 'cat_city_id_69', 'cat_city_id_88',
       'cat_city_id_233', 'cat_city_id_333', 'cat_state_id_-1',
       'cat_state_id_9', 'cat_state_id_16'],
      dtype='object')