In [5]:
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
import arrow
from collections import Counter, defaultdict, namedtuple
from itertools import chain
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import LogisticRegression, BayesianRidge

In [6]:
class Selector(BaseEstimator, TransformerMixin):
    
    """
    pick a specific colums of data frame X for further use within a pipeline
    """
    
    def __init__(self, cols):
        
        self.cols = cols
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, **transform_params):
        # note that transform_params is expected to have "cols", e.g. cols=['card_id', 'city_id', 'state_id']
        return X[[w for w in chain(self.cols)]]
    
class AuthorizationFeatures(BaseEstimator, TransformerMixin):
    
    """
    have this card even been unauthorised? returns a data frame with card_id as index and a single column called ever_declined
    """
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[['card_id', 'authorized_flag']] \
                    .groupby(['card_id']).sum().iloc[:,0].apply(lambda x: 1 if 'n' in x.lower() else 0).to_frame(name='ever_declined')
        
class CategoricalFeatures(BaseEstimator, TransformerMixin):
    
    """
    create binary features from categorical
    """
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return pd.get_dummies(X, prefix={c: f'cat_{c}' for c in X.columns if c != 'card_id'}, 
                                  columns=[c for c in X.columns if c != 'card_id']).groupby('card_id').sum()


class PurchaseDateFeatures(BaseEstimator, TransformerMixin):
    
    """
    some possibly important features related to the purchase date
    """
    
    def fit(self, X, y=None):
        return self
    
    def get_date_features(self, dt):
        
        """
        extract potentially useful fetures from a purchase date string
        """
        
        DateFeatures = namedtuple('DateFeatures', 'month weekday mall_hrs buss_hrs')
        
        mall_hours = (10,22)
        business_hours = (8,18)
        
        _dt = arrow.get(dt)
        
        hr = _dt.hour
        
        # the features below are per transation while we'll need to switch to per card
        out = DateFeatures(month=_dt.month, weekday=_dt.weekday(), 
                               mall_hrs = 1 if mall_hours[0] <= hr <= mall_hours[1] else 0,
                                  buss_hrs = 1 if business_hours[0] <= hr <= business_hours[1] else 0)
        
        return out
    
    def transform(self, X):
        
        _d = pd.concat([X['card_id'], pd.DataFrame(X['purchase_date'] \
                                                   .apply(lambda _: self.get_date_features(_)).tolist()).set_index(X.index)], axis=1)
        
        return _d[['card_id', 'month', 'weekday']] \
                    .groupby(['card_id', 'month']) \
                    .count() \
                    .reset_index() \
                    .set_index('card_id') \
                    .pivot(columns='month', values='weekday') \
                    .fillna(0) \
                    .rename(columns=lambda x: '_'.join(['ntrans', arrow.get(str(x), 'M').format('MMM').lower()]))
    
class Elo:
    
    def __init__(self, n=10000):
        
        self.hist_trans = pd.read_csv('data/historical_transactions.csv.gz')
        print(f'historical transactions: {len(self.hist_trans):,} rows / {len(self.hist_trans.card_id.unique()):,} cards')
        self.new_trans = pd.read_csv('data/new_merchant_transactions.csv.gz')
        print(f'new transactions: {len(self.new_trans):,} rows')
        self.merchants = pd.read_csv('data/merchants.csv.gz')
        print(f'merchants: {len(self.merchants):,} rows')
        self.train = pd.read_csv('data/train.csv.gz')
        print(f'train: {len(self.train):,} rows')
        # categorical columns
        self.CAT = {col: col + '_cat' for col in 'authorized_flag city_id category_1 category_2 category_3 merchant_category_id state_id subsector_id'.split()}
        # take only a sample of the train set
        self.train = self.train.sample(n)
        # historical transactions only for the cards in the training set 
        self.hist_trans = self.hist_trans[self.hist_trans.card_id.isin(self.train.card_id)]
        
        self.train1 = self.train.join(self.hist_trans.set_index('card_id'), on='card_id', how='inner')

In [7]:
if __name__ == '__main__':
    
    elo = Elo()
    
    X_train = elo.train1.drop('target', axis=1)
    y_train = elo.train['target']   
    
    estimator = make_pipeline(make_union(AuthorizationFeatures(), 
                                         make_pipeline(Selector(cols=['card_id', 'city_id', 'state_id']), CategoricalFeatures()), 
                                         PurchaseDateFeatures()), 
                              BayesianRidge())
    
    estimator.fit(X_train, y_train)
    
    mse = mean_squared_error(y_train, estimator.predict(X_train))
    
    print(f'accuracy at training is {mse}')

historical transactions: 29,112,361 rows / 325,540 cards
new transactions: 1,963,031 rows
merchants: 334,696 rows
train: 201,917 rows
accuracy at training is 15.096625736188283
