In [1]:
import os
import numpy as np
import random as rn
import datetime
import gc;

import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 300)

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error, log_loss
import lightgbm as lgb

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
plt.style.use('ggplot')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# libs for FFM embedding (absent in final model)
import tensorflow as tf
from keras.layers.normalization import BatchNormalization
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dot, Reshape, Add, Subtract
from keras import objectives
from keras import backend as K
from keras import regularizers 
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2

Using TensorFlow backend.


In [2]:
def init_seeds(seed):
    """Seed the Python, NumPy and TensorFlow RNGs and install a Keras session.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed``, ``numpy.random.seed`` and
        ``tf.set_random_seed``.

    Returns
    -------
    The TensorFlow session that was created on the default graph and
    registered with the Keras backend via ``K.set_session``.

    Notes
    -----
    ``PYTHONHASHSEED`` is always set to ``'0'``, independent of ``seed``.
    NOTE(review): assigning it after the interpreter has started presumably
    only affects child processes -- confirm if hash ordering matters here.
    """
    os.environ['PYTHONHASHSEED'] = '0'

    # Seed the pure-Python and NumPy generators.
    rn.seed(seed)
    np.random.seed(seed)

    # Give TensorFlow's random number generation a fixed seed as well.
    tf.set_random_seed(seed)

    # Restrict TensorFlow to one thread per op and across ops:
    # multiple threads are a potential source of non-reproducible results.
    single_thread_conf = tf.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )

    session = tf.Session(graph=tf.get_default_graph(), config=single_thread_conf)
    K.set_session(session)
    return session

# Seed every RNG once with a fixed seed and keep the session returned by init_seeds.
sess = init_seeds(seed=0)

In [3]:
%%time

def load_transactions(path_to_file):
    """Read a transactions CSV, keeping only the card id and purchase date.

    Parameters
    ----------
    path_to_file : str or path-like
        Location of the CSV; it must contain ``card_id`` and
        ``purchase_date`` columns.

    Returns
    -------
    pandas.DataFrame
        Frame with the ``card_id`` and ``purchase_date`` columns, the latter
        converted with ``pd.to_datetime``.
    """
    date_col = 'purchase_date'
    # Only two columns are read; every other column in the file is skipped.
    frame = pd.read_csv(path_to_file, usecols=['card_id', date_col])
    frame[date_col] = pd.to_datetime(frame[date_col])
    return frame

# Load the historical and the new-merchant transaction files, in that order.
old_trans, new_trans = map(
    load_transactions,
    ('all/historical_transactions.csv', 'all/new_merchant_transactions.csv'),
)

CPU times: user 50.9 s, sys: 15.4 s, total: 1min 6s
Wall time: 1min 16s


In [6]:
# Stack both transaction sets so the latest purchase is taken over all of them.
trans = pd.concat([old_trans, new_trans])

# One row per card: the most recent purchase timestamp, exposed as `reference_date`.
reference_date = (
    trans.groupby('card_id')['purchase_date']
    .max()
    .rename('reference_date')
    .reset_index()
)

# Show the first and last three rows as a quick sanity check.
print(
    pd.concat([
        reference_date.head(3),
        reference_date.tail(3),
    ])
)

                card_id      reference_date
0       C_ID_00007093c1 2018-04-09 16:23:59
1       C_ID_0001238066 2018-04-30 19:57:30
2       C_ID_0001506ef0 2018-03-22 09:14:30
325537  C_ID_ffff756266 2018-04-10 07:43:43
325538  C_ID_ffff828181 2018-04-29 18:59:29
325539  C_ID_fffffd5772 2018-03-27 13:45:10


In [None]:
# Persist the per-card reference dates to disk.
# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output folder exists first (a no-op when it is already there).
os.makedirs('saved_features_24-02', exist_ok=True)
# NOTE(review): the default integer index is written as an unnamed first
# column; left unchanged in case later readers rely on `index_col=0`.
reference_date.to_csv('saved_features_24-02/reference_date.csv')