In [1]:
import numpy as np
import pandas as pd
from os.path import exists
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model

In [17]:
# Preprocess transactions
def transaction_preprocess(transactions, train_bounds, test_bounds, time_period = 125):
    """
    args:
        transactions: pd.DataFrame -> the transaction_history.csv provided by Siemens
        train_bounds, test_bounds: (lower: int, upper: int) -> bounds for splitting dataset into train and test based on "day" feature
        time_period: map day feature into month, years, etc. (default 125 days = 4 months)
    returns:
        transactions: pd.DataFrame -> processed dataframe with encoded categories and mapped days
        train_rnd_sample: pd.DataFrame -> all positive samples within train as well as an equal number of randomly sampled negative samples
        test_rnd_sample: pd.DataFrame -> 
    """
    transactions = transactions.copy() 
    transactions["day"] = transactions["day"] // time_period # map days to weeks, months, years
    
    # encode categorical variables
    transactions["prod"] = transactions["prod"].astype('category').cat.codes 
    transactions["customer"] = transactions["customer"].astype('category').cat.codes
    transactions["state"] = transactions["state"].astype('category').cat.codes + 1
    transactions["ind_code"] = transactions["ind_code"].astype('category').cat.codes + 1
    transactions["ind_seg_code"] = transactions["ind_seg_code"].astype('category').cat.codes + 1
    
    # set labels to 1 - all products present were bought
    transactions["label"] = 1
    
    # create product customer list
    transactions["prod_customer"] = list(zip(transactions["customer"], transactions["prod"]))
    # drop rows with missing values
    transaction = transactions.dropna()
    
    # create train & test set
    train = transactions[(transaction.day >= train_bounds[0] // time_period) & (transactions.day < train_bounds[1] // time_period)]
    test = transactions[(transaction.day >= test_bounds[0] // time_period) & (transactions.day < test_bounds[1] // time_period)]
    
    # get unique customer and product lists
    customers = transactions["customer"].unique()
    products = transactions["prod"].unique()
    
    # set of day-customer-bought product for negative sampling
    hit_combo = set(list(zip(transactions["day"], transactions["customer"], transactions["prod"])))
    
    # get days and customers
    test_days = test["day"].unique()
    hit_combo_2 = list(zip(test["day"], test["customer"]))
    
    # create simple test generator for customer (returns: pd.DataFrame with all days in test with all products for customer)
    # generator necessary on my PC due to memory constraints: might not be on yours
    prod_list = set(products)
    test_customer_prod_day = set()
    def test_customer_generator(customer_id):
        final_neg_sampl = test[test.customer == customer_id][["customer", "state", "ind_code", "ind_seg_code"]].drop_duplicates() # customer info - customer_id unique identifier of customer
        final_neg_sampl = final_neg_sampl.merge(pd.DataFrame(test_days, columns=["day"]), how="cross") # cartezian product of day and customer info
        final_neg_sampl = final_neg_sampl.merge(pd.DataFrame(products, columns=["prod"]), how="cross")
        final_neg_sampl["label"] = 0
        for ind, row in final_neg_sampl.iterrows():
            print((row["prod"], row["day"]))
            if (row["prod"], row["day"]) in test_customer_prod_day:
                row["label"] = 1
        return final_neg_sampl
            
    prod_samp_size = 2000
    prod_rnd_sample = np.array([np.random.choice(products, size=prod_samp_size * len(hit_combo_2))]).T
    test_rnd_sample = [(tday, tcustomer, prod_rnd_sample[i][0]) for i in range(prod_samp_size) for tday, tcustomer in hit_combo_2]


    not_faulty = [tuple(test_rnd_sample[row]) not in hit_combo for row in range(len(test_rnd_sample))]
    test_rnd_sample = np.array(test_rnd_sample)
    test_rnd_sample = test_rnd_sample[not_faulty]

    # Random sample
    test_rnd_sample = pd.DataFrame(test_rnd_sample,
                                       columns=["day", "customer", "prod"]).drop_duplicates()
    test_rnd_sample = test_rnd_sample.merge(transactions[["customer", "domestic", "state", "ind_code",
                                                                "ind_seg_code"]].drop_duplicates(),
                                                  on="customer")
    test_rnd_sample["label"] = 0
    test_rnd_sample = test_rnd_sample.append(test[["day", "customer", "prod", "domestic", "state", "ind_code",
                                                                "ind_seg_code", "label"]], ignore_index=True)
    
    SAMPLE_SIZE = int(np.ceil(1.0 * len(train)))
    train_days = train["day"].unique()
    
    train_rnd_sample = np.array([np.random.choice(train_days, size=SAMPLE_SIZE),\
                                              np.random.choice(customers, size=SAMPLE_SIZE),\
                                              np.random.choice(products, size=SAMPLE_SIZE)]).T
    
    not_faulty = [tuple(train_rnd_sample[row,:]) not in hit_combo for row in range(train_rnd_sample.shape[0])]
    train_rnd_sample = train_rnd_sample[not_faulty]
    
    # Random sample
    train_rnd_sample = pd.DataFrame(train_rnd_sample,
                                   columns=["day", "customer", "prod"]).drop_duplicates()
    train_rnd_sample = train_rnd_sample.merge(transactions[["customer", "domestic", "state", "ind_code",
                                                            "ind_seg_code"]].drop_duplicates(),
                                              on="customer")
    train_rnd_sample["label"] = 0
    train_rnd_sample = train_rnd_sample.append(train[["day", "customer", "prod", "domestic", "state", "ind_code",
                                                            "ind_seg_code", "label"]], ignore_index=True)
    
    return transactions, train_rnd_sample.sample(frac=1).reset_index(drop=True), test_customer_generator, products, customers

In [None]:
transactions = pd.read_csv("dataset/transaction_history.csv")
transformations = []
for i in range(6):
    transformed, train, test_generator, products, customers = transaction_preprocess(transactions, (i*125, (i+4)*125), ((i+4)*125, (i+5)*125))
    transformations.append((train, test_generator))
train, test_generator, products, customers = transformations[0]

In [11]:
transformed, train, test_generator, products, customers =  transformations[0]
test_generator(0)

Unnamed: 0,customer,state,ind_code,ind_seg_code,day,prod,label
0,0,5,12,23,4,6474,0
1,0,5,12,23,4,10082,0
2,0,5,12,23,4,10201,0
3,0,5,12,23,4,10198,0
4,0,5,12,23,4,10939,0
...,...,...,...,...,...,...,...
12299,0,5,12,23,4,558,0
12300,0,5,12,23,4,2813,0
12301,0,5,12,23,4,2809,0
12302,0,5,12,23,4,2703,0


In [12]:
transformed, train, test_generator, products, customers =  transformations[1]
test_generator(0)

Unnamed: 0,day,customer,prod,domestic,state,ind_code,ind_seg_code,label
0,0,416,2896,True,7,12,23,0
1,3,1373,6804,True,8,16,31,0
2,3,948,4305,True,5,2,2,0
3,1,2596,8368,True,5,12,23,1
4,2,1374,3080,True,3,16,30,0
...,...,...,...,...,...,...,...,...
453877,2,2170,6429,True,8,22,42,0
453878,0,2461,5331,True,5,4,7,0
453879,2,2016,8122,True,7,0,0,1
453880,0,350,2900,True,5,5,10,0


In [16]:
for i in customers:
    test_generator(i)

In [14]:
customers

array([  48, 1472,  229, ..., 2493, 1064, 2033], dtype=int16)