In [None]:
import pandas as pd
import os

## Load (merchant, card)-ids from data files

In [None]:
# Only the id (and, for train, the target) columns are needed below, so let the
# CSV parser skip every other column instead of loading and then discarding it.
# The trailing [[...]] keeps the column order explicit, because `usecols`
# returns columns in file order rather than in the order listed.
df_train = pd.read_csv('../input/train.csv', usecols=['card_id', 'target'])[['card_id', 'target']]
df_test = pd.read_csv('../input/test.csv', usecols=['card_id'])[['card_id']]

In [None]:
# Only (card_id, merchant_id) is used in this notebook. Passing `usecols` makes
# the parser skip all other columns, so they are never materialised in memory.
# The trailing [[...]] keeps the column order explicit, because `usecols`
# returns columns in file order rather than in the order listed.
_ID_COLUMNS = ['card_id', 'merchant_id']
df_hist = pd.read_csv('../input/historical_transactions.csv', usecols=_ID_COLUMNS)[_ID_COLUMNS]
df_new = pd.read_csv('../input/new_merchant_transactions.csv', usecols=_ID_COLUMNS)[_ID_COLUMNS]

In [None]:
# Plain Python lists (one entry per row) of the id columns used throughout the notebook.

# `card_id`:s in historical vs new transaction data
hist_card_ids, new_card_ids = (list(frame['card_id']) for frame in (df_hist, df_new))

# `card_id`:s in train vs test data
train_card_ids, test_card_ids = (list(frame['card_id']) for frame in (df_train, df_test))

# `merchant_id`:s in historical vs new transaction data
hist_merchant_ids, new_merchant_ids = (list(frame['merchant_id']) for frame in (df_hist, df_new))

# Are `card_id` and `merchant_id` always defined?

In [None]:
import math
def set_contains_nan(xs):
    """Return True if any element of `xs` is a floating-point NaN.

    Parameters
    ----------
    xs : iterable
        Values to inspect. Strings are accepted and never count as NaN;
        every other element is passed to `math.isnan`.

    Returns
    -------
    bool
        True as soon as the first NaN is found, False if there is none
        (including when `xs` is empty).

    Raises
    ------
    TypeError
        If a non-string element that `math.isnan` cannot handle (e.g. None)
        is reached before a NaN is found.
    """
    def is_nan(x):
        # math.isnan() raises TypeError on strings, and a string is never NaN.
        if isinstance(x, str):
            return False
        return math.isnan(x)

    # Generator expression (not a set comprehension): any() can then stop at the
    # first NaN instead of first evaluating every element into a throw-away set.
    return any(is_nan(x) for x in xs)

# Sanity checks: a set of strings has no NaN; adding float('nan') is detected.
assert not set_contains_nan({"x", "y"})
assert set_contains_nan({"x", "y", float('nan')})

In [None]:
# `card_id` is never None in any of the tables
# (one flag per table: historical, new, train, test)
id_set_list = [
    hist_card_ids,
    new_card_ids,
    train_card_ids,
    test_card_ids,
]
list(map(set_contains_nan, id_set_list))

In [None]:
# `merchant_id` columns contain NaN/NULL values in both tables
# (flags are in the order: new_merchant_transactions, historical_transactions)
list(map(set_contains_nan, (new_merchant_ids, hist_merchant_ids)))

# Relation between `card_id`:s in different tables

In [None]:
# Number of distinct card ids per table, printed as an aligned list.
for label, card_ids in (
    ("nr unique card id:s in training                    ", train_card_ids),
    ("nr unique card id:s in test                        ", test_card_ids),
    ("nr unique card id:s in historical_transactions     ", hist_card_ids),
    ("nr unique card id:s in new_merchant_transactions   ", new_card_ids),
):
    print(label, len(set(card_ids)))

In [None]:
# test and training do not contain any common card_id
assert set(test_card_ids).isdisjoint(train_card_ids)

In [None]:
# test and train do not contain any duplicate card id:s
# (a list with duplicates is longer than the set built from it)
assert all(
    len(card_ids) == len(set(card_ids))
    for card_ids in (test_card_ids, train_card_ids)
)

In [None]:
# `historical_transactions` contain transactions about exactly the cards appearing in test+train
assert set(hist_card_ids) == set(test_card_ids).union(train_card_ids)

In [None]:
# all card_id:s in `new_merchant_transactions` are also present in `historical_transactions`
# (`<=`, not `<`: the claim is plain inclusion, so it must also hold if both
# tables happen to cover exactly the same cards)
assert set(new_card_ids) <= set(hist_card_ids)

# Relation between `merchant_id`:s in different tables

In [None]:
# Number of distinct merchant ids per transaction table.
for label, merchant_ids in (
    ("nr unique merchant id:s in historical_transactions     ", hist_merchant_ids),
    ("nr unique merchant id:s in new_merchant_transactions   ", new_merchant_ids),
):
    print(label, len(set(merchant_ids)))

In [None]:
# `new_merchant_transactions` contains merchants not present in `historical_transactions`
# (`not new <= hist`: with a strict `<` the assertion would also pass when both
# tables contain exactly the same merchants, which would contradict the claim)
assert not set(new_merchant_ids) <= set(hist_merchant_ids)

In [None]:
# Build each id set once: the id lists hold one entry per table row, so
# re-creating the sets for every operation repeats the same full pass.
new_merchant_set = set(new_merchant_ids)
hist_merchant_set = set(hist_merchant_ids)

set_A = new_merchant_set - hist_merchant_set   # only in new_merchant_transactions
set_B = new_merchant_set & hist_merchant_set   # in both tables
set_C = new_merchant_set | hist_merchant_set   # in either table
set_D = hist_merchant_set - new_merchant_set   # only in historical_transactions

# The backslash (set difference) is written as `\\`: a bare "\ " is an invalid
# escape sequence and triggers a warning on recent Python versions. The printed
# text is unchanged.
print("A: nr unique merchant id:s in new_merchant_transactions \\ historical_transactions ", len(set_A))
print("B: nr unique merchant id:s in new_merchant_transactions & historical_transactions ", len(set_B))
print("C: nr unique merchant id:s in new_merchant_transactions | historical_transactions ", len(set_C))
print("D: nr unique merchant id:s in historical_transactions \\ new_merchant_transactions ", len(set_D))

# Purchases in `new_merchant_transactions` are new

The analysis below motivates why the transactions in `new_merchant_transactions` are called "new".

In [None]:
# String form of every (card_id, merchant_id) row, one entry per transaction.
new_pairs = list(map(str, zip(new_card_ids, new_merchant_ids)))
hist_pairs = list(map(str, zip(hist_card_ids, hist_merchant_ids)))

In [None]:
# Peek at the first three stringified (card_id, merchant_id) pairs.
hist_pairs[:3]

All transactions in `new_merchant_transactions` represent purchases where the card holder buys something from a merchant that the card holder has not visited before (at least based on transactions in historical_transactions):

In [None]:
# Display the (card_id, merchant_id) pairs that occur in both transaction tables.
set(new_pairs) & set(hist_pairs)

# Relation to target variable

For each `card_id`, let's explore the set of visited `merchant_id`:s in `historical_transactions` vs `new_merchant_transactions`, and the relation between these sets to the target variable.

In [None]:
def make_lookup(df):
    """Map each `card_id` in `df` to the set of `merchant_id` values on its rows.

    `df` must have the columns `card_id` and `merchant_id`. The result is a
    plain dict: `card_id` -> set of `merchant_id`:s visited by the card holder.
    """
    merchants_by_card = df.groupby('card_id')['merchant_id']
    return {card_id: set(merchants) for card_id, merchants in merchants_by_card}

In [None]:
# card_id -> set of merchant ids, once per transaction table
hist_lookup, new_lookup = make_lookup(df_hist), make_lookup(df_new)

In [None]:
# One feature row per training card, describing the merchants the card holder visited.
res = []
# dict.fromkeys() de-duplicates while keeping list order, so the row order of
# `df_ratio` is the same after every kernel restart (iterating a set of strings
# is not, because string hashes are randomised per process).
for card_id in dict.fromkeys(train_card_ids):
    # merchant id:s in historical_transactions (empty set if the card has none)
    hist_set = hist_lookup.get(card_id, set())
    # merchant id:s in new_merchant_transactions (empty set if the card has none)
    new_set = new_lookup.get(card_id, set())
    # union computed once and reused for both `nr_total` and the ratio
    nr_total = len(new_set | hist_set)

    res.append(
        {
            'card_id': card_id,
            # nr of merchant id:s visited by card holder in historical_transactions
            'nr_hist': len(hist_set),
            # nr of merchant id:s visited by card holder in new_merchant_transactions
            'nr_new': len(new_set),
            # total nr of merchant id:s visited by card holder
            'nr_total': nr_total,
            # nr of merchant id:s visited by card holder in both historical_transactions and new_merchant_transactions
            # This is always zero (see above)
            'nr_common': len(new_set & hist_set),
            # ratio of new to total, as a whole percentage (truncated);
            # 0 when the card has no merchants at all, to avoid dividing by zero
            'ratio_new_to_total': int(100 * len(new_set) / nr_total) if nr_total else 0,
        }
    )
df_ratio = pd.DataFrame(res)

In [None]:
# add `target` variable
# Any `target` column left over from an earlier run of this cell is dropped
# first; otherwise re-running the cell would produce `target_x` / `target_y`
# instead of `target`. `validate` makes pandas raise if `card_id` is not unique
# on both sides of the join.
df_ratio = (
    df_ratio
    .drop(columns=['target'], errors='ignore')
    .merge(df_train, on='card_id', how='inner', validate='one_to_one')
)
# the inner join must not lose any card
assert len(df_ratio) == len(res)
# no card visits the same merchant in both transaction tables
assert set(df_ratio['nr_common']) == set([0])

## Plotting/summary

In [None]:
# Plotting setup: seaborn with the "ticks" style.
import seaborn as sns
sns.set(style="ticks")
# Silence FutureWarning messages so they do not clutter the cell outputs below.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

The plot below shows that (per `card_id`) the target variable **decreases** as `nr_new` (= nr of merchant id:s visited by card holder in new_merchant_transactions) **increases**, i.e. the target depends on `nr_new`.

This dependency of the target variable on `nr_new` makes sense in view of the competition [description](https://www.kaggle.com/c/elo-merchant-category-recommendation) (emphasis added):

> Right now, Elo, one of the largest payment brands in Brazil, has built partnerships with merchants in order to offer promotions or discounts to cardholders. **But do these promotions work for either the consumer or the merchant?** Do customers enjoy their experience? Do merchants see repeat business? Personalization is key.

This suggests that it will be beneficial to add features (per `card_id`) that predict how likely the card holder is to visit new merchants (or possibly more specifically: visit merchants that are being promoted, if this information is included in some of the anonymous variables).

In [None]:
# target vs. nr_new, restricted to cards with fewer than 50 new merchants
sns.lineplot(
    data=df_ratio.loc[df_ratio["nr_new"] < 50],
    x="nr_new",
    y="target",
    markers=True,
    dashes=False,
    label="nr_new",
)

It is instructive to plot the above graph together with `nr_hist`, which was the 5th most important feature (labeled `hist_merchant_id_nunique`) in 
Peter Hurford's [You're Going to Want More Categories](https://www.kaggle.com/peterhurford/you-re-going-to-want-more-categories-lb-3-737). Strangely (?), the `nr_new` plotted above (and labeled `new_merchant_id_nunique` in the linked notebook) is at rank 42.

In [None]:
# Both curves share one set of axes; each is restricted to counts below 50.
sns.lineplot(
    data=df_ratio.loc[df_ratio["nr_new"] < 50],
    x="nr_new",
    y="target",
    markers=True,
    dashes=False,
    label="nr_new",
)

sns.lineplot(
    data=df_ratio.loc[df_ratio["nr_hist"] < 50],
    x="nr_hist",
    y="target",
    markers=True,
    dashes=False,
    label="nr_hist",
)

As the graph shows, `target` also has a slight negative dependence on `nr_hist`, but not as strong as for `nr_new`. It is very possible that the plot misses some other dependency. I.e., there might be a stronger dependency on `nr_hist`, but to see this one would need to include additional variables (?).

In [None]:
# Combining new vs hist we can also create a ratio that shows the same decreasing dependency:
# (only cards whose ratio is below 75 percent are plotted)
sns.lineplot(
    data=df_ratio.loc[df_ratio["ratio_new_to_total"] < 75],
    x="ratio_new_to_total",
    y="target",
    markers=True,
    dashes=False,
)

In [None]:
# Check: express target=-0.5 in terms of percentiles
# Evaluates the 34.35th percentile of the training targets; presumably 34.35 was
# found by trial so that the result lands near -0.5 — TODO confirm.
import numpy as np
np.percentile(df_train['target'], 34.35)