# Dataset - merchants.csv:
- id features:
    - these need some preprocessing, possibly conversion to embeddings
- numeric features:
    - some values are infinite and need to be dealt with
    - a few features dominate in scale, so numeric features should be standardized
- categorical features:
    - a suitable categorical encoding should be devised
- NAs:
    - the fraction of NAs is very small, so simple imputation with the mean or mode should suffice
- duplicates:
    - 63 rows with a duplicated merchant_id are dropped (first occurrence kept)

In [1]:
import pandas as pd
from IPython.display import display, HTML
def show_table(df, n = 5):
    """Render the first ``n`` rows of ``df`` as an HTML table in the notebook."""
    # Slice first, then convert only the preview rows to HTML.
    preview_html = df[:n].to_html()
    return display(HTML(preview_html))

# Machine-specific absolute path to the local copy of merchants.csv.
merchants_path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\merchants.csv"
# Load the full file into a DataFrame.
merchants = pd.read_csv(merchants_path)

In [2]:
# Column groups of merchants.csv; the cells below select columns through these lists.
# Identifier-like columns.
id_features = ['merchant_id', 'merchant_group_id', 'merchant_category_id','subsector_id']
# Columns treated as categorical.
categoric_features = ['category_1','most_recent_sales_range', 'most_recent_purchases_range','category_4', 'city_id', 'state_id', 'category_2']
# Columns treated as numeric.
numeric_features = ['numerical_1', 'numerical_2','avg_sales_lag3', 'avg_purchases_lag3','avg_sales_lag6', 'avg_purchases_lag6','avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag3','active_months_lag6','active_months_lag12']
# Disabled sanity check: the three groups together should cover every column of `merchants`.
# len(set(id_features).union(set(categoric_features)).union(set(numeric_features))) == len(merchants.columns)

In [3]:
show_table(merchants[id_features])

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id
0,M_ID_838061e48c,8353,792,9
1,M_ID_9339d880ad,3184,840,20
2,M_ID_e726bbae1e,447,690,1
3,M_ID_a70e9c5f81,5026,792,9
4,M_ID_64456c37ce,2228,222,21


In [4]:
show_table(merchants[categoric_features])

Unnamed: 0,category_1,most_recent_sales_range,most_recent_purchases_range,category_4,city_id,state_id,category_2
0,N,E,E,N,242,9,1.0
1,N,E,E,N,22,16,1.0
2,N,E,E,N,-1,5,5.0
3,Y,E,E,Y,-1,-1,
4,Y,E,E,Y,-1,-1,


In [7]:
merchants[categoric_features].nunique()

category_1                       2
most_recent_sales_range          5
most_recent_purchases_range      5
category_4                       2
city_id                        271
state_id                        25
category_2                       5
dtype: int64

In [32]:
show_table(merchants[numeric_features])

Unnamed: 0,numerical_1,numerical_2,avg_sales_lag3,avg_purchases_lag3,avg_sales_lag6,avg_purchases_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag3,active_months_lag6,active_months_lag12
0,-0.057471,-0.057471,-0.4,9.666667,-2.25,18.666667,-2.32,13.916667,3,6,12
1,-0.057471,-0.057471,-0.72,1.75,-0.74,1.291667,-0.57,1.6875,3,6,12
2,-0.057471,-0.057471,-82.13,260.0,-82.13,260.0,-82.13,260.0,2,2,2
3,-0.057471,-0.057471,,1.666667,,4.666667,,3.833333,3,6,12
4,-0.057471,-0.057471,,0.5,,0.361111,,0.347222,3,6,12


In [8]:
merchants[numeric_features].describe()

Unnamed: 0,numerical_1,numerical_2,avg_sales_lag3,avg_purchases_lag3,avg_sales_lag6,avg_purchases_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag3,active_months_lag6,active_months_lag12
count,334696.0,334696.0,334683.0,334696.0,334683.0,334696.0,334683.0,334696.0,334696.0,334696.0,334696.0
mean,0.011476,0.008103,13.832993,inf,21.65079,inf,25.22771,inf,2.994108,5.947397,11.599335
std,1.098154,1.070497,2395.489999,,3947.108,,5251.842,,0.095247,0.394936,1.520138
min,-0.057471,-0.057471,-82.13,0.3334953,-82.13,0.1670447,-82.13,0.09832954,1.0,1.0,1.0
25%,-0.057471,-0.057471,0.88,0.9236499,0.85,0.9022475,0.85,0.8983333,3.0,6.0,12.0
50%,-0.057471,-0.057471,1.0,1.016667,1.01,1.026961,1.02,1.043361,3.0,6.0,12.0
75%,-0.047556,-0.047556,1.16,1.146522,1.23,1.215575,1.29,1.26648,3.0,6.0,12.0
max,183.735111,182.079322,851844.64,inf,1513959.0,inf,2567408.0,inf,3.0,6.0,12.0


### statistics

In [134]:
def show_stats(temp_df, id_cols = None, categoric_cols = None, numeric_cols = None):
    """Summarise every column of ``temp_df`` in a one-row-per-column table.

    Args:
        temp_df: DataFrame to summarise.
        id_cols: Names of identifier columns. Defaults to the notebook-level
            ``id_features`` list.
        categoric_cols: Names of categorical columns. Defaults to the
            notebook-level ``categoric_features`` list.
        numeric_cols: Names of numeric columns. Defaults to the notebook-level
            ``numeric_features`` list.

    Returns:
        A DataFrame indexed by ``temp_df.columns`` with the columns
        ``na_frac`` (fraction of missing values), ``num_categories`` (number
        of distinct values, a missing value counting as one of them; only for
        id/categorical columns) and ``numeric_mean`` / ``numeric_min`` /
        ``numeric_max`` (only for numeric columns). Cells that do not apply to
        a column hold ``pd.NA``.
    """
    # The notebook globals are only looked up when a group is not passed in,
    # so the function can also be used without them being defined.
    if id_cols is None:
        id_cols = id_features
    if categoric_cols is None:
        categoric_cols = categoric_features
    if numeric_cols is None:
        numeric_cols = numeric_features

    # Build the membership sets once instead of concatenating lists per column.
    discrete_cols = set(id_cols) | set(categoric_cols)
    continuous_cols = set(numeric_cols)

    def _numeric_stat(stat_name):
        # Apply one Series reduction (mean/min/max) to the numeric columns only.
        return [
            getattr(temp_df[col], stat_name)() if col in continuous_cols else pd.NA
            for col in temp_df
        ]

    temp_data = {
        "na_frac": (temp_df.isna().sum() / len(temp_df)).tolist(),
        # len(unique()) is kept on purpose: unlike nunique() it counts NaN as a value.
        "num_categories": [
            len(temp_df[col].unique()) if col in discrete_cols else pd.NA
            for col in temp_df
        ],
        "numeric_mean": _numeric_stat("mean"),
        "numeric_min": _numeric_stat("min"),
        "numeric_max": _numeric_stat("max"),
    }
    return pd.DataFrame(temp_data, index = temp_df.columns)
show_stats(merchants)

Unnamed: 0,na_frac,num_categories,numeric_mean,numeric_min,numeric_max
merchant_id,0.0,334633.0,,,
merchant_group_id,0.0,109391.0,,,
merchant_category_id,0.0,324.0,,,
subsector_id,0.0,41.0,,,
numerical_1,0.0,,0.011476,-0.057471,183.735111
numerical_2,0.0,,0.008103,-0.057471,182.079322
category_1,0.0,2.0,,,
most_recent_sales_range,0.0,5.0,,,
most_recent_purchases_range,0.0,5.0,,,
avg_sales_lag3,3.9e-05,,13.832993,-82.13,851844.64


In [91]:
len(merchants)

334696

In [111]:
def drop_duplicates(df, id_col = "merchant_id"):
    """Return ``df`` without the rows whose ``id_col`` value appeared earlier.

    Args:
        df: DataFrame to de-duplicate; it is not modified.
        id_col: Name of the column whose repeated values mark a row as a
            duplicate. Defaults to ``"merchant_id"``.

    Returns:
        A DataFrame holding the first row for every distinct ``id_col`` value,
        with the original index labels preserved.

    Raises:
        KeyError: If ``id_col`` is not a column of ``df``.

    Side effects:
        Prints how many rows were dropped.
    """
    # Look the column up by name so that `id_col` is honoured; the previous
    # version always read `df.merchant_id`, whatever was passed in.
    is_duplicates = df[id_col].duplicated(keep = "first")
    deduplicated = df[~is_duplicates]
    print(f"{is_duplicates.sum()} duplicates dropped")
    return deduplicated

In [112]:
# NOTE: the de-duplicated frame is only displayed here, not assigned back, so
# `merchants` itself still contains the duplicated rows after this cell.
drop_duplicates(merchants)

63 duplicates dropped


Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.057471,-0.057471,N,E,E,-0.40,...,-2.25,18.666667,6,-2.32,13.916667,12,N,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.057471,-0.057471,N,E,E,-0.72,...,-0.74,1.291667,6,-0.57,1.687500,12,N,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.057471,-0.057471,N,E,E,-82.13,...,-82.13,260.000000,2,-82.13,260.000000,2,N,-1,5,5.0
3,M_ID_a70e9c5f81,5026,792,9,-0.057471,-0.057471,Y,E,E,,...,,4.666667,6,,3.833333,12,Y,-1,-1,
4,M_ID_64456c37ce,2228,222,21,-0.057471,-0.057471,Y,E,E,,...,,0.361111,6,,0.347222,12,Y,-1,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334691,M_ID_1f4773aa76,1145,705,33,3.174788,-0.047556,N,A,A,1.00,...,0.99,1.019505,6,1.00,1.024796,12,Y,69,9,1.0
334692,M_ID_725a60d404,35,544,29,-0.057471,-0.057471,Y,A,A,0.89,...,0.78,0.813473,6,0.59,0.606765,12,Y,-1,-1,
334693,M_ID_f2045dd267,35,561,7,-0.057471,-0.057471,N,A,A,0.96,...,0.90,0.924769,6,0.74,0.750763,8,Y,160,21,5.0
334694,M_ID_9139332ccc,35,511,7,-0.057471,-0.057471,Y,A,A,0.94,...,0.82,0.783000,6,0.65,0.584000,12,Y,-1,-1,


In [29]:
merchants.sample(1)

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
79173,M_ID_5a1874b22a,31427,278,37,-0.057471,-0.057471,N,E,D,1.42,...,1.63,1.428363,6,1.71,1.512427,12,N,21,9,1.0


# Dataset - train.csv

In [5]:
import pandas as pd
from IPython.display import display, HTML
def show_table(df, n = 5):
    """Render the first ``n`` rows of ``df`` as an HTML table in the notebook."""
    # Slice first, then convert only the preview rows to HTML.
    preview_html = df[:n].to_html()
    return display(HTML(preview_html))

# Machine-specific absolute path to the local copy of train.csv.
train_path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\train.csv"
# Load the full file into a DataFrame.
train = pd.read_csv(train_path)
# Preview the first rows (show_table defaults to n = 5).
show_table(train)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [9]:
# Every card_id must occur exactly once in train.csv.
assert train.card_id.is_unique

In [27]:
len(train)

201917

# Dataset - test.csv
- card_id
    - none of the card ids in test.csv appear in train.csv

In [29]:
import pandas as pd
from IPython.display import display, HTML
def show_table(df, n = 5):
    """Render the first ``n`` rows of ``df`` as an HTML table in the notebook."""
    # Slice first, then convert only the preview rows to HTML.
    preview_html = df[:n].to_html()
    return display(HTML(preview_html))

# Machine-specific absolute path to the local copy of test.csv.
test_path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\test.csv"
# Load the full file into a DataFrame.
test = pd.read_csv(test_path)
# Preview the first rows (show_table defaults to n = 5).
show_table(test)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04,C_ID_0ab67a22ab,3,3,1
1,2017-01,C_ID_130fd0cbdd,2,3,0
2,2017-08,C_ID_b709037bc5,5,1,1
3,2017-12,C_ID_d27d835a9f,2,1,0
4,2015-12,C_ID_2b5e3df5c2,5,1,1


In [30]:
# Every card_id must occur exactly once in test.csv ...
assert test.card_id.is_unique
# ... and no card_id may be shared between test.csv and train.csv.
assert set(test.card_id.unique()).isdisjoint(train.card_id.unique())

In [31]:
len(test)

123623

# Dataset - historical_transactions.csv:

In [24]:
import pandas as pd
from IPython.display import display, HTML
def show_table(df, n = 5):
    """Render the first ``n`` rows of ``df`` as an HTML table in the notebook."""
    # Slice first, then convert only the preview rows to HTML.
    preview_html = df[:n].to_html()
    return display(HTML(preview_html))

# Machine-specific absolute path to the local copy of historical_transactions.csv.
transactions_path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\historical_transactions.csv"
# Load the full file into a DataFrame.
transactions = pd.read_csv(transactions_path)
# Preview the first rows (show_table defaults to n = 5).
show_table(transactions)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [25]:
len(transactions)

29112361

In [45]:
# NOTE: `new_merchants` is loaded in the new_merchant_transactions.csv section
# further down in this file; that cell must have been run before this one.
new_merchants[new_merchants.merchant_id == "M_ID_f5d4ce2ee6"]

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
140825,Y,C_ID_afa3d8db6c,47,N,1,B,307,M_ID_f5d4ce2ee6,1,-0.596643,2018-03-06 08:20:25,1.0,16,19
1480443,Y,C_ID_dfd2ce0703,47,N,0,A,307,M_ID_f5d4ce2ee6,1,-0.731881,2017-11-15 05:43:54,1.0,16,19
1937190,Y,C_ID_10214008c5,47,N,1,B,307,M_ID_f5d4ce2ee6,1,-0.671775,2017-09-08 17:01:54,1.0,16,19


In [46]:
merchants[merchants.merchant_id == "M_ID_f5d4ce2ee6"]

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
242458,M_ID_f5d4ce2ee6,77684,307,19,-0.047556,-0.047556,N,D,D,1.41,...,1.72,2.152199,6,1.82,2.315104,12,N,47,16,1.0


In [39]:
# Merchant ids present both in `merchants` and in `new_merchants`.
set(merchants.merchant_id) & set(new_merchants.merchant_id)

{'M_ID_de0f52a017',
 'M_ID_e163d2c662',
 'M_ID_6095eced09',
 'M_ID_f5d4ce2ee6',
 'M_ID_ce4ea19db1',
 'M_ID_57c99b132f',
 'M_ID_4635109c52',
 'M_ID_0f4b195d3d',
 'M_ID_beede9ea7d',
 'M_ID_1dbb95ce04',
 'M_ID_df9d0179b8',
 'M_ID_c410ad7276',
 'M_ID_97326753e0',
 'M_ID_d51a2bba1a',
 'M_ID_172e373a95',
 'M_ID_ee417017a3',
 'M_ID_e56dc6a0a5',
 'M_ID_6748654747',
 'M_ID_1a17ebbd14',
 'M_ID_48b0bf3cce',
 'M_ID_a4c1ec9cf4',
 'M_ID_cfd2f0ad3a',
 'M_ID_f72b7f325d',
 'M_ID_da5ef19645',
 'M_ID_bf6b377ad5',
 'M_ID_ee3beb005f',
 'M_ID_40df701416',
 'M_ID_8b7ccce657',
 'M_ID_28cbe6a15b',
 'M_ID_77bed497f4',
 'M_ID_4173fa1bd0',
 'M_ID_d82c61f09b',
 'M_ID_898b33285e',
 'M_ID_d5c46422f2',
 'M_ID_b86cca88cb',
 'M_ID_aefed642b7',
 'M_ID_fc3b5454ce',
 'M_ID_5a5d6cda67',
 'M_ID_6347c5cd5f',
 'M_ID_0aaefb3089',
 'M_ID_0f37bd97bf',
 'M_ID_afadb3897c',
 'M_ID_11fbe98450',
 'M_ID_f66b936f6b',
 'M_ID_84f1ff55b9',
 'M_ID_f0508775cb',
 'M_ID_0ad12e759f',
 'M_ID_7b2b8300e0',
 'M_ID_f5210169a7',
 'M_ID_5d2740f1d7',


# Dataset - new_merchant_transactions.csv:
- new_merchant_transactions has exactly the same columns as historical_transactions


In [2]:
import pandas as pd
from IPython.display import display, HTML
def show_table(df, n = 5):
    """Render the first ``n`` rows of ``df`` as an HTML table in the notebook."""
    # Slice first, then convert only the preview rows to HTML.
    preview_html = df[:n].to_html()
    return display(HTML(preview_html))

# Machine-specific absolute path to the local copy of new_merchant_transactions.csv.
new_merchants_path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\new_merchant_transactions.csv"
# Load the full file into a DataFrame.
new_merchants = pd.read_csv(new_merchants_path)
# Preview the first rows (show_table defaults to n = 5).
show_table(new_merchants)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [23]:
set(transactions.columns) == set(new_merchants.columns)

True

In [16]:
new_merchants.isna().sum()

authorized_flag              0
card_id                      0
city_id                      0
category_1                   0
installments                 0
category_3               55922
merchant_category_id         0
merchant_id              26216
month_lag                    0
purchase_amount              0
purchase_date                0
category_2              111745
state_id                     0
subsector_id                 0
dtype: int64

# Plan:
- first batch/benchmark dataset:
    - concat historical and new transactions with an indicator column
    - merge transactions with merchants on merchant_id
    - remove all id columns
    - deal with NAs and duplicates
    - categorical encoding
    - numeric standardization
    - groupby card_id then aggregate
    - merge with train.csv on card_id
    

- subsequent batches:
    - next, aggregated features (all transactional data merged, but differentiated between new and old)
    - next, id feature encodings, e.g. embeddings
    - next, date feature encodings, e.g. cyclic features

In [2]:
%%timeit
"hi!" in ("Hello","Supp","HI","hi!")

62.6 ns ± 4.33 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [3]:
%%timeit
"hi!" in ["Hello","Supp","HI","hi!"]

67.2 ns ± 17.3 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [6]:
# Set holding the same four strings as the tuple and list timed above.
a_set = {"Hello", "Supp", "HI", "hi!"}

In [7]:
%%timeit
"hi!" in a_set

51.1 ns ± 2.52 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
