# ABOUT: 
- Previous experiments showed that the generated id features were effective, so this notebook generates all of them and saves them for the train and test sets.

In [1]:
from config import *
import pandas as pd

### generate all new id features

In [2]:
# Identifier columns for which per-card features are generated.
id_cols = [
    'merchant_id',
    'merchant_group_id',
    'merchant_category_id',
    'subsector_id',
    'city_id',
    'state_id',
]

In [3]:
# Read the pre-processed id columns, keeping only card_id plus the columns listed in id_cols.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs where that
# exact directory exists — consider deriving it from a configurable data directory.
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\id_columns_processed.csv"
id_columns = pd.read_csv(path, usecols = ["card_id"]+id_cols)
# Preview the first rows of the loaded frame.
id_columns.head()

Unnamed: 0,card_id,city_id,merchant_category_id,merchant_id,state_id,subsector_id,merchant_group_id
0,C_ID_4e6213e9bc,city_id_88,merchant_category_id_80,M_ID_e020e9b302,state_id_16,subsector_id_37,merchant_group_id_35
1,C_ID_4e6213e9bc,city_id_88,merchant_category_id_367,M_ID_86ec983688,state_id_16,subsector_id_16,merchant_group_id_2084
2,C_ID_4e6213e9bc,city_id_88,merchant_category_id_80,M_ID_979ed661fc,state_id_16,subsector_id_37,merchant_group_id_27369
3,C_ID_4e6213e9bc,city_id_88,merchant_category_id_560,M_ID_e6d5ae8ea6,state_id_16,subsector_id_34,merchant_group_id_24104
4,C_ID_4e6213e9bc,city_id_88,merchant_category_id_80,M_ID_e020e9b302,state_id_16,subsector_id_37,merchant_group_id_35


In [13]:
# Load only the card_id key column of the train and test sets (no target or other columns are read).
# NOTE(review): train_path and test_path are not defined in this notebook — presumably they come
# from `from config import *`; confirm.
train = pd.read_csv(train_path, usecols = ["card_id"])
test = pd.read_csv(test_path, usecols = ["card_id"])
test

Unnamed: 0,card_id
0,C_ID_0ab67a22ab
1,C_ID_130fd0cbdd
2,C_ID_b709037bc5
3,C_ID_d27d835a9f
4,C_ID_2b5e3df5c2
...,...
123618,C_ID_7a239d2eda
123619,C_ID_75ace375ae
123620,C_ID_21d56d950c
123621,C_ID_6c46fc5a9d


In [14]:
from tqdm import tqdm

def generate_id_features(id_columns, target_id_column):
    """Build per-card summary features for one id column.

    Parameters
    ----------
    id_columns : pandas.DataFrame
        Frame containing a ``card_id`` column and the column named by
        ``target_id_column``. Any additional columns are ignored.
    target_id_column : str
        Name of the id column to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per ``card_id`` (sorted by ``card_id``) with the columns
        ``card_id``, ``nunique_<target>``, ``count_<target>`` and
        ``nunique_count_frac_<target>`` (= nunique / count).

    Notes
    -----
    ``nunique`` and ``count`` both skip missing values (pandas defaults), so a
    card whose target values are all missing gets 0 / 0 = NaN for the ratio.
    """
    # Aggregate only the requested column. Aggregating the whole frame would
    # produce one (nunique, count) pair per extra column and make the fixed
    # three-name column assignment below fail with a length mismatch.
    per_card = id_columns.groupby("card_id")[target_id_column].agg(["nunique", "count"])
    id_features = per_card.reset_index()
    id_features.columns = ["card_id", f"nunique_{target_id_column}", f"count_{target_id_column}"]
    # Share of distinct values among all recorded values for this card.
    id_features[f"nunique_count_frac_{target_id_column}"] = id_features[f"nunique_{target_id_column}"]/id_features[f"count_{target_id_column}"]
    return id_features

# Reset both frames to their key column so this cell is idempotent: merging
# onto frames that already carry the feature columns (e.g. on a re-run) would
# make pandas emit duplicated, _x/_y-suffixed columns instead of clean names.
train = train[["card_id"]]
test = test[["card_id"]]

for col in tqdm(id_cols):
    # Per-card nunique / count / nunique-to-count ratio for this id column.
    id_features = generate_id_features(id_columns[["card_id", col]], col)
    # Left joins keep every train/test card_id; cards with no rows in
    # id_columns get NaN for the new feature columns.
    train = train.merge(id_features, on = "card_id", how = "left")
    test = test.merge(id_features, on = "card_id", how = "left")

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [01:29<00:00, 14.94s/it]


In [15]:
train

Unnamed: 0,card_id,nunique_merchant_id,count_merchant_id,nunique_count_frac_merchant_id,nunique_merchant_group_id,count_merchant_group_id,nunique_count_frac_merchant_group_id,nunique_merchant_category_id,count_merchant_category_id,nunique_count_frac_merchant_category_id,nunique_subsector_id,count_subsector_id,nunique_count_frac_subsector_id,nunique_city_id,count_city_id,nunique_count_frac_city_id,nunique_state_id,count_state_id,nunique_count_frac_state_id
0,C_ID_92a2005557,117,283,0.413428,88,283,0.310954,46,283,0.162544,21,283,0.074205,9,283,0.031802,3,283,0.010601
1,C_ID_3d0044924f,148,356,0.415730,107,356,0.300562,58,356,0.162921,24,356,0.067416,9,356,0.025281,3,356,0.008427
2,C_ID_d639edf6cd,14,44,0.318182,10,44,0.227273,9,44,0.204545,8,44,0.181818,5,44,0.113636,2,44,0.045455
3,C_ID_186d6a6901,57,84,0.678571,43,84,0.511905,28,84,0.333333,15,84,0.178571,7,84,0.083333,5,84,0.059524
4,C_ID_cdbd2c0db2,102,169,0.603550,80,169,0.473373,37,169,0.218935,19,169,0.112426,7,169,0.041420,7,169,0.041420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,16,47,0.340426,13,47,0.276596,11,47,0.234043,10,47,0.212766,5,47,0.106383,2,47,0.042553
201913,C_ID_1314773c0b,29,48,0.604167,21,48,0.437500,19,48,0.395833,11,48,0.229167,3,48,0.062500,1,48,0.020833
201914,C_ID_7666735b3d,55,90,0.611111,41,90,0.455556,26,90,0.288889,18,90,0.200000,10,90,0.111111,5,90,0.055556
201915,C_ID_73f5a0efd0,24,31,0.774194,18,31,0.580645,14,31,0.451613,10,31,0.322581,2,31,0.064516,1,31,0.032258


In [16]:
test

Unnamed: 0,card_id,nunique_merchant_id,count_merchant_id,nunique_count_frac_merchant_id,nunique_merchant_group_id,count_merchant_group_id,nunique_count_frac_merchant_group_id,nunique_merchant_category_id,count_merchant_category_id,nunique_count_frac_merchant_category_id,nunique_subsector_id,count_subsector_id,nunique_count_frac_subsector_id,nunique_city_id,count_city_id,nunique_count_frac_city_id,nunique_state_id,count_state_id,nunique_count_frac_state_id
0,C_ID_0ab67a22ab,27,71,0.380282,15,71,0.211268,17,71,0.239437,13,71,0.183099,7,71,0.098592,3,71,0.042254
1,C_ID_130fd0cbdd,36,87,0.413793,26,87,0.298851,22,87,0.252874,15,87,0.172414,4,87,0.045977,3,87,0.034483
2,C_ID_b709037bc5,11,15,0.733333,10,15,0.666667,9,15,0.600000,7,15,0.466667,4,15,0.266667,5,15,0.333333
3,C_ID_d27d835a9f,33,36,0.916667,27,36,0.750000,25,36,0.694444,14,36,0.388889,3,36,0.083333,3,36,0.083333
4,C_ID_2b5e3df5c2,53,116,0.456897,38,116,0.327586,33,116,0.284483,15,116,0.129310,5,116,0.043103,4,116,0.034483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123618,C_ID_7a239d2eda,32,73,0.438356,29,73,0.397260,18,73,0.246575,12,73,0.164384,2,73,0.027397,2,73,0.027397
123619,C_ID_75ace375ae,8,11,0.727273,6,11,0.545455,5,11,0.454545,5,11,0.454545,4,11,0.363636,2,11,0.181818
123620,C_ID_21d56d950c,16,37,0.432432,14,37,0.378378,11,37,0.297297,9,37,0.243243,3,37,0.081081,3,37,0.081081
123621,C_ID_6c46fc5a9d,25,68,0.367647,14,68,0.205882,18,68,0.264706,12,68,0.176471,5,68,0.073529,3,68,0.044118


In [17]:
# Persist the train and test feature frames as pickle files.
# NOTE(review): both paths are absolute and machine-specific, and both contain a
# "train_df.pkl" component used as a directory (also for the test file) — confirm this
# is the intended output location and not a mis-pasted path.
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\train_df.pkl\more_train_id_features.pkl"
train.to_pickle(path)
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\train_df.pkl\more_test_id_features.pkl"
test.to_pickle(path)