In [2]:
from config import *
import pandas as pd
# import networkx as nx
# from numpy import std
from tqdm import tqdm
import numpy as np

### read data

In [2]:
# Columns read from the merchants file: the merchant_id join key plus the
# merchant attributes used in this notebook.
merch_cols = [
    'merchant_id',
    'numerical_2',
    'most_recent_sales_range',
    'most_recent_purchases_range',
    'avg_sales_lag3',
    'avg_purchases_lag3',
    'active_months_lag3',
    'avg_sales_lag6',
    'avg_purchases_lag6',
    'active_months_lag6',
    'avg_sales_lag12',
    'avg_purchases_lag12',
    'active_months_lag12',
    'category_4',
]

In [3]:
# Card ids of the train and test sets (only the card_id column is loaded).
users_train = pd.read_csv(train_path, usecols=["card_id"])
users_test = pd.read_csv(test_path, usecols=["card_id"])
# NOTE(review): disabled; as written it concatenates users_train with itself
# rather than with users_test.
# users = pd.concat([users_train, users_train], axis = 0).card_id     # all user_ids

# Merchant attributes, keeping the first row encountered for each merchant_id.
merchants = pd.read_csv(merchants_path, usecols=merch_cols).drop_duplicates(
    subset="merchant_id"
)

# (card_id, merchant_id) pairs from the historical and the new transaction files.
hist_trans = pd.read_csv(
    historical_transactions_path, usecols=["card_id", "merchant_id"]
)
new_trans = pd.read_csv(
    new_transactions_path, usecols=["card_id", "merchant_id"]
)
# trans = pd.concat([hist_trans, new_trans], axis = 0)                  # all_transactions
# data = merchants.merge(trans, how = "inner", on = "merchant_id")    # all merchants

In [4]:
# Split the loaded merchant columns into categorical and numeric names using
# the lists declared under feature_names['merchants'] (from config).
# (original note on the categorical list: "count, mode,")
merchant_feature_names = feature_names['merchants']
cat_features = [
    col for col in merchants.columns
    if col in merchant_feature_names['categoric']
]
numeric_features = [
    col for col in merchants.columns
    if col in merchant_feature_names['numeric']
]

# Engineer Categorical Features

In [34]:
def encode_cat_features(df):
    """Return a copy of ``df`` with the categorical merchant columns integer-coded.

    ``most_recent_sales_range`` and ``most_recent_purchases_range`` are mapped
    'A'..'E' -> 0..4 and ``category_4`` is mapped 'N' -> 0, 'Y' -> 1.
    Values outside a mapping become NaN (``Series.map`` behaviour).
    The input frame is not modified.
    """
    range_codes = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
    flag_codes = {"N": 0, "Y": 1}
    # column name -> mapping applied to that column
    codes_by_column = {
        "most_recent_sales_range": range_codes,
        "most_recent_purchases_range": range_codes,
        "category_4": flag_codes,
    }
    encoded = df.copy()
    for column, codes in codes_by_column.items():
        encoded[column] = encoded[column].map(codes)
    return encoded

In [90]:
def generate_cat_features(gb, cat_feature):
    """Per-group counts and fractions of each value of ``cat_feature``.

    Parameters
    ----------
    gb : DataFrameGroupBy
        Grouped frame; one output row is produced per group.
    cat_feature : str
        Column whose values are counted within each group.

    Returns
    -------
    (count, frac) : tuple of DataFrame
        ``count`` holds the number of rows per (group, value), with columns
        named ``<feature>_<value>_count``; ``frac`` holds each count divided
        by its row total, with columns named ``<feature>_<value>_frac``.
        Values that never occur in a group are NaN in both frames.
    """
    level_counts = gb[cat_feature].value_counts().unstack()
    # Column vector of row totals so the division broadcasts across columns.
    row_totals = level_counts.sum(axis = 1).values.reshape(-1, 1)
    level_fracs = level_counts / row_totals

    count_stem = level_counts.columns.name + '_'
    frac_stem = level_fracs.columns.name + '_'
    level_counts.columns = count_stem + level_counts.columns.astype('str') + '_count'
    level_fracs.columns = frac_stem + level_fracs.columns.astype('str') + '_frac'
    return level_counts, level_fracs

### Compute merchant categorical features (historical transactions, `old_` prefix)

In [96]:
# groupby
# Attach the integer-coded merchant attributes to every historical transaction
# (left join keeps transactions whose merchant is not in `merchants`), then
# group the rows by card.
tmp = hist_trans.merge(
    encode_cat_features(merchants),
    how="left",
    on="merchant_id",
)
gb = tmp.groupby("card_id")

In [98]:
# compute
# Build the fraction / count columns for each encoded categorical attribute and
# join them side by side, aligned on the group (card_id) index.
# The pieces are collected first and concatenated once: this avoids seeding the
# result with an empty DataFrame and re-copying the growing frame on every step.
feature_parts = []
for cat_feature in ("category_4",
                    "most_recent_sales_range",
                    "most_recent_purchases_range"):
    count, frac = generate_cat_features(gb, cat_feature)
    # Same column order as before: fractions first, then counts.
    feature_parts.extend([frac, count])
features = pd.concat(feature_parts, axis = 1)

In [107]:
# save
# Tag every feature column with "old_" (historical transactions), turn the
# group index back into a regular column, and pickle the table.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
features = features.add_prefix("old_").reset_index()
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\merchant_cat_features_old.pkl"
features.to_pickle(path)

### Compute merchant categorical features (new transactions, `new_` prefix)

In [108]:
# groupby
# Attach the integer-coded merchant attributes to every new transaction
# (left join keeps transactions whose merchant is not in `merchants`), then
# group the rows by card.
tmp = new_trans.merge(
    encode_cat_features(merchants),
    how="left",
    on="merchant_id",
)
gb = tmp.groupby("card_id")

In [109]:
# compute
# Build the fraction / count columns for each encoded categorical attribute and
# join them side by side, aligned on the group (card_id) index.
# The pieces are collected first and concatenated once: this avoids seeding the
# result with an empty DataFrame and re-copying the growing frame on every step.
feature_parts = []
for cat_feature in ("category_4",
                    "most_recent_sales_range",
                    "most_recent_purchases_range"):
    count, frac = generate_cat_features(gb, cat_feature)
    # Same column order as before: fractions first, then counts.
    feature_parts.extend([frac, count])
features = pd.concat(feature_parts, axis = 1)

In [113]:
# save
# Tag every feature column with "new_" (new transactions), turn the group
# index back into a regular column, and pickle the table.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
features = features.add_prefix("new_").reset_index()
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\merchant_cat_features_new.pkl"
features.to_pickle(path)

# Engineer Numeric Features

In [20]:
# fill NAs with median
# Only columns listed in numeric_features that actually contain missing values
# are touched; each is filled with its own median. `merchants` is updated in place.
missing_per_column = merchants.isna().sum()
for column, n_missing in missing_per_column.items():
    if n_missing > 0 and column in numeric_features:
        column_median = merchants[column].median()
        merchants[column] = merchants[column].fillna(column_median)

### Compute merchant numeric features (historical transactions, `old_` prefix)

In [40]:
# merge
tmp = hist_trans.merge(merchants, how = "left", on = "merchant_id")
# groupby and aggregate numeric features
# Aggregate only numeric columns. The merged frame also carries non-numeric
# columns (e.g. the letter-coded range / category_4 columns), for which
# mean / var / skew are undefined. Older pandas dropped such columns from a
# list aggregation with a warning; newer versions raise, so select explicitly.
numeric_cols = [
    col for col in tmp.select_dtypes(include = "number").columns
    if col != "card_id"
]
gb = tmp.groupby('card_id')[numeric_cols].agg(['mean','min','max','var','skew'])
# rename cols
# Flatten the (column, aggregation) MultiIndex into "<column>_<aggregation>".
gb.columns = [f"{col}_{agg_function}" for col, agg_function in gb.columns]

  gb = tmp.groupby('card_id').agg(['mean','min','max','var','skew'])


In [43]:
# save
# Prefix the aggregate columns with "old_" (historical transactions) and move
# card_id from the index into a regular column. card_id must be kept: the
# saved table is later joined onto train / test with on="card_id"
# (the previous reset_index(drop=True) discarded it).
# The result gets its own name so `gb` is left untouched and this cell can be
# re-run without stacking prefixes.
merchant_numeric_old = gb.add_prefix("old_").reset_index()
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\merchant_numeric_features_old.pkl"
merchant_numeric_old.to_pickle(path)

### Compute merchant numeric features (new transactions, `new_` prefix)

In [58]:
# merge
tmp = new_trans.merge(merchants, how = "left", on = "merchant_id")
# groupby and aggregate numeric features
# Aggregate only numeric columns. The merged frame also carries non-numeric
# columns (e.g. the letter-coded range / category_4 columns), for which
# mean / var / skew are undefined. Older pandas dropped such columns from a
# list aggregation with a warning; newer versions raise, so select explicitly.
numeric_cols = [
    col for col in tmp.select_dtypes(include = "number").columns
    if col != "card_id"
]
gb = tmp.groupby('card_id')[numeric_cols].agg(['mean','min','max','var','skew'])
# rename cols
# Flatten the (column, aggregation) MultiIndex into "<column>_<aggregation>".
gb.columns = [f"{col}_{agg_function}" for col, agg_function in gb.columns]

  gb = tmp.groupby('card_id').agg(['mean','min','max','var','skew'])


In [63]:
# save
# Prefix the aggregate columns with "new_" (new transactions), matching the
# "old_" / "new_" prefixes used by the other saved feature tables, and move
# card_id from the index into a regular column for the later join.
# The result gets its own name so `gb` is left untouched and this cell can be
# re-run without stacking prefixes (the prefix line had been commented out).
merchant_numeric_new = gb.add_prefix("new_").reset_index()
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\merchant_numeric_features_new.pkl"
merchant_numeric_new.to_pickle(path)

# Merge and save


In [5]:
# Load the existing train / test feature frames that the merchant features
# will be joined onto.
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
train = pd.read_pickle(
    r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\full_train2.pkl"
)
test = pd.read_pickle(
    r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\full_test2.pkl"
)

In [8]:
# Pickled merchant feature tables, in the order they are joined onto
# train / test: categorical (old, new) then numeric (old, new).
merchant_feature_dir = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data"
paths = [
    merchant_feature_dir + "\\" + file_name
    for file_name in (
        "merchant_cat_features_old.pkl",
        "merchant_cat_features_new.pkl",
        "merchant_numeric_features_old.pkl",
        "merchant_numeric_features_new.pkl",
    )
]

In [25]:
# Left-join each saved merchant feature table onto the training frame by
# card_id; cards absent from a table get NaN for that table's columns.
new_train = train.copy()
for feature_path in paths:
    new_train = new_train.merge(
        pd.read_pickle(feature_path),
        how="left",
        on="card_id",
    )

In [28]:
# Left-join each saved merchant feature table onto the test frame by
# card_id; cards absent from a table get NaN for that table's columns.
new_test = test.copy()
for feature_path in paths:
    new_test = new_test.merge(
        pd.read_pickle(feature_path),
        how="left",
        on="card_id",
    )

In [33]:
# Persist the enriched train / test frames.
# NOTE(review): these are the same full_train2 / full_test2 files that this
# notebook loads as `train` / `test`, so the inputs are overwritten; running
# the merge section again would join the merchant features a second time.
# Consider writing to new file names — confirm what downstream notebooks read.
new_train.to_pickle(
    r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\full_train2.pkl"
)
new_test.to_pickle(
    r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4041 - Machine Learning\Team Project\data\full_test2.pkl"
)

In [None]:
# compute degree_centrality_subgroup, mean_degree_subgroup, min_degree_subgroup, max_degree_subgroup, std_degree_subgroup
def compute_subgraph_info(card_id, interest_col, counts, transactions=None):
    """Summarise the sizes of the ``interest_col`` groups a card is linked to.

    Parameters
    ----------
    card_id :
        Card whose rows are selected from the transaction frame.
    interest_col : str
        Column whose distinct values (for this card) are the "related nodes".
    counts : mapping or Series
        Size of each ``interest_col`` value, looked up as ``counts[node]``.
    transactions : DataFrame, optional
        Frame with a ``card_id`` column and ``interest_col``. When omitted,
        the notebook-level ``data`` frame is used, as before.

    Returns
    -------
    dict
        ``degree_centrality_subgroup`` (number of related nodes / sum of their
        sizes), ``mean_degree_subgroup`` (sum of sizes / number of related
        nodes), and the min, max and standard deviation (``np.std``) of the
        sizes.

    Raises
    ------
    ZeroDivisionError
        If the card has no rows in the frame (no related nodes).
    """
    if transactions is None:
        # Fall back to the global frame the original implementation read.
        transactions = data
    # Distinct interest_col values that appear on this card's rows.
    card_rows = transactions[transactions["card_id"] == card_id]
    related_nodes = card_rows[interest_col].unique()
    subgroup_counts = [counts[node] for node in related_nodes]
    total_nodes = sum(subgroup_counts)
    n_related = len(related_nodes)
    degree_centrality = n_related / total_nodes
    average_degrees = total_nodes / n_related
    return {"degree_centrality_subgroup": degree_centrality,
            "mean_degree_subgroup": average_degrees,
            "min_degree_subgroup": min(subgroup_counts),
            "max_degree_subgroup": max(subgroup_counts),
            # `std` was never imported (its numpy import is commented out);
            # np.std is the same function via the existing numpy import.
            "std_degree_subgroup": np.std(subgroup_counts)}


# def generate_id_subgraphs(interest_col):
#     # number of merchants for each "state_id"
#     counts = data[interest_col].value_counts()
#     # "state_id" subgraphs
#     G = {}
#     for node,count in counts.items(): 
#         G[node] = nx.Graph()
#         G[node].add_edges_from([(node,f"{node}_{i}") for i in range(count)])
#     return G
# def generate_user_subgraph(card_id, interest_col, id_subgraphs):
#     # "state_id"s related to the card_id
#     related_nodes = data[data.card_id == card_id][interest_col].unique()
#     # add "state_id"s related to the user
#     user_subgraph = nx.Graph()
#     user_subgraph.add_edges_from([(card_id, node) for node in related_nodes])
#     # merge with relevant id subgraphs
#     for node in related_nodes:
#         user_subgraph = nx.compose(user_subgraph, id_subgraphs[node])
#     return user_subgraph