In [35]:
import math
import spacy
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

# Note if you have train, test, validation data sets, scroll down to Client aggregations, and change load_data=True

# Load dataset

In [27]:
base_dir = "C:\\Users\\grigor.vardanyan\\Desktop\\msdata\\"

In [46]:
transactions = pd.read_csv(base + "transactions.csv")
mcc = pd.read_csv(base + "tr_mcc_codes.csv",sep="\t")
tr_types = pd.read_csv(base + "tr_types.csv")
clients = pd.read_csv(base + "gender_train.csv")

# Stats

In [50]:
counter = Counter(clients.gender.values)

In [51]:
counter

Counter({1: 3713, 0: 4687})

# Train test clients

In [52]:
clients_1 = clients.loc[clients.gender == 1]["customer_id"].values
clients_0 = clients.loc[clients.gender == 0]["customer_id"].values

In [53]:
client_1_treshold = int(len(clients_1) * 0.9)
clients_1_train =  clients_1[:client_1_treshold]
clients_1_test = clients_1[client_1_treshold:]

In [54]:
client_0_treshold = int(len(clients_0) * 0.9)
clients_0_train =  clients_0[:client_0_treshold]
clients_0_test = clients_0[client_0_treshold:]

In [55]:
clients_train = np.concatenate((clients_0_train, clients_1_train))

In [56]:
clients_test = np.concatenate((clients_0_test, clients_1_test))

# Data prepocessing

In [57]:
transactions = transactions.drop(["term_id"],axis = 1)

In [58]:
train_transactions = transactions.loc[transactions["customer_id"].isin(clients_train)]
test_transactions = transactions.loc[transactions["customer_id"].isin(clients_test)]

## Transactions of unknown clients

In [66]:
unknown_transactions = transactions.loc[~transactions["customer_id"].isin(clients["customer_id"])]

In [67]:
clients_unk = set(unknown_transactions["customer_id"])

In [68]:
len(clients_unk)

6600

## Feature - index pairs

In [182]:
mcc_unique = set(transactions["mcc_code"].values)
tr_type_unique = set(transactions["tr_type"].values)
union_features = mcc_unique.union(tr_type_unique).union(transactions["mcc_code"]).union(transactions["tr_type"])

In [4]:
agg_names = ["count","mean","std"]

In [183]:
feature_id = {"amount":0}
#Add mcc features
for feature_name in union_features:
    for agg_name in agg_names:
        key = str(feature_name) + "-" + "amount" + "-" + agg_name  
        feature_id[key] = len(feature_id)

In [None]:
with open("features_dict_gender.pkl", "wb+") as file:
    pickle.dump(feature_id, file)

## Client aggregations

In [None]:
load_data = False

In [186]:
if load_data:
    train_transactions = pd.read_csv("train_transactions.csv")
    val_transactions_df = pd.read_csv("validation_transactions.csv")
    
    with open("features_dict_gender.pkl", "rb") as file:
        feature_id = pickle.load(file)
else:
    #Join with clients dataframe train and test dataframes
    train_transactions = pd.merge(train_transactions, clients)
    test_transactions = pd.merge(test_transactions, clients)
    #Get unique clients
    train_clients_set = set(train_transactions["customer_id"].values)
    train_clients_list = list(train_clients_set)
    #Create validation
    treshold = int(len(train_clients) * 0.9)
    train_clients = train_clients_list[:treshold]
    valid_clients = train_clients_list[treshold:]
    #Split dataframe into train and validation
    train_transactions_df = train_transactions.loc[train_transactions["customer_id"].isin(train_clients)]
    val_transactions_df = train_transactions.loc[train_transactions["customer_id"].isin(valid_clients)]
    

In [37]:
#Function iterates over per client transactions(client_transactions is padas grouped dataframe) and count aggreagations
#to create sparse feature matrix
def create_sparse_matrix(client_transactions, feature_id):
    data = []
    rows_id = []
    columns_id = []
    labels = []
    
    row_id = 0
    
    for client_id, client_purchase in tqdm(client_transactions,total=len(client_transactions)):
        #Remove uneccesary columns
        mcc_df = client_purchase.drop(["customer_id","tr_datetime","tr_type","gender"],axis=1)
        tr_type_df = client_purchase.drop(["customer_id","tr_datetime","mcc_code","gender"],axis=1)
        #Aggregate based on two categorical features
        mcc_agg = mcc_df.groupby("mcc_code").agg(["count","mean","std"]).fillna(0)
        tr_type_agg = tr_type_df.groupby("tr_type").agg(["count","mean","std"]).fillna(0)
    
        for amount in client_purchase.amount.values:
            #Add amount 
            data.append(amount)   
            rows_id.append(row_id)
            columns_id.append(feature_id["amount"])
            labels.append(str(client_purchase.gender.values[0]))
        
            # Iterate ove mcc and add data
            for mcc_index in list(mcc_agg.index):
                mcc_row = mcc_agg.loc[mcc_index]
                for agg_name in agg_names:
                    amount_agg_val = mcc_row["amount"][agg_name]
                    key = str(mcc_index) + "-" + "amount" +  "-" + agg_name
                
                    #Add data
                    data.append(amount_agg_val)
                    rows_id.append(row_id)
                    columns_id.append(feature_id[key])
                
            #Iterate over transaction type and add data
            for tr_type_index in list(tr_type_agg.index):
                tr_type_row = tr_type_agg.loc[tr_type_index]
                for agg_name in agg_names:
                    amount_agg_val = tr_type_row["amount"][agg_name]
                    key = str(tr_type_index) + "-" + "amount" + "-" + agg_name
                
                    #Add data
                    data.append(amount_agg_val) 
                    rows_id.append(row_id)
                    columns_id.append(feature_id[key]) 
        
            row_id+=1
    return rows_id, columns_id, data, labels

In [39]:
validation_groups = val_transactions_df.groupby("customer_id")
train_groups = train_transactions_df.groupby("customer_id")

In [None]:
rows_id_train, columns_id_train, data_train, labels_train = create_sparse_matrix(train_groups,feature_id)

In [None]:
rows_id_val, columns_id_val, data_val, labels_val = create_sparse_matrix(validation_groups, feature_id)

 18%|█▊        | 134/756 [21:07<3:35:17, 20.77s/it]

In [None]:
rows_id_test, columns_id_test, data_test, labels_test = create_sparse_matrix(test_transactions, feature_id)

# Save data into txt file

In [43]:
def save_txt(data, file_name):
    with open(file_name,"w+") as f:
        iters = range(len(data) - 1)
        for it in tqdm(iters,total=len(iters)):
            writable = str(data[it]) + ","
            f.write(str(writable))
        f.write(str(data[-1]))

In [None]:
#Save Train data
save_txt(rows_id_train,"train_rows.txt")
save_txt(columns_id_train,"train_column.txt")
save_txt(data_train,"train_data.txt")
save_txt(labels_train,"train_labels.txt")

In [None]:
#Save valid data data
save_txt(rows_id_t_val,"valid_rows.txt")
save_txt(columns_id_val,"valid_column.txt")
save_txt(data_val,"valid_data.txt")
save_txt(labels_val,"valid_labels.txt")

In [None]:
#Save test data data
save_txt(rows_id_test,"test_rows.txt")
save_txt(columns_id_test,"test_column.txt")
save_txt(data_test,"test_data.txt")
save_txt(labels_test,"test_labels.txt")