# Preprocessing data to graph structure

In [2]:
%load_ext kedro.extras.extensions.ipython

The kedro.extras.extensions.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.extras.extensions.ipython


In [3]:
%reload_kedro

In [4]:
from typing import Iterator, Tuple
import re

import pandas as pd
import numpy as np
from kedro.extras.datasets.pandas import CSVDataSet
from kedro.io.core import get_filepath_str

import gid_ml_framework.pipelines.santander_preprocessing.nodes
from gid_ml_framework.extras.datasets.chunks_dataset import (
 _concat_chunks,
)

pd.options.mode.chained_assignment = None
pd.set_option('expand_frame_repr', True)
pd.set_option("display.max_rows", 999)
pd.set_option('max_colwidth', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)

In [4]:
customers_path = "santander_customers"
articles_path = "santander_articles"
transactions_train_path = "santander_transactions_train"
transactions_val_path = "santander_transactions_val"

In [159]:
customers = context.catalog.load(customers_path)
articles = context.catalog.load(articles_path)
transactions_train = _concat_chunks(context.catalog.load(transactions_train_path))
transactions_val = _concat_chunks(context.catalog.load(transactions_val_path))

2022-08-31 10:38:53,305 - kedro.io.data_catalog - INFO - Loading data from `santander_customers` (CSVDataSet)...


No files found in ['/home/michal/projects/gid-ml-framework/conf/base', '/home/michal/projects/gid-ml-framework/conf/local'] matching the glob pattern(s): ['credentials*', 'credentials*/**', '**/credentials*']
  warn(f"Credentials not found in your Kedro project config.\n{str(exc)}")


2022-08-31 10:38:55,303 - kedro.io.data_catalog - INFO - Loading data from `santander_articles` (CSVDataSet)...
2022-08-31 10:38:55,345 - kedro.io.data_catalog - INFO - Loading data from `santander_transactions_train` (CSVDataSet)...
2022-08-31 10:38:55,467 - kedro.io.data_catalog - INFO - Loading data from `santander_transactions_val` (CSVDataSet)...


No files found in ['/home/michal/projects/gid-ml-framework/conf/base', '/home/michal/projects/gid-ml-framework/conf/local'] matching the glob pattern(s): ['credentials*', 'credentials*/**', '**/credentials*']
  warn(f"Credentials not found in your Kedro project config.\n{str(exc)}")
No files found in ['/home/michal/projects/gid-ml-framework/conf/base', '/home/michal/projects/gid-ml-framework/conf/local'] matching the glob pattern(s): ['credentials*', 'credentials*/**', '**/credentials*']
  warn(f"Credentials not found in your Kedro project config.\n{str(exc)}")
No files found in ['/home/michal/projects/gid-ml-framework/conf/base', '/home/michal/projects/gid-ml-framework/conf/local'] matching the glob pattern(s): ['credentials*', 'credentials*/**', '**/credentials*']
  warn(f"Credentials not found in your Kedro project config.\n{str(exc)}")


In [135]:
customers.shape

(93146, 45)

In [111]:
articles.shape

(24, 1)

In [113]:
transactions_train.shape

(52432, 3)

In [115]:
transactions_val.shape

(3573, 3)

In [160]:
def _create_mapping(df: pd.DataFrame, map_column: str):
    """Creates mapping into consecutive integers for given column."""
    ids = df.loc[:, map_column].sort_values().reset_index(drop=True)
    mapping = {v: k for k, v in enumerate(ids)}
    return mapping

In [161]:
users_mapping = _create_mapping(customers, map_column="customer_id")
items_mapping = _create_mapping(articles, map_column="article_id")


In [151]:
set(transactions_train.customer_id).issubset(set(users_mapping.keys()))

True

In [162]:
transactions_train.replace({"customer_id": users_mapping, "article_id": items_mapping}, inplace=True)

In [153]:
set(transactions_train.customer_id).issubset(set(users_mapping.values()))

True

In [141]:
customers

Unnamed: 0,customer_id,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,ult_fec_cli_1t,indrel_1mes,tiprel_1mes,indresi,indext,conyuemp,canal_entrada,indfall,nomprov,ind_actividad_cliente,renta,segmento,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
0,15911,F,ES,V,53,1995-01-16,0,256,1,UNKNOWN,1,A,S,N,N,KAT,N,MADRID,1,191298.656250,01 - TOP,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,1
1,15913,F,ES,V,48,1995-01-16,0,256,1,UNKNOWN,1,A,S,N,N,KAT,N,MADRID,1,163073.187500,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1
2,15926,F,ES,V,56,1995-01-16,0,254,1,UNKNOWN,1,A,S,N,N,KAT,N,MADRID,1,128376.242188,02 - PARTICULARES,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
3,15950,N,ES,V,61,1995-01-16,0,256,1,UNKNOWN,1,A,S,N,N,KAT,N,MADRID,1,141979.265625,02 - PARTICULARES,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1
4,15957,F,ES,V,54,1995-01-16,0,256,1,UNKNOWN,1,A,S,N,N,KAT,N,MADRID,1,155722.015625,01 - TOP,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,1,0,1,1,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93141,1553596,N,ES,H,47,2016-05-31,1,0,1,UNKNOWN,P,A,S,N,UNKNOWN,UNKNOWN,N,MALAGA,0,95195.492188,UNKNOWN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
93142,1553604,N,ES,H,66,2016-05-31,1,0,1,UNKNOWN,P,A,S,S,UNKNOWN,UNKNOWN,N,MALAGA,0,95195.492188,UNKNOWN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
93143,1553609,N,ES,V,43,2016-05-31,1,0,1,UNKNOWN,P,A,S,N,UNKNOWN,UNKNOWN,N,ASTURIAS,0,86066.429688,UNKNOWN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
93144,1553627,N,ES,V,31,2016-05-31,1,0,1,UNKNOWN,P,A,S,N,UNKNOWN,UNKNOWN,N,MADRID,0,138027.750000,UNKNOWN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [163]:
transactions_train.sort_values("customer_id")

Unnamed: 0,customer_id,date,article_id
11689,0,2015-04-28,11
11031,1,2015-11-28,7
14521,1,2016-01-28,12
23396,1,2016-01-28,22
4543,1,2015-12-28,2
...,...,...,...
10738,92341,2016-04-28,6
51518,92342,2016-04-28,20
6245,92342,2016-04-28,2
10739,92343,2016-04-28,6


In [140]:
customers.loc[customers.customer_id==92343, :]

Unnamed: 0,customer_id,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,ult_fec_cli_1t,indrel_1mes,tiprel_1mes,indresi,indext,conyuemp,canal_entrada,indfall,nomprov,ind_actividad_cliente,renta,segmento,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1


In [90]:
customers.shape[0]

92868

In [88]:
len(transactions.customer_id.unique())

19330

# Preprocessed data check

## Santander and H&M

In [22]:
dataset = "santander"

In [23]:
transactions_graph_path = f"{dataset}.transactions_graph"
transactions_mapped_path = f"{dataset}_transactions_mapped"
users_mapping_path = f"{dataset}_users_mapping"
items_mapping_path = f"{dataset}_items_mapping"

In [24]:
transactions_graph = _concat_chunks(context.catalog.load(transactions_graph_path))
transactions_mapped = _concat_chunks(context.catalog.load(transactions_mapped_path))
users_mapping = context.catalog.load(users_mapping_path)
items_mapping = context.catalog.load(items_mapping_path)

In [198]:
transactions_graph

Unnamed: 0,user_id,item_id,price,sales_channel_id,time
0,009cf4351b5d11682613480dc20970b91087cb0d383f70bbfd9e91244bf3cf20,0754267007,0.006763,2,1585699200
1,013b81152e2b5222a6c1c1395ab2dae691c89bb291db70da947bc0abd425f8df,0815026003,0.111847,2,1585699200
2,02001d047023157327d7a41fbcb9227dfe589ddccfab3c38942b77d964281c0a,0795790001,0.033881,2,1585699200
3,02001d047023157327d7a41fbcb9227dfe589ddccfab3c38942b77d964281c0a,0756347001,0.030492,2,1585699200
4,027d667a45acdf8f9007c8f915bb28a2952e65770f13695629a93deb33c42c5f,0852775002,0.033881,2,1585699200
...,...,...,...,...,...
157746,fdfd55a82781e51b05301b269fc91dfab6cf048f0e29e5d58c388156b14458af,0915526001,0.033881,2,1600732800
157747,ff09354db173e36e7148bd2da4da7890eaa95b00556014d9b12ffbc5980dd902,0881942001,0.033881,1,1600732800
157748,ff411f623177338bde6299009234da653cd417a6986780b4d7e72ffa2df956db,0903096002,0.042356,2,1600732800
157749,ff411f623177338bde6299009234da653cd417a6986780b4d7e72ffa2df956db,0923569007,0.059305,2,1600732800


In [199]:
transactions_mapped

Unnamed: 0,user_id,item_id,time
0,309,13585,1585699200
1,636,16774,1585699200
2,1050,15697,1585699200
3,1050,13722,1585699200
4,1325,18414,1585699200
...,...,...,...
157746,136119,20720,1600732800
157747,136695,19816,1600732800
157748,136808,20427,1600732800
157749,136808,20856,1600732800


In [None]:
items_mapping

In [25]:
transactions_mapped.loc[:, 'user_id'].value_counts().value_counts()