# Preprocessing data to graph structure

In [None]:
# Load Kedro's IPython extension (presumably what registers the `%reload_kedro`
# magic used in the next cell — TODO confirm).
%load_ext kedro.extras.extensions.ipython

In [None]:
# Run Kedro's reload magic; later cells read `context.catalog`, which is
# presumably injected into the notebook namespace here — TODO confirm.
%reload_kedro

In [None]:
from typing import Iterator, Tuple
import re

import pandas as pd
import numpy as np
from kedro.extras.datasets.pandas import CSVDataSet
from kedro.io.core import get_filepath_str

import gid_ml_framework.pipelines.santander_preprocessing.nodes
from gid_ml_framework.extras.datasets.chunks_dataset import (
 _concat_chunks,
)

# Take no action on chained assignment (pandas would warn by default).
pd.options.mode.chained_assignment = None

# Display settings for inspecting wide frames in this notebook.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
# Set exactly once: with this option off, wide frames are not wrapped across
# multiple lines when printed.
pd.set_option("display.expand_frame_repr", False)

In [None]:
# Catalog entry names of the raw Santander inputs (passed to
# `context.catalog.load` in the next cell).
customers_path, articles_path = "santander_customers", "santander_articles"
transactions_train_path, transactions_val_path = (
    "santander_transactions_train",
    "santander_transactions_val",
)

In [None]:
# Customers and articles are used exactly as the catalog returns them.
customers, articles = (
    context.catalog.load(entry) for entry in (customers_path, articles_path)
)
# Transaction entries are passed through `_concat_chunks` after loading
# (presumably they arrive as chunks that need combining — TODO confirm).
transactions_train, transactions_val = (
    _concat_chunks(context.catalog.load(entry))
    for entry in (transactions_train_path, transactions_val_path)
)

In [None]:
# Dimensions of the customers table.
customers.shape

In [None]:
# Dimensions of the articles table.
articles.shape

In [None]:
# Dimensions of the concatenated training transactions.
transactions_train.shape

In [None]:
# Dimensions of the concatenated validation transactions.
transactions_val.shape

In [None]:
def _create_mapping(df: pd.DataFrame, map_column: str) -> dict:
    """Create a mapping from the values of a column to consecutive integers.

    Args:
        df: Frame containing the column to encode.
        map_column: Name of the column whose values become the mapping keys.

    Returns:
        Dictionary assigning each distinct value of ``map_column`` its
        zero-based position in ascending sort order.
    """
    # Deduplicate before enumerating: with repeated values the later index
    # would overwrite the earlier one and leave gaps in the integer range.
    ids = df.loc[:, map_column].drop_duplicates().sort_values()
    return {value: index for index, value in enumerate(ids)}

In [None]:
# Integer codes for every customer id and every article id.
users_mapping = _create_mapping(customers, "customer_id")
items_mapping = _create_mapping(articles, "article_id")


In [None]:
# Sanity check: every customer id seen in the training transactions has a mapping key.
set(transactions_train["customer_id"]).issubset(users_mapping)

In [None]:
# Encode both id columns in place with their integer codes. `replace` leaves
# values that have no entry in the corresponding mapping unchanged.
id_encodings = {"customer_id": users_mapping, "article_id": items_mapping}
transactions_train.replace(to_replace=id_encodings, inplace=True)

In [None]:
set(transactions_train.customer_id).issubset(set(users_mapping.values()))

In [None]:
# Display the customers table.
customers

In [None]:
# Show the training transactions ordered by customer id (returns a sorted
# copy; `transactions_train` itself is not reordered).
transactions_train.sort_values(by="customer_id")

In [None]:
# Show the customers row(s) whose customer_id equals 92343.
customers[customers["customer_id"] == 92343]

In [None]:
# Number of rows in the customers table.
len(customers)

In [None]:
# Number of distinct customers in the training transactions. The cell used to
# reference `transactions`, a name this notebook never defines.
len(transactions_train.customer_id.unique())

# Preprocessed data check

## Santander and H&M

In [None]:
# Dataset prefix used to build the catalog entry names in the next cell.
dataset = "santander"

In [None]:
# Catalog entry names of the preprocessed artifacts for the selected dataset.
# NOTE(review): the graph entry joins the prefix with "." while the other
# three use "_" — confirm this matches the catalog.
transactions_graph_path, transactions_mapped_path = (
    f"{dataset}.transactions_graph",
    f"{dataset}_transactions_mapped",
)
users_mapping_path, items_mapping_path = (
    f"{dataset}_users_mapping",
    f"{dataset}_items_mapping",
)

In [None]:
# Transaction entries are passed through `_concat_chunks` after loading.
transactions_graph, transactions_mapped = (
    _concat_chunks(context.catalog.load(entry))
    for entry in (transactions_graph_path, transactions_mapped_path)
)
# The id mappings are used exactly as the catalog returns them.
users_mapping, items_mapping = (
    context.catalog.load(entry)
    for entry in (users_mapping_path, items_mapping_path)
)

In [None]:
# Display the loaded graph transactions.
transactions_graph

In [None]:
# Display the loaded mapped transactions.
transactions_mapped

In [None]:
# Display the loaded items mapping.
items_mapping

In [None]:
# Inner value_counts: rows per user_id. Outer value_counts: how many users
# share each of those row counts.
rows_per_user = transactions_mapped['user_id'].value_counts()
rows_per_user.value_counts()