# Santander data preprocessing

In [None]:
# Load Kedro's IPython extension; it provides the `%reload_kedro` line magic used in the next cell.
%load_ext kedro.extras.extensions.ipython

In [None]:
# (Re)initialise the Kedro project for this session. Later cells rely on a `context`
# variable, presumably injected by this magic (confirm for the Kedro version in use).
%reload_kedro

In [None]:
from typing import Iterator, Union, Tuple, List
from datetime import datetime

import pandas as pd
import numpy as np
import pprint
from kedro.extras.datasets.pandas import CSVDataSet
from kedro.io.core import get_filepath_str
from sklearn.model_selection import train_test_split

import recommender_gnn.pipelines.santander_preprocessing.nodes
from recommender_gnn.extras.datasets.chunks_dataset import (
 _load,
 _concat_chunks,
)
from recommender_gnn.pipelines.santander_preprocessing.nodes import(
    _stratify
)

# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None

# Display settings for wide, long exploratory outputs.
# Fully qualified option names are used because shorthand keys such as
# 'max_colwidth' are resolved by pattern matching and can become ambiguous.
# `display.expand_frame_repr` is set exactly once: the earlier `True` setting
# was immediately overridden by `False`, so only the effective value is kept.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.width", 1000)
pd.set_option("display.expand_frame_repr", False)

## Checking input dtypes and memory usage

In [None]:
# Kedro catalog entry names of the raw Santander train / test inputs.
input_train_path, input_test_path = (
    "santander_train_input",
    "santander_test_input",
)

In [None]:
# Load the raw training input from the Kedro catalog and merge its chunks.
# Assumes `_concat_chunks` returns a single pandas DataFrame (later cells call DataFrame methods on it).
train_input = _concat_chunks(context.catalog.load(input_train_path))

In [None]:
# Load the raw test input from the Kedro catalog and merge its chunks.
# Assumes `_concat_chunks` returns a single pandas DataFrame (later cells call DataFrame methods on it).
test_input = _concat_chunks(context.catalog.load(input_test_path))

In [None]:
# (rows, columns) of the training records whose `fecha_dato` equals '2015-11-28'.
train_input[train_input["fecha_dato"] == "2015-11-28"].shape

In [None]:
# Shape after keeping only the last row seen for each `ncodpers` value in the training input.
train_input.drop_duplicates(subset="ncodpers", keep="last").shape

In [None]:
# Shape after keeping only the last row seen for each `ncodpers` value in the test input.
test_input.drop_duplicates(subset="ncodpers", keep="last").shape

### Saving a testing fixture

In [None]:
# Draw a tiny (0.01 %) random subset of the raw training rows to use as a test fixture.
# A fixed seed makes the sample, and therefore the saved fixture, reproducible
# when the notebook is re-run.
train_shuffle = train_input.sample(frac=0.0001, random_state=42)

In [None]:
# Persist the sampled rows as a CSV fixture for the test suite.
# With `to_csv` defaults the DataFrame index is written as the first, unnamed column.
train_shuffle.to_csv("../src/tests/fixtures/csv/santander_test_sample.csv")

In [None]:
# Print a summary of the raw training frame: column dtypes, non-null counts and memory usage.
train_input.info()

In [None]:
# Per-column memory usage in bytes; deep=True also accounts for the contents of object columns.
train_input.memory_usage(deep=True)

In [None]:
# Count missing values per column of the raw training input.
# The bare `isnull()` call only displayed a full boolean frame; summing it gives
# the per-column totals, matching the check done on `cleaned_train` further down.
train_input.isnull().sum()

## Comparing cleaned train and test dataframes

In [None]:
# Kedro catalog entry names of the cleaned train / test datasets.
cleaned_train_path, cleaned_test_path = (
    "santander_preprocessing_train.santander_cleaned",
    "santander_preprocessing_test.santander_cleaned",
)

In [None]:
# Load the cleaned train and test datasets from the Kedro catalog and merge their chunks.
# Assumes `_concat_chunks` returns one pandas DataFrame per dataset (DataFrame methods are used on both below).
cleaned_train = _concat_chunks(context.catalog.load(cleaned_train_path))
cleaned_test = _concat_chunks(context.catalog.load(cleaned_test_path))

In [None]:
# Latest values per customer: order rows by customer and date, then take the
# last entry of every `ncodpers` group.
# Note: GroupBy.last() picks the last non-null value of each column, so a
# result row can mix values from different dates when columns contain NaNs.
lasts = (
    cleaned_train
    .sort_values(by=["ncodpers", "fecha_dato"])
    .groupby("ncodpers")
    .last()
)

In [None]:
# Distinct `fecha_dato` values found among the per-customer last records.
lasts["fecha_dato"].unique()

In [None]:
# Number of missing values in each column of the cleaned training frame.
cleaned_train.isna().sum()

In [None]:
# Sorted distinct values of the `renta` column in the cleaned training frame.
np.unique(cleaned_train["renta"])

In [None]:
# Sorted distinct values of the `renta` column in the cleaned test frame.
np.unique(cleaned_test["renta"])

In [None]:
# Work on a copy so the loaded `cleaned_train` frame is left untouched.
df = cleaned_train.copy()
# True where the `renta` value is a Python int/float instance; other objects (e.g. strings) give False.
mask = df['renta'].map(lambda x: isinstance(x, (int, float)))
# Keep the flagged values and replace every other entry with NaN.
df.loc[:, 'renta'] = df.loc[:, 'renta'].where(mask)
# Inspect the distinct values that remain after masking.
np.unique(df.loc[:, "renta"])

In [None]:
# NOTE: `df` is an alias of `cleaned_test` here (no `.copy()`, unlike the train cell above),
# so any in-place change made through `df` would also affect `cleaned_test`.
df = cleaned_test
# Coerce `renta` to a numeric dtype: unparseable values become NaN and the result is
# downcast to the smallest suitable float type. The converted Series is only displayed,
# it is not assigned back to `df`.
pd.to_numeric(df.loc[:, 'renta'], downcast="float", errors='coerce')

## Comparing final dataframes

In [None]:
# Kedro catalog entry names of the final train / validation / test datasets.
train_path, val_path, test_path = (
    "santander_train",
    "santander_val",
    "santander_test",
)

In [None]:
# Load the final train / validation / test datasets from the Kedro catalog and merge their chunks.
# Assumes `_concat_chunks` returns one pandas DataFrame per dataset (DataFrame methods are used on all three below).
train = _concat_chunks(context.catalog.load(train_path))
val = _concat_chunks(context.catalog.load(val_path))
test = _concat_chunks(context.catalog.load(test_path))

### Saving testing fixtures

In [None]:
# Down-sample the preprocessed splits (1 % of train, 10 % of val) to build larger test fixtures.
# Fixed seeds keep the samples, and therefore the saved fixtures, reproducible
# when the notebook is re-run.
train_sample = train.sample(frac=0.01, random_state=42)
print(train_sample.shape)
val_sample = val.sample(frac=0.1, random_state=42)
print(val_sample.shape)

In [None]:
# Persist the sampled train / validation rows as CSV fixtures for the test suite.
# With `to_csv` defaults the DataFrame index is written as the first, unnamed column.
train_sample.to_csv("../src/tests/fixtures/csv/santander_train_preprocessed_bigger_sample.csv")
val_sample.to_csv("../src/tests/fixtures/csv/santander_val_preprocessed_bigger_sample.csv")

In [None]:
# Preview the first rows of the final training frame.
train.head()

In [None]:
# Column dtypes of the final training frame.
train.dtypes

In [None]:
# Preview the first rows of the final validation frame.
val.head()

In [None]:
# Column dtypes of the final validation frame.
val.dtypes

In [None]:
# Preview the first rows of the final test frame.
test.head()

In [None]:
# Column dtypes of the final test frame.
test.dtypes

In [None]:
# Column-by-column check that every test column has the same dtype in the training frame.
test.dtypes == train[test.columns].dtypes

In [None]:
# Column-by-column check that every test column has the same dtype in the validation frame.
test.dtypes == val[test.columns].dtypes