# Preprocessing demo

This is a demo about data preprocessing.
We take raw transactions and convert them to the `ptls` format (a list of feature dicts).

Each paragraph shows a small example.

# Common tools for this demo

In [1]:
import os

# Download and unpack the dataset only when the transactions file is missing,
# so that re-running the notebook does not fetch the archive again.
if not os.path.exists('data/transactions_train.csv'):
    ! mkdir -p data
    # Fetch the archive into the current directory (-O keeps the remote file name, -L follows redirects).
    ! curl -OL https://storage.yandexcloud.net/ptls-datasets/age-prediction-nti-sbebank-2019.zip
    # Extract only the CSV files into `data`, dropping archive sub-paths (-j) and overwriting existing files (-o).
    ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data
    # Keep the downloaded archive next to the extracted files.
    ! mv age-prediction-nti-sbebank-2019.zip data/

In [2]:
import pandas as pd

In [3]:
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import FeatureDict

In [4]:
from IPython.display import display

In [5]:
def load_trx(keep_original_trans_date=False, nrows=1_000_000):
    """Load a sample of the raw transactions for the demo.

    Reads the first `nrows` records of 'data/transactions_train.csv',
    displays the head of the resulting frame and returns the frame.

    Only for demo purposes we:
        - convert `trans_date` (a day number) to a date string counted from
          2000-01-01. Usually date-time fields already have a datetime format.
        - load only the first 1M records by default, so the demo runs faster.

    Args:
        keep_original_trans_date: if True, `trans_date` is left exactly as it
            is stored in the file.
        nrows: number of records to read; `None` reads the whole file.

    Returns:
        pd.DataFrame with the loaded transactions; `amount_rur` is float32.
    """
    df = pd.read_csv('data/transactions_train.csv', nrows=nrows)
    if not keep_original_trans_date:
        # `trans_date` is a day offset; anchor it to a fixed start date
        # and keep the result as a string, as the demo expects.
        start_date = pd.to_datetime('2000-01-01')
        df['trans_date'] = (start_date + pd.to_timedelta(df['trans_date'], 'D')).astype(str)
    df['amount_rur'] = df['amount_rur'].astype('float32')
    display(df.head())
    return df

In [6]:
def load_target():
    """Read the target table from 'data/train_target.csv', display its head and return it."""
    target = pd.read_csv('data/train_target.csv')
    preview = target.head()
    display(preview)
    return target

In [7]:
def print_feature_dict(d):
    """Print a feature dict, keeping only the first 10 items of every sequence feature.

    Values that `FeatureDict.is_seq_feature` does not recognise as sequences
    are printed unchanged.
    """
    shortened = {}
    for name, value in d.items():
        if FeatureDict.is_seq_feature(name, value):
            value = value[:10]
        shortened[name] = value
    print(shortened)

# Base Example

In [8]:
df_trx = load_trx()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,2000-01-07,4,71.462997
1,33172,2000-01-07,35,45.016998
2,33172,2000-01-09,11,13.887
3,33172,2000-01-10,11,15.983
4,33172,2000-01-11,11,21.341


In [9]:
# Tell the preprocessor which role each column of the transaction table plays.
preprocessor = PandasDataPreprocessor(
    col_id='client_id',  # one output record per client
    col_event_time='trans_date',  # becomes the `event_time` feature
    cols_category=['small_group'],  # encoded to embedding indexes
    cols_numerical=['amount_rur'],  # kept as is
)

In [10]:
df_data = preprocessor.fit_transform(df_trx)

In [11]:
# unique clients count
len(df_data)

1145

In [12]:
print_feature_dict(df_data[0])

{'client_id': 6, 'small_group': tensor([ 4,  3,  1,  3,  4,  1,  4,  3, 22,  2]), 'amount_rur': tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
        12.9380, 28.1620]), 'event_time': tensor([946684800, 947116800, 947548800, 947635200, 947980800, 947980800,
        948067200, 948067200, 948153600, 948240000])}


In [13]:
preprocessor.get_category_dictionary_sizes()

{'small_group': 186}

pay attention:
- `client_id` presented as scalar value
- `small_group` is converted to embedding indexes with frequency encoding. The dictionary size is 186, including the 0 padding token.
- `amount_rur` is not changed
- `event_time` converted to timestamp

# Join target

## before preprocessing

In [14]:
df_trx = load_trx()
df_target = load_target()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,2000-01-07,4,71.462997
1,33172,2000-01-07,35,45.016998
2,33172,2000-01-09,11,13.887
3,33172,2000-01-10,11,15.983
4,33172,2000-01-11,11,21.341


Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3


In [15]:
# Attach the target to every transaction of the matching client.
df_trx = df_trx.merge(df_target, on='client_id', how='inner')
df_trx.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur,bins
0,33172,2000-01-07,4,71.462997,0
1,33172,2000-01-07,35,45.016998,0
2,33172,2000-01-09,11,13.887,0
3,33172,2000-01-10,11,15.983,0
4,33172,2000-01-11,11,21.341,0


> **Note:**
> The target is duplicated: every row of a given `client_id` has the same `bins` value.

In [16]:
# Same column roles as in the base example, plus the target column joined above.
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='trans_date',
    cols_category=['small_group'],
    cols_numerical=['amount_rur'],
    cols_first_item=['bins'],  # The target is the same for all transactions of a client, so take it from the first record
)

In [17]:
df_data = preprocessor.fit_transform(df_trx)

In [18]:
# unique clients count
len(df_data)

1145

In [19]:
print_feature_dict(df_data[0])

{'client_id': 6, 'small_group': tensor([ 4,  3,  1,  3,  4,  1,  4,  3, 22,  2]), 'amount_rur': tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
        12.9380, 28.1620]), 'bins': 1, 'event_time': tensor([946684800, 947116800, 947548800, 947635200, 947980800, 947980800,
        948067200, 948067200, 948153600, 948240000])}


pay attention:
- `client_id` presented as scalar value
- `small_group` converted to embedding indexes with frequency encoding
- `amount_rur` is not changed
- `event_time` converted to timestamp
- target value `bins` is in dict

## after preprocessing 

In [20]:
df_trx = load_trx()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,2000-01-07,4,71.462997
1,33172,2000-01-07,35,45.016998
2,33172,2000-01-09,11,13.887
3,33172,2000-01-10,11,15.983
4,33172,2000-01-11,11,21.341


In [21]:
# Same column roles as in the base example; only the output container differs.
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='trans_date',
    cols_category=['small_group'],
    cols_numerical=['amount_rur'],
    return_records=False,  # return a DataFrame instead of a list of feature dicts
)

In [22]:
df_data = preprocessor.fit_transform(df_trx)

In [23]:
# unique clients count
len(df_data)

1145

In [24]:
df_data.head()

Unnamed: 0,client_id,small_group,amount_rur,event_time
0,6,"[tensor(4), tensor(3), tensor(1), tensor(3), t...","[tensor(4.0540), tensor(13.7380), tensor(20.70...","[tensor(946684800), tensor(947116800), tensor(..."
1,37,"[tensor(1), tensor(10), tensor(1), tensor(3), ...","[tensor(18.5440), tensor(22.8980), tensor(3.66...","[tensor(946684800), tensor(946857600), tensor(..."
2,91,"[tensor(19), tensor(40), tensor(1), tensor(5),...","[tensor(163.9470), tensor(2.1560), tensor(38.2...","[tensor(946857600), tensor(946944000), tensor(..."
3,172,"[tensor(3), tensor(3), tensor(3), tensor(3), t...","[tensor(13.7380), tensor(21.5640), tensor(34.5...","[tensor(948067200), tensor(948326400), tensor(..."
4,250,"[tensor(10), tensor(8), tensor(8), tensor(3), ...","[tensor(7.5110), tensor(116.4400), tensor(2.29...","[tensor(946684800), tensor(946684800), tensor(..."


> **Note:**
> Here `df_data` is not a `list`; it is a `DataFrame`.

In [25]:
df_target = load_target()

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3


In [26]:
# Join the target to the per-client frame produced by the preprocessor.
df_data = df_data.merge(df_target, on='client_id', how='inner')

In [27]:
# Convert the merged DataFrame to a list of per-client dicts.
# NOTE: `df_data` changes type here (DataFrame -> list), so this cell cannot be re-run on its own.
df_data = df_data.to_dict(orient='records')

In [28]:
# unique clients count
len(df_data)

1145

In [29]:
print_feature_dict(df_data[0])

{'client_id': 6, 'small_group': tensor([ 4,  3,  1,  3,  4,  1,  4,  3, 22,  2]), 'amount_rur': tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
        12.9380, 28.1620]), 'event_time': tensor([946684800, 947116800, 947548800, 947635200, 947980800, 947980800,
        948067200, 948067200, 948153600, 948240000]), 'bins': 1}


pay attention:
- `client_id` presented as scalar value
- `small_group` converted to embedding indexes with frequency encoding
- `amount_rur` is not changed
- `event_time` converted to timestamp
- target value `bins` is in dict

# Without data preprocessing

The original `trans_date` in this dataset is not a datetime, but we can still use it to order the transactions.

In [30]:
df_trx = load_trx(keep_original_trans_date=True)

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.462997
1,33172,6,35,45.016998
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


In [31]:
# `trans_date` is loaded in its original (non-datetime) form, so it is used without conversion.
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='trans_date',
    event_time_transformation='none',  # keep the original `trans_date` values as `event_time`
    cols_category=['small_group'],
    cols_numerical=['amount_rur'],
)

In [32]:
df_data = preprocessor.fit_transform(df_trx)

In [33]:
# unique clients count
len(df_data)

1145

In [34]:
print_feature_dict(df_data[0])

{'client_id': 6, 'trans_date': tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]), 'small_group': tensor([ 4,  3,  1,  3,  4,  1,  4,  3, 22,  2]), 'amount_rur': tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
        12.9380, 28.1620]), 'event_time': tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18])}


pay attention:
- `client_id` presented as scalar value
- `small_group` converted to embedding indexes with frequency encoding
- `amount_rur` is not changed
- `event_time` keeps the original values. Transactions are ordered by it

# Category identity encoding

`small_group` in our dataset is an already encoded MCC code, so it seems we don't need frequency encoding.

In [35]:
df_trx = load_trx(keep_original_trans_date=True)

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.462997
1,33172,6,35,45.016998
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


In [36]:
# Keep both the event time and the category values as they are in the raw data.
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='trans_date',
    event_time_transformation='none',  # keep the original `trans_date` values as `event_time`
    cols_category=['small_group'],
    category_transformation='none',  # use the original category values as embedding indexes
    cols_numerical=['amount_rur'],
)

In [37]:
# NOTE(review): the raw `small_group` values presumably include 0, which collides with the
# padding index; the output of this cell is not preserved here — confirm what the call reports.
df_data = preprocessor.fit_transform(df_trx)



> Oops. 0 is the padding token. Let's shift the values by 1 so that they start from 1.

In [38]:
# Shift the category values by one so that 0 stays reserved for padding.
# NOTE: this cell is not idempotent: re-running it shifts the values again,
# so reload `df_trx` before running it a second time.
df_trx['small_group'] = df_trx['small_group'] + 1

In [39]:
df_data = preprocessor.fit_transform(df_trx)

In [40]:
# unique clients count
len(df_data)

1145

In [41]:
print_feature_dict(df_data[0])

{'client_id': 6, 'trans_date': tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]), 'small_group': tensor([16,  4,  2,  4, 16,  2, 16,  4, 38, 12]), 'amount_rur': tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
        12.9380, 28.1620]), 'event_time': tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18])}


In [42]:
preprocessor.get_category_dictionary_sizes()

{'small_group': 189}

pay attention:
- `client_id` presented as scalar value
- `small_group` holds embedding indexes, but the original values are used. The dictionary size is 189, including the 0 padding token, versus 186 with frequency encoding. This means that some indexes are not present in the dataset and are never used.
- `amount_rur` is not changed
- `event_time` keeps the original values. Transactions are ordered by it