In [12]:
import ptls
import pandas as pd
import os
from ptls.preprocessing import PandasDataPreprocessor
import numpy as np
import torch
from sklearn.preprocessing import KBinsDiscretizer
from scipy.stats import chisquare


In [2]:
def prepare_data_gender_scenario(data_path='../data/gender'):
    """Load the gender-scenario transactions and targets and describe their columns.

    Reads ``transactions.csv`` and ``gender_train.csv`` from ``data_path``.

    Args:
        data_path: Directory holding both CSV files. Defaults to
            ``'../data/gender'``.

    Returns:
        tuple: ``(source_data, targets, df_params)`` where

        * ``source_data`` is the transactions frame with ``term_id`` (and an
          ``Unnamed: 0`` column, if present) dropped, ``customer_id`` renamed to
          ``client_id``, a new ``time`` column holding the clock part of the raw
          ``tr_datetime`` string, and ``tr_datetime`` replaced by the float
          ``day + fraction_of_day``;
        * ``targets`` has one row per client that occurs in the transactions and
          has no missing value after the left join with ``gender_train.csv``;
        * ``df_params`` maps column roles to column names; ``cat_unique`` holds
          the number of distinct values of each column in ``cat_cols``.
    """
    seconds_per_day = 24 * 60 * 60

    source_data = pd.read_csv(os.path.join(data_path, 'transactions.csv'))
    source_data = source_data.drop(columns=["term_id"]).rename(columns={'customer_id': 'client_id'})
    if 'Unnamed: 0' in source_data.columns:
        source_data = source_data.drop(columns=['Unnamed: 0'])

    # The raw tr_datetime string is "<day> <HH:MM:SS>"; keep the clock part as-is.
    source_data['time'] = source_data['tr_datetime'].str.split().str[1]

    # Left-pad to 15 characters so that (for inputs up to that length) the day
    # number fills the first 6 characters and the clock string starts at offset 7.
    padded_time = source_data['tr_datetime'].str.pad(15, 'left', '0')
    day_part = padded_time.str[:6].astype(float)

    # Derive the fraction of the day from the clock fields themselves, so the
    # result does not depend on the datetime64 resolution of the parsed values.
    clock = pd.to_datetime(padded_time.str[7:], format='%H:%M:%S')
    seconds_of_day = clock.dt.hour * 3600 + clock.dt.minute * 60 + clock.dt.second
    time_part = seconds_of_day / seconds_per_day

    source_data['tr_datetime'] = day_part + time_part

    df_params = {
        "numeric_cols": ["amount"],
        "cat_cols": ["mcc_code", "tr_type"],
        "cat_unique": [],
        "order_col": "tr_datetime",
        "time_col": "time",
        "text_cols": ['description'],
        "id_col": "client_id",
        "target": "gender"
    }

    # Number of distinct values per categorical column, in cat_cols order.
    df_params["cat_unique"] = [
        source_data[col].unique().shape[0] for col in df_params["cat_cols"]
    ]

    # Read the targets from the same directory as the transactions.
    targets = pd.read_csv(os.path.join(data_path, 'gender_train.csv')).rename(columns={'customer_id': 'client_id'})
    # Keep only clients that appear in the transactions and have a known target.
    targets = source_data[['client_id']].drop_duplicates().merge(targets, on='client_id', how='left').dropna()

    return source_data, targets, df_params

In [3]:
# Load the gender-scenario transactions frame, the per-client targets and the
# column-role dictionary returned by prepare_data_gender_scenario().
data, targets, df_params = prepare_data_gender_scenario()

In [24]:
# Preprocessor for the gender data. Column roles are taken from df_params;
# `amount` is passed both as a numerical column and as a column to discretize
# ('kmeans' strategy, 100 bins).
preprocessor = PandasDataPreprocessor(
    # identifiers / ordering
    col_id=df_params['id_col'],
    col_event_time=df_params['order_col'],
    event_time_transformation='none',
    # categorical features
    cols_category=df_params['cat_cols'],
    category_transformation='frequency',
    # numerical feature and its discretized counterpart
    cols_numerical=['amount'],
    cols_discretize={'amount': ('kmeans', 100)},
    return_records=True,
)

Creating Dask Server
Link Dask Server - http://172.19.0.1:57904/status


Perhaps you already have a cluster running?
Hosting the HTTP server on port 57904 instead


In [25]:
# Fit the preprocessor on the raw transactions frame. The return value of fit()
# is bound to `a`, which no other visible cell reads.
a = preprocessor.fit(data)

In [9]:
# Display the preprocessor's `unitary_func` attribute for inspection.
preprocessor.unitary_func

{'tr_datetime': Unitary transformation,
 'mcc_code': Unitary transformation,
 'tr_type': Unitary transformation,
 'amount': Unitary transformation}

In [8]:
# For every column transformer held by the preprocessor, print its class, the
# source column, the output column and the drop-original flag.
for ct in preprocessor._all_col_transformers:
    print(
        type(ct),
        ct.col_name_original,
        ct.col_name_target,
        ct.is_drop_original_col,
    )

<class 'ptls.preprocessing.base.transformation.col_identity_transformer.ColIdentityEncoder'> tr_datetime event_time False
<class 'ptls.preprocessing.pandas.pandas_transformation.pandas_freq_transformer.FrequencyEncoder'> mcc_code mcc_code True
<class 'ptls.preprocessing.pandas.pandas_transformation.pandas_freq_transformer.FrequencyEncoder'> tr_type tr_type True
<class 'ptls.preprocessing.pandas.pandas_transformation.discretizer.ColNumericDiscretizer'> amount amount_cat False
<class 'ptls.preprocessing.base.transformation.col_identity_transformer.ColIdentityEncoder'> amount amount False
<class 'ptls.preprocessing.base.transformation.user_group_transformer.UserGroupTransformer'> client_id client_id False


In [11]:
            # NOTE(review): this cell looks like a leftover fragment of the
            # PandasDataPreprocessor(...) argument list (a 'quantile' alternative to
            # the 'kmeans' setting). On its own it does not change `preprocessor`;
            # because of the trailing comma it would, if the cell executes with its
            # leading indent stripped, bind a 1-tuple to a global `cols_discretize`.
            # Consider deleting the cell or folding it into the constructor call.
            cols_discretize={'amount' : ('quantile', 100)},

In [26]:
# Apply the preprocessor to the same frame it was fitted on; the preprocessor was
# built with return_records=True.
data1 = preprocessor.transform(data)

In [27]:
# Show the first element of the transformed output.
data1[0]

{'client_id': 6815,
 'tr_datetime': tensor([ 10.4528,  10.6143,  11.4328,  14.5346,  17.5233,  17.5935,  18.6078,
          23.4900,  32.4673,  32.4973,  34.8502,  35.4594,  35.6383,  42.4708,
          42.7398,  46.3920,  47.5355,  47.6556,  49.6137,  50.4841,  58.3817,
          58.6357,  58.8951,  61.4243,  64.5808,  66.5862,  71.8481,  72.7893,
          73.8133,  73.8301,  74.5201,  76.3236,  81.4415,  82.2787,  82.4238,
          82.5306,  85.1185,  85.4103,  89.5913,  91.3930,  97.4447,  97.5341,
         100.8626, 102.5183, 103.5447, 104.7157, 107.4329, 109.4358, 112.6071,
         116.4056, 119.4254, 122.3550, 123.6778, 124.5905, 127.4590, 133.4434,
         133.5902, 134.4486, 134.5856, 138.5943, 141.8663, 142.8634, 147.6518,
         151.4581, 151.4588, 160.5321, 164.5632, 165.4446, 168.4409, 169.0594,
         169.4997, 172.5965, 174.5953, 178.8272, 183.6073, 184.5805, 184.6087,
         187.5728, 187.6559, 188.4778, 188.4784, 188.4861, 190.4134, 193.5000,
         193.5095

In [34]:
def test_add_replace_col():
    """Check whether discretizing a column keeps or replaces the original column.

    Builds a seeded synthetic frame (``id``, ``event_dt``, normally distributed
    ``num_value``) and runs ``PandasDataPreprocessor`` twice with the same
    ``'quantile'`` discretization of ``num_value``:

    * with ``num_value`` also listed in ``cols_numerical`` -- the first record
      must contain both ``num_value`` and ``num_value_cat``;
    * without ``cols_numerical`` -- the first record must contain
      ``num_value_cat`` but not ``num_value``.

    Raises:
        AssertionError: if the first record has an unexpected set of keys.
    """
    np.random.seed(42)
    num_rows = 1000
    df = pd.DataFrame({
        'id': np.random.randint(1, 4, size=num_rows),
        'event_dt': np.random.randint(1, 100, size=num_rows),
        'num_value': np.random.normal(loc=0, scale=100, size=num_rows)
    })

    n_bins_discr = 10
    discr_type = 'quantile'

    def first_record(**extra_kwargs):
        """Fit/transform ``df`` with the shared settings and return the first record."""
        preprocessor = PandasDataPreprocessor(
            col_id='id',
            col_event_time='event_dt',
            event_time_transformation='none',
            category_transformation='none',
            cols_discretize={'num_value': (discr_type, n_bins_discr)},
            return_records=True,
            **extra_kwargs,
        )
        return preprocessor.fit_transform(df)[0]

    # Case 1: the column is also requested as numerical -> both versions are kept.
    record = first_record(cols_numerical=['num_value'])
    assert 'num_value' in record, "Original numeric expected in preprocessed data but not found"
    assert 'num_value_cat' in record, "Discretized column 'num_value_cat' expected in preprocessed data but not found"

    # Case 2: the column is only discretized -> the original must be gone.
    record = first_record()
    assert 'num_value' not in record, "Original numeric not expected in preprocessed data but found"
    assert 'num_value_cat' in record, "Discretized column expected in preprocessed data but not found"

In [35]:
# Run the keep/replace check; it raises AssertionError on failure and returns nothing.
test_add_replace_col()

Creating Dask Server
Link Dask Server - http://172.19.0.1:59004/status


Perhaps you already have a cluster running?
Hosting the HTTP server on port 59004 instead


Creating Dask Server


Perhaps you already have a cluster running?
Hosting the HTTP server on port 59011 instead


Link Dask Server - http://172.19.0.1:59011/status




In [23]:
def test_quantile(discr_type='kmeans'):
    """Compare the preprocessor's discretization of a column with scikit-learn's.

    Builds a seeded synthetic frame, discretizes ``num_value`` into 100 bins once
    through ``PandasDataPreprocessor`` and once through ``KBinsDiscretizer``, and
    compares the two per-bin count histograms with a chi-square test. Only the
    histograms are compared, not row-by-row labels.

    Args:
        discr_type: Binning strategy passed both to ``cols_discretize`` and to
            ``KBinsDiscretizer(strategy=...)``. Defaults to ``'kmeans'``, which is
            the strategy this check exercises when called without arguments
            (despite the function's name).

    Raises:
        AssertionError: if the chi-square p-value is not greater than 0.05.
    """
    np.random.seed(42)
    num_rows = 10000
    df = pd.DataFrame({
        'id': np.random.randint(1, 4, size=num_rows),
        'event_dt': np.random.randint(1, 100, size=num_rows),
        'num_value': np.random.normal(loc=0, scale=100, size=num_rows)
    })

    n_bins_discr = 100
    preprocessor = PandasDataPreprocessor(
        col_id='id',
        col_event_time='event_dt',
        event_time_transformation='none',
        category_transformation='none',
        cols_discretize={'num_value': (discr_type, n_bins_discr)},
        return_records=True,
    )
    processed = preprocessor.fit_transform(df)
    # Concatenate the per-record label tensors into one flat array.
    preproc_cats = torch.cat([x['num_value_cat'] for x in processed]).numpy()

    kbins = KBinsDiscretizer(n_bins=n_bins_discr, encode='ordinal', strategy=discr_type)
    sklearn_cats = kbins.fit_transform(df[['num_value']]).astype(int).flatten()

    # Use one histogram length for both sides so the arrays always line up, even
    # if one labelling uses a wider label range than the other.
    n_labels = max(n_bins_discr, int(preproc_cats.max()) + 1, int(sklearn_cats.max()) + 1)
    preproc_counts = np.bincount(preproc_cats, minlength=n_labels)
    sklearn_counts = np.bincount(sklearn_cats, minlength=n_labels)

    # A bin that is empty on both sides would contribute 0/0 = NaN to the
    # statistic and turn the p-value into NaN; such bins carry no information.
    occupied = (preproc_counts + sklearn_counts) > 0

    stat, p_value = chisquare(f_obs=preproc_counts[occupied], f_exp=sklearn_counts[occupied])
    print(f"Chi-square p-value: {p_value:.5f}")

    assert p_value > 0.05, f"Distributions differ significantly (p={p_value:.5f})"

In [24]:
# Run the histogram comparison; it prints the chi-square p-value and raises
# AssertionError if the p-value is not above 0.05.
test_quantile()

Creating Dask Server
Link Dask Server - http://172.19.0.1:58808/status


Perhaps you already have a cluster running?
Hosting the HTTP server on port 58808 instead


Chi-square p-value: 1.00000


