# Data Preprocessing

## Imports

In [1]:
import pandas as pd
import datatable as dt
import numpy as np

In [2]:
import os
import pickle
from typing import List

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

## Dataset

### Convert CSV to Pickle File

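- Reading a pickle file is faster than re-parsing the CSV, so each CSV is converted to a pickle here and the pickle is what gets loaded below.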

In [4]:
# Convert the training features from CSV to pickle:
#   1. parse the CSV with datatable,
#   2. convert the datatable frame to a pandas DataFrame,
#   3. write that DataFrame to 'dataset/train_x.pkl'.
train_x_dt = dt.fread('dataset/train_x.csv')
train_x_df = train_x_dt.to_pandas()
pd.to_pickle(train_x_df, 'dataset/train_x.pkl')

In [5]:
# Convert the test features from CSV to pickle:
#   1. parse the CSV with datatable,
#   2. convert the datatable frame to a pandas DataFrame,
#   3. write that DataFrame to 'dataset/test_x.pkl'.
test_x_dt = dt.fread('dataset/test_x.csv')
test_x_df = test_x_dt.to_pandas()
pd.to_pickle(test_x_df, 'dataset/test_x.pkl')

### Load Dataset

In [6]:
# Load the training features from the pickle written by the conversion cell
# above, then print a summary (row count, column count, dtypes, memory use).
train_x = pd.read_pickle('dataset/train_x.pkl')
train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 685588 entries, 0 to 685587
Columns: 669 entries, ID to 667
dtypes: int32(669)
memory usage: 1.7 GB


In [7]:
# Load the test features from the pickle written by the conversion cell
# above, then print a summary (row count, column count, dtypes, memory use).
test_x = pd.read_pickle('dataset/test_x.pkl')
test_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171397 entries, 0 to 171396
Columns: 669 entries, ID to 667
dtypes: int32(669)
memory usage: 437.4 MB


In [8]:
# Load the training targets straight from CSV (no pickle step for this file)
# and print a summary; the recorded output shows two columns, `ID` and `y`.
train_y = pd.read_csv('dataset/train_y.csv')
train_y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404 entries, 0 to 403
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      404 non-null    int64  
 1   y       404 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 6.4 KB


In [9]:
# Labels of the feature columns to drop: '576', '577' and '578'.
# The numbers are converted to strings, presumably because the column labels
# in train_x/test_x are strings.
# NOTE(review): the notebook does not state why these columns are excluded —
# please document the reason here.
exclude_x = list(map(str, range(576, 579)))

In [10]:
# Remove the excluded feature columns from both feature sets.
# NOTE: train_x/test_x are rebound to the reduced frames, so this cell is not
# re-runnable as-is: a second run raises KeyError because the columns are
# already gone. Re-run the load cells first.
train_x = train_x.drop(columns=exclude_x)
test_x = test_x.drop(columns=exclude_x)

## Summarize Trace Data

### Summary by Statistic

In [13]:
%%time
# Compute per-ID summary statistics of every column in the train and test
# feature sets, and pickle one {statistic name -> DataFrame} dict per set to
# 'dataset/summary/<set>_x_summary.pkl'.
dataset = [train_x, test_x]
set_name = ['train', 'test']

# Statistic name -> how to compute it.
#   * str: name of a pandas GroupBy aggregation.
#   * int: percentile rank q, computed with np.percentile.
# The string names select the same GroupBy.min/max/mean/std methods that
# pandas used to substitute for the builtin `min`/`max` and `np.mean`/`np.std`
# callables. Passing those callables is deprecated since pandas 2.1
# (FutureWarning); without the substitution `np.std` would switch from the
# sample standard deviation (ddof=1) to the population one (ddof=0).
summary_cand = {
    'min': 'min',
    'max': 'max',
    'mean': 'mean',
    'std': 'std',
    'p90': 90,
    'p95': 95,
    'p99': 99,
}

# Make sure the output directory exists so the cell also runs on a fresh
# checkout instead of failing with FileNotFoundError when opening the file.
summary_dir = 'dataset/summary'
os.makedirs(summary_dir, exist_ok=True)

for set_, target in zip(set_name, dataset):
    id_grp = target.groupby('ID')
    res_container = {}

    for func_name, action in summary_cand.items():
        # Dispatch on the kind of action rather than on the spelling of the
        # statistic's name, so a future name starting with 'p' is not
        # mistaken for a percentile.
        if isinstance(action, str):
            res = id_grp.agg(action)
        else:
            # The extra arguments are forwarded to np.percentile:
            # q=action, reduction along axis 0, and the
            # 'closest_observation' method instead of NumPy's default
            # linear interpolation.
            res = id_grp.agg(
                np.percentile,
                action,
                axis=0,
                method='closest_observation',
            )

        res_container[func_name] = res

    with open(f'{summary_dir}/{set_}_x_summary.pkl', 'wb') as f:
        pickle.dump(res_container, f, protocol=pickle.HIGHEST_PROTOCOL)

CPU times: total: 24.8 s
Wall time: 25 s
