# Dataset Preview and Conversion

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet

In [2]:
from os.path import join, realpath
from typing import List

## Load Dataset From CSV Files

In [3]:
# Read the training CSV; the bare last expression renders a five-row preview.
train_data = pd.read_csv(filepath_or_buffer='../../data/round/21/train_data.csv')
train_data.head(n=5)

Unnamed: 0,userId,itemId,Xmin,Ymin,Xmax,Ymax
0,1164,6132,62,656,276,803
1,1399,6132,184,662,235,740
2,1694,6132,63,648,272,830
3,916,6132,111,637,253,831
4,1621,6132,60,651,242,819


In [4]:
# Read the training answers CSV; the bare last expression renders a five-row preview.
train_ans = pd.read_csv(filepath_or_buffer='../../data/round/21/train_answers.csv')
train_ans.head(n=5)

Unnamed: 0,itemId,Xmin_true,Ymin_true,Xmax_true,Ymax_true
0,5,198,827,649,1340
1,10,26,605,433,847
2,13,233,698,622,920
3,30,61,368,329,530
4,32,30,536,328,813


In [5]:
# Read the test CSV; the bare last expression renders a five-row preview.
test_data = pd.read_csv(filepath_or_buffer='../../data/round/21/test_data.csv')
test_data.head(n=5)

Unnamed: 0,userId,itemId,Xmin,Ymin,Xmax,Ymax
0,1581,34804,86,640,242,743
1,1351,34804,85,655,273,766
2,161,34804,85,648,268,748
3,313,5704,32,618,647,1268
4,1097,5704,25,620,602,1242


## Convert to Parquet Representation

In [6]:
# Canonical absolute path of the output directory. The relative path is
# resolved against the kernel's current working directory, and it is the same
# directory the CSV files are read from.
outdir = realpath('../../data/round/21')

In [7]:
# Output schema: six non-nullable 32-bit integer columns, listed in the same
# order as the columns of the data frames they will be paired with.
fields = [
    pa.field(name, pa.int32(), nullable=False)
    for name in ('user_id', 'item_id', 'x_min', 'y_min', 'x_max', 'y_max')
]

In [8]:
def convert(frame: pd.DataFrame, fields: List[pa.Field], where: str) -> None:
    """Write ``frame`` to a ZSTD-compressed Parquet file at ``where``.

    Columns are paired with ``fields`` strictly by position: the i-th column
    of ``frame`` is written under the schema entry ``fields[i]``. The frame's
    own column labels play no part in the output, which is how the CSV headers
    end up replaced by the field names.

    Args:
        frame: Source data; its column count must equal ``len(fields)``.
        fields: Arrow fields describing the output columns, in column order.
        where: Destination path of the Parquet file.

    Raises:
        ValueError: If ``frame`` and ``fields`` differ in length.
    """
    columns = list(frame.columns)
    if len(columns) != len(fields):
        # Pairing is positional, so an extra or missing column (for example an
        # index column picked up from the CSV) would otherwise only surface as
        # a terse error inside pyarrow. Fail early and name both sides.
        raise ValueError(
            f'Cannot map {len(columns)} frame column(s) {columns} onto '
            f'{len(fields)} schema field(s) {[field.name for field in fields]}'
        )

    arrays: List[pa.Array] = [pa.array(frame[colname].values) for colname in columns]
    table: pa.Table = pa.Table.from_arrays(arrays, schema=pa.schema(fields))
    # NOTE(review): recent pyarrow documentation lists '1.0', '2.4' and '2.6'
    # as format versions; confirm '2.0' is still accepted by the installed
    # pyarrow before upgrading.
    pa.parquet.write_table(table=table,
                           where=where,
                           version='2.0',
                           compression='ZSTD')

In [9]:
# One Parquet file per CSV, written into `outdir`.
convert(frame=train_data, fields=fields, where=join(outdir, 'train_data.parquet'))
# The answers frame has no userId column, so the leading `user_id` field is
# dropped to keep the positional column-to-field pairing aligned.
convert(frame=train_ans, fields=fields[1:], where=join(outdir, 'train_ans.parquet'))
convert(frame=test_data, fields=fields, where=join(outdir, 'test_data.parquet'))