# Shelveの格納チェック
るshelveへのデータ格納状況を確認し、parquet形式へと変換するサンプルスクリプトです。

[1] 既にshelveファイルに格納されているデータをparquet形式に変換
[2] 今後利用するshelvesをparquet形式に保存する

In [2]:
import os
import shelve
import copy
import pandas as pd

print(os.getcwd())

/home/ec2-user/SageMaker/AmazonSageMaker-trolley-monitor-onSOW/utils


In [3]:
# ファイル名の指定
output_path = '../output/Chuo_01_Tokyo-St_up_20230201_knight/'
camera_num = 'HD11'
shelve_name = "/rail.shelve"

In [4]:
%%timeit
# 格納されている名称と画像のリストを確認
with shelve.open(output_path + camera_num + shelve_name, flag='r') as rail:
    print(rail['name'])
    image_list = []
    for img_path in rail[camera_num].keys():
        num_trolleyid = len(rail[camera_num][img_path].keys())
        image_list.append([img_path.split('/')[-1], num_trolleyid])
pd.DataFrame(image_list).head(5)

Chuo_01_Tokyo-St_up_20230201_knight
Chuo_01_Tokyo-St_up_20230201_knight
Chuo_01_Tokyo-St_up_20230201_knight
Chuo_01_Tokyo-St_up_20230201_knight
Chuo_01_Tokyo-St_up_20230201_knight
Chuo_01_Tokyo-St_up_20230201_knight
Chuo_01_Tokyo-St_up_20230201_knight
Chuo_01_Tokyo-St_up_20230201_knight
13.8 s ± 12.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# 格納パラメータを確認
imagename = 'imgs/Chuo_01_Tokyo-St_up_20230201_knight/HD11/2022_0615_HD11_01_00022313.jpg'

with shelve.open(output_path + camera_num + shelve_name, flag='r') as rail:
    print(f'パラメータ数は{len(rail[camera_num][imagename].keys())}')
    testdict = rail[camera_num]
    for key in rail[camera_num][imagename]['trolley1'].keys():
        print(key)

パラメータ数は3
estimated_upper_edge
estimated_lower_edge
brightness_center
brightness_mean
brightness_std
estimated_upper_edge_variance
estimated_lower_edge_variance
estimated_slope_variance
measured_upper_edge
measured_lower_edge
trolley_end_reason
mask_edgelog_1
mask_edgelog_2


In [8]:
listdf = ['estimated_upper_edge', 'estimated_lower_edge', 'brightness_center', 'brightness_mean', 'brightness_std', 
          'estimated_upper_edge_variance', 'estimated_lower_edge_variance', 'estimated_slope_variance', 'measured_upper_edge',
          'measured_lower_edge', 'trolley_end_reason', 'mask_edgelog_1', 'mask_edgelog_2', 'trolley_id', 'image_path']

df_concat = pd.DataFrame(columns=listdf)
for image_path in testdict.keys():
    for trolleyid in testdict[image_path].keys():
        df = pd.DataFrame.from_dict(testdict[image_path][trolleyid], orient='index').T
        df['image_path'] = image_path
        df['trolley_id'] = trolleyid
        df['x_coordinate'] = [i for i in range(len(df))]
        df_concat = pd.concat([df_concat, df], ignore_index=True)
df_concat.shape

(35817, 16)

In [9]:
df_concat.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35817 entries, 0 to 35816
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   estimated_upper_edge           35817 non-null  object 
 1   estimated_lower_edge           35817 non-null  object 
 2   brightness_center              35817 non-null  object 
 3   brightness_mean                35817 non-null  object 
 4   brightness_std                 35817 non-null  object 
 5   estimated_upper_edge_variance  35817 non-null  object 
 6   estimated_lower_edge_variance  35817 non-null  object 
 7   estimated_slope_variance       35817 non-null  object 
 8   measured_upper_edge            35817 non-null  object 
 9   measured_lower_edge            35817 non-null  object 
 10  trolley_end_reason             1 non-null      object 
 11  mask_edgelog_1                 35817 non-null  object 
 12  mask_edgelog_2                 35817 non-null 

In [10]:
df_concat.head(5)

Unnamed: 0,estimated_upper_edge,estimated_lower_edge,brightness_center,brightness_mean,brightness_std,estimated_upper_edge_variance,estimated_lower_edge_variance,estimated_slope_variance,measured_upper_edge,measured_lower_edge,trolley_end_reason,mask_edgelog_1,mask_edgelog_2,trolley_id,image_path,x_coordinate
0,971.0,998.0,254.0,133.0,98.4375,0.750488,0.750488,0.001049,978.0,989.0,,0.0,0.0,trolley1,imgs/Chuo_01_Tokyo-St_up_20230201_knight/HD11/...,0.0
1,973.0,996.0,254.0,149.125,97.125,0.602051,0.602051,0.001098,978.0,989.0,,0.0,0.0,trolley1,imgs/Chuo_01_Tokyo-St_up_20230201_knight/HD11/...,1.0
2,973.0,995.0,255.0,154.75,95.0,0.504395,0.503906,0.001144,978.0,990.0,,0.0,0.0,trolley1,imgs/Chuo_01_Tokyo-St_up_20230201_knight/HD11/...,2.0
3,974.0,994.0,255.0,166.25,91.875,0.435547,0.435547,0.001188,978.0,990.0,,0.0,0.0,trolley1,imgs/Chuo_01_Tokyo-St_up_20230201_knight/HD11/...,3.0
4,974.0,994.0,255.0,166.125,91.625,0.385498,0.385254,0.001229,978.0,990.0,,0.0,0.0,trolley1,imgs/Chuo_01_Tokyo-St_up_20230201_knight/HD11/...,4.0


In [23]:
df_concat['x_coordinate'] = df_concat['x_coordinate'].astype(int)

In [None]:
df_concat.groupby('trolley_id').count()

In [21]:
for i in df_concat[['image_path', 'trolley_id']].values:
    set(i)

In [None]:
!pip install pyarrow

In [12]:
# Parquetに変換する。
import pyarrow as pa

table = pa.Table.from_pandas(df_concat)

ModuleNotFoundError: No module named 'pyarrow'

In [None]:
# 書き込み
from pyarrow import parquet as pq
pq.write_table(table, 'shelvechenge.parquet', compression=None)
df.to_csv('test.csv.gz', compression='gzip')

In [None]:
%%timeit
# 読み込み
df_parquet = pd.read_parquet('shelvechenge.gzip.parquet')

In [17]:
%%timeit
# 読み込み
df_parquet_nocompression = pd.read_parquet('shelvechenge.parquet')

5.61 ms ± 89.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
%%timeit
df_csv = pd.read_csv('shelvechange.csv')

2.56 ms ± 31.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [19]:
%%timeit
df_csv_gz = pd.read_csv('test.csv.gz')

2.94 ms ± 4.52 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
