In [1]:
# Build a range-style key/value lookup: each key is the date from which a value applies.

In [7]:
import bz2
import json
import datetime
import pandas as pd
import glob

def jz2d(f_json):
    """Read a bz2-compressed JSON file and return the decoded object.

    Parameters
    ----------
    f_json : path of the ``.bz2`` file to read.

    Returns
    -------
    The Python object decoded from the JSON document.
    """
    # Read the decompressed bytes and let the json module decode them.
    with bz2.open(f_json, mode="rb") as fh:
        raw = fh.read()
    return json.loads(raw)

def j2d(f_json):
    """Read a plain (uncompressed) JSON file and return the decoded object.

    Parameters
    ----------
    f_json : path of the JSON file to read.

    Returns
    -------
    The Python object decoded from the JSON document.
    """
    # The files hold non-ASCII text, so pin the encoding instead of relying
    # on the platform default (which is not UTF-8 on every system).
    with open(f_json, 'r', encoding="utf-8") as f:
        return json.load(f)



def wdict2sdict(wd):
    """Collapse a per-date dict into a dict holding only the change points.

    Parameters
    ----------
    wd : dict
        Maps a date (anything ``pd.to_datetime`` accepts) to a record that
        has at least the keys ``"tag"`` and ``"point"``.

    Returns
    -------
    dict
        ``{pd.Timestamp: {"tag": ..., "point": ...}}`` in chronological order.
        A date is kept only when its (tag, point) pair differs from the pair
        of the preceding date; the earliest date is always kept. An empty
        ``wd`` gives an empty dict.
    """
    if not wd:
        return {}

    df = (
        pd.DataFrame.from_dict(wd, orient="index")
        .reset_index()
        .rename(columns={"index": "dt"})
        .assign(dt=lambda d: pd.to_datetime(d["dt"]))
        # Adjacent-row comparison only makes sense in chronological order.
        .sort_values("dt", kind="stable")
        .reset_index(drop=True)
    )

    same_tag = df["tag"] == df["tag"].shift(1, fill_value="")
    same_point = df["point"] == df["point"].shift(1, fill_value=0)
    keep = ~(same_tag & same_point)
    # The first row has no predecessor; never drop it, even when its values
    # happen to equal the shift fill values ("" and 0).
    keep.iloc[0] = True

    return (
        df.loc[keep, ["dt", "tag", "point"]]
        .set_index("dt")
        .to_dict(orient="index")
    )

class rangedict:
    """Date-range lookup built from one JSON file.

    The JSON object must have a ``"dt"`` entry (date -> record); it is reduced
    to its change points with ``wdict2sdict``. Every other top-level entry is
    kept as metadata.
    """
    # d_meta : dict of the top-level JSON entries other than "dt"
    # d_dt   : dict {pd.Timestamp: record}, each key being the start of a range
    # d_1st  : the record of the first range in d_dt

    def __init__(self, jfile):
        """Load ``jfile`` and build the lookup.

        Parameters
        ----------
        jfile : path to a JSON file, either plain or bz2-compressed
            (recognised by a ``.bz2`` suffix).

        Raises
        ------
        KeyError
            If the JSON object has no ``"dt"`` entry.
        ValueError
            If the ``"dt"`` entry yields no records.
        """
        # Pick the reader from the extension so both plain and compressed
        # files can be loaded through the same constructor.
        loader = jz2d if str(jfile).endswith(".bz2") else j2d
        d = loader(jfile)
        d_dt_whole = d.pop("dt")
        self.d_meta = d
        self.d_dt = wdict2sdict(d_dt_whole)
        if not self.d_dt:
            raise ValueError(f"{jfile}: 'dt' contains no records")
        self.d_1st = next(iter(self.d_dt.values()))

    def get(self, dt):
        """Return a copy of the record in effect on ``dt``.

        Parameters
        ----------
        dt : anything ``pd.Timestamp`` accepts (e.g. ``"2013-4-1"``).

        Returns
        -------
        dict
            Copy of the record whose start date is the latest one not after
            ``dt``. For a ``dt`` earlier than every start date, a copy of the
            first record is returned.
        """
        ts = pd.Timestamp(dt)
        current = self.d_1st
        # d_dt is iterated in stored order; stop at the first start date
        # that lies after the requested date.
        for start, record in self.d_dt.items():
            if ts < start:
                break
            current = record
        # Copy once, so callers cannot mutate the stored record.
        return current.copy()

In [18]:
# Sample input for a single item. The commented-out path is an alternative,
# bz2-compressed sample file kept for reference.
# f = "../output_iy/drgcd610406013_20230606_124628_792271.json.bz2"
f = "../output_iy/iycode_641140136_20231122_150053_929635.json"

# Build the range lookup for this file.
# (To inspect the raw JSON dict instead: dx = j2d(f))
rd = rangedict(f)

In [19]:
rd.d_dt

{Timestamp('2010-01-01 00:00:00'): {'tag': 'bdm', 'point': 95.0},
 Timestamp('2012-05-02 00:00:00'): {'tag': 'ws', 'point': 92.0},
 Timestamp('2019-11-02 00:00:00'): {'tag': 'ws', 'point': 94.0},
 Timestamp('2020-10-02 00:00:00'): {'tag': 'aes', 'point': 94.0}}

In [20]:
rd.get("2013-4-1")

{'tag': 'ws', 'point': 92.0}

In [21]:
rd.get("2014-4-10")

{'tag': 'ws', 'point': 92.0}

In [23]:
rd.d_meta

{'drugcode': 641140136,
 'name': 'メチロン注２５％\u3000１ｍＬ',
 'nplcode': '1144400A2014',
 'unit': '管',
 'cat_for_mental': 0.0,
 'cat_neuro_break': 0.0,
 'cat_bio': 0.0,
 'cat_generic': 0.0,
 'dt_start': '2010-01-01',
 'dt_end': '2023-08-31'}

In [24]:
# Load the metadata table from a pickle file and preview its first rows.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
f_drg_meta = "../appendix/dfmeta_iy_20230606_124628_792271.pkl"
df_drg = pd.read_pickle(f_drg_meta)
df_drg.head()

Unnamed: 0,c0,c2,c4,c9,c10,c13,c14,c15,c16,c31,ymd,next_month_day1,next_month_day2,filename,dt_rank_neg
2802,9,610406002,アストニール錠１０　１０ｍｇ,錠,,0.0,0.0,0.0,1.0,2123014F2031,2013-03-29,2013-04-01,2013-04-02,y20130329.csv,1.0
4887,9,610406008,アゼピット錠１ｍｇ,錠,,0.0,0.0,0.0,1.0,4490004F2010,2013-09-30,2013-10-01,2013-10-02,y20130930.csv,1.0
52035,9,610406009,アセメール錠１０　１０ｍｇ,錠,1.0,0.0,0.0,0.0,1.0,2123014F2040,2014-03-05,2014-04-01,2014-04-02,y20140305.csv,1.0
50315,9,610406013,アドメッセン錠１ｍｇ,錠,,0.0,0.0,0.0,1.0,4490004F2010,2015-09-18,2015-10-01,2015-10-02,y20150918.csv,1.0
4888,9,610406015,アナシロール錠１０　１０ｍｇ,錠,,0.0,0.0,0.0,1.0,2123014F2058,2013-09-30,2013-10-01,2013-10-02,y20130930.csv,1.0


In [25]:
# Collect the bz2-compressed JSON files and preview the first three paths.
# NOTE(review): the output recorded for this cell is an empty list; confirm
# the directory/pattern before running the cells that consume `fs`.
fs = glob.glob("../output_iy/*.json.bz2")
fs[:3]

[]

In [13]:
def get_cdrd(f):
    """Build the ``[code, rangedict]`` pair for one JSON file.

    Parameters
    ----------
    f : str
        Path of the file. The code is the run of digits that follows the
        ``drgcd`` or ``iycode_`` prefix of the file name, e.g.
        ``drgcd610406013_20230606_124628_792271.json.bz2`` -> ``"610406013"``.

    Returns
    -------
    list
        ``[cd, rd]`` where ``cd`` is the code as a string and ``rd`` is the
        ``rangedict`` built from ``f``.
    """
    # Local imports keep this function self-contained.
    import os
    import re

    # Parse the code from the file name, so that neither the directory nor
    # the export timestamp has to be hard-coded.
    m = re.match(r"(?:drgcd|iycode_)(\d+)_", os.path.basename(f))
    if m:
        cd = m.group(1)
    else:
        # Unknown naming scheme: fall back to the legacy prefix/suffix stripping.
        cd = f.replace("../output_iy/drgcd", "").replace("_20230606_124628_792271.json.bz2","")
    rd = rangedict(f)
    return [cd, rd]

In [18]:
import joblib

In [19]:
%%time 
# Build one [code, rangedict] pair per file, in parallel.
# n_jobs=-1 uses every available CPU; verbose=5 prints progress lines.
l_res = joblib.Parallel(n_jobs=-1, verbose=5)(
    joblib.delayed(get_cdrd)(path) for path in fs
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 236 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 488 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 812 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 1208 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 1676 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 2216 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 2828 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 3512 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 4268 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done 5096 tasks      | elapsed:   53.1s
[Parallel(n_jobs=-1)]: Done 5996 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 6968 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 8012 tasks      | e

CPU times: user 2min 5s, sys: 6.31 s, total: 2min 12s
Wall time: 6min 10s


[Parallel(n_jobs=-1)]: Done 31276 out of 31291 | elapsed:  6.2min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 31291 out of 31291 | elapsed:  6.2min finished


In [20]:
%%time

# Index the parallel results by code: every element of l_res is a
# [code, rangedict] pair.
d_whole_iy = {code: lookup for code, lookup in l_res}

CPU times: user 29.6 ms, sys: 0 ns, total: 29.6 ms
Wall time: 28.1 ms


In [21]:
d_whole_iy["610406008"].get("2012-8-4")


{'tag': 'ws', 'point': 6.2}

In [22]:
d_whole_iy["610406008"].d_meta

{'drugcode': 610406008,
 'name': 'アゼピット錠１ｍｇ',
 'nplcode': '4490004F2010',
 'unit': '錠',
 'cat_for_mental': 0.0,
 'cat_neuro_break': 0.0,
 'cat_bio': 0.0,
 'cat_generic': 1.0,
 'dt_start': '2010-01-01',
 'dt_end': '2023-06-03'}

In [23]:
import sys

In [24]:
# NOTE: sys.getsizeof is shallow. It reports the memory of the dict object
# itself, not of the keys and rangedict values it references.
sz = sys.getsizeof(d_whole_iy)

In [26]:
print(f'{type(sz)}: {sz}')

<class 'int'>: 1310808
