In [None]:
# Install the Kaggle CLI and copy the API token from the shared EFS volume into ~/.kaggle.
!pip install -q -U kaggle
!mkdir -p ~/.kaggle
!cp /home/ec2-user/efs/Various/kaggle.json ~/.kaggle/

In [None]:
# Copy the AWS credentials and config files from the shared EFS volume into ~/.aws.
!mkdir -p ~/.aws
!cp /home/ec2-user/efs/Various/credentials ~/.aws/credentials 
!cp /home/ec2-user/efs/Various/config ~/.aws/config

In [None]:
packages = ('pytorch_lightning timm==0.6.12 ipywidgets==7.7.1 opencv-python zstandard awscli ' 
                    +  'transformers librosa torchlibrosa torchaudio torchvision ' 
                    + 'lion-pytorch segmentation-models-pytorch==0.3.2 ' 
                    'lightgbm ') # fcwt fsspec[s3] albumentations==1.3.0 
!pip install -q -U $packages

In [None]:
# Local root directory for downloaded data and caches.
DATA_PATH = '/data/'
# Kaggle competition slug (referenced by the commented-out download cell).
TAG = 'tlvmc-parkinsons-freezing-gait-prediction'
BUCKET = 'projects-v'
# IPython shell capture: `region` is the list of output lines of the pipeline.
region = !cat ~/.aws/config | grep region | awk '{print $3}' 
# Bucket the raw data is read from, chosen by region; any other region yields ''.
DATA_BUCKET = 'projects-e1' if region[0] == 'us-east-1' else BUCKET if region[0] == 'us-east-2' else '' 
# Key prefix used for every S3 object this notebook reads or lists.
PREFIX = 'walk/'
OFFLINE = False

In [None]:
import zipfile
import boto3
s3 = boto3.client('s3')

import os
import io
from joblib import Parallel, delayed

import json
import pickle
import zstandard as zstd
zd = zstd.ZstdDecompressor()
zc = zstd.ZstdCompressor()

import math
import random
import datetime

import cv2



In [None]:
# !kaggle competitions download -c $TAG -p $DATA_PATH
# !unzip -l $DATA_PATH$TAG".zip"

# # show compressed size of each file in zip and ratio
# with zipfile.ZipFile(DATA_PATH + TAG + '.zip', 'r') as f:
#     for info in f.infolist():
#         print(info.filename, info.compress_size, info.file_size, info.compress_size / info.file_size)
#     files = f.namelist()

# def uploadFileFromZip(f,):
#     s3 = boto3.client('s3')
#     zc = zstd.ZstdCompressor()
#     with zipfile.ZipFile(DATA_PATH + TAG + '.zip', 'r') as z:
#         return s3.put_object(Bucket = BUCKET, 
#                     Key = PREFIX + 'data/' + f + '.zstd', 
#                     Body = zc.compress(z.read(f)))

# r = Parallel(os.cpu_count())(delayed(uploadFileFromZip)(f) for f in files)

In [None]:
# List every object under <PREFIX>data/ in the data bucket (paginated listing).
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket = DATA_BUCKET, Prefix = PREFIX + 'data/')
objs = []
for page in pages:
    # A page has no 'Contents' entry when nothing matches the prefix.
    objs.extend(page.get('Contents', []))

In [None]:
# sum([e['Size'] for e in objs if 'un' in e['Key']])/1e9

In [None]:
# # s3 list with paginator
# paginator = s3.get_paginator('list_objects_v2')
# pages = paginator.paginate(Bucket = DATA_BUCKET, Prefix = PREFIX + 'cache/')
# objs = []
# for page in pages:
#     for obj in page['Contents']:
#         objs.append(obj)

In [None]:
# sorted([e['Key'] for e in objs])[::100]

In [None]:
def getFile(file, save = True):
    """Return the decompressed contents of a data file as an in-memory stream.

    The zstd-compressed object is looked up at DATA_PATH + 'data/' + file + '.zstd';
    when that local copy is missing or empty it is fetched from
    s3://DATA_BUCKET/PREFIX + 'data/' + file + '.zstd'.

    Args:
        file: file name relative to the data/ prefix, without the '.zstd' suffix.
        save: when True (default) a freshly downloaded object is written to the
            local path before decompression; when False it is decompressed
            straight from memory and nothing is written to disk.

    Returns:
        io.BytesIO positioned at offset 0 holding the decompressed bytes.
    """
    s3 = boto3.client('s3')
    zd = zstd.ZstdDecompressor()
    file_path = DATA_PATH + 'data/' + file + '.zstd'
    out = io.BytesIO()

    have_local = os.path.exists(file_path) and os.path.getsize(file_path) > 0
    if not have_local:
        compressed = s3.get_object(Bucket = DATA_BUCKET,
                                   Key = PREFIX + 'data/' + file + '.zstd')['Body'].read()
        if save:
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, 'wb') as f: f.write(compressed)
        else:
            # No local copy requested: decompress the downloaded bytes directly
            # (previously the raw response body object was written to `out`).
            zd.copy_stream(io.BytesIO(compressed), out)
            out.seek(0)
            return out

    with open(file_path, 'rb') as fin:
        zd.copy_stream(fin, out)
    out.seek(0)
    return out

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Default figure size for every matplotlib plot in this notebook.
plt.rcParams['figure.figsize'] = (7, 4)

from IPython.display import display
# Seeds NumPy's global RNG from the current microsecond, so draws differ between runs.
np.random.seed(datetime.datetime.now().microsecond)

In [None]:
# Recover dataset-relative file names from the S3 keys: drop everything up to and
# including '<PREFIX>data/' and the trailing '.zstd'. Uses PREFIX rather than a
# hard-coded copy of it so the two cannot drift apart.
files = sorted([o['Key'].split(PREFIX + 'data/')[-1].split('.zstd')[0] for o in objs])
display(files[:10])

# Load the CSV tables of the data set into DataFrames.
events = pd.read_csv(getFile('events.csv'))
subjects = pd.read_csv(getFile('subjects.csv'))
tasks = pd.read_csv(getFile('tasks.csv'))
daily_metadata = pd.read_csv(getFile('daily_metadata.csv'))
defog_metadata = pd.read_csv(getFile('defog_metadata.csv'))
tdcsfog_metadata = pd.read_csv(getFile('tdcsfog_metadata.csv'))
sample = pd.read_csv(getFile('sample_submission.csv'))

In [None]:
# SQS queue that carries JSON-encoded parameter dicts (see the receive_message cell).
QUEUE_NAME = 'walk'
sqs = boto3.client('sqs')
sqs.create_queue(QueueName = QUEUE_NAME)
queue_url = sqs.get_queue_url(QueueName = QUEUE_NAME)['QueueUrl']

In [None]:
# 25k w/0.03wd, 50k w/0.1 wd
# Base parameter grid: a list of groups, each group a list of partial parameter dicts.
all_params = [
    # [{}] * 1,
    # [{'melspec': True, 'n_mels': 16, 'mel_pwr': 0.3}],
    # [{'melspec': True, 'n_mels': 16, 'mel_pwr': None}], 

    # [{'seg': True, 'xformer_layers': 0, 'encoder': 'tu-mobilevitv2_100', 
    #     'melspec': True, 'n_mels': 16, }],
    
    [{'xformer_layers': l, 'xformer_init_scale': 0.7, 'rel_pos': e}
            for e in ['mlp', None, ]#'bias']
            for l in [ 3, 4, 5,]],
    [{'xformer_layers': l, 'deberta': True,
        'xformer_init_1': 1., 'xformer_init_2': 1,
       'xformer_init_scale': 0.7,}
            for l in [3, 4, 5]],
]
# Concatenate a list of lists into one flat list (one level deep).
def flatten(l): return [item for sublist in l for item in sublist]
all_params = flatten(all_params)
len(all_params), all_params

In [None]:
# Duplicate every base configuration, then fill each copy with independently drawn
# random hyper-parameters. Note: re-running this cell doubles all_params again.
all_params = flatten([all_params] * 2)
# Copy each dict so the duplicates do not share one object.
all_params = [p.copy() for p in all_params]
# all_params = random.sample(all_params, 8)
for i, p in enumerate(all_params):
    # p = p.copy()
    # Repeated entries in a choice list weight that value more heavily.
    p.update( {#'steps': random.randint(25000, 35000),
               'step_mult': random.choice([2, 3,]),
               'batch_size': random.choice([12, 16, 20, 24,]),

                'seq': random.choice([192, 224, 256, 288, 320, 384, ]),
                'patch': random.choice([8, 9, 10, 11, 12, 13 ]),
               
                'alibi': random.choice([0, 2, 4, 8]),
                'lion': random.choice([True, False, False]),
                'lr': random.choice([0.3e-4, 0.5e-4, 0.7e-4, 1e-4, 2e-4, 3e-4, 5e-4,]),
                'weight_decay': round(0.05 * np.exp(np.random.normal(0, 1)), 3),
                'frac_pwr_mult': round(np.exp(np.random.normal(0.7, 0.1)), 2),
                'frac_rand': round(random.random(), 2), 
                'stretch_rate': random.choice([0.3, 0.5, 0.7,]),
                'dims': random.choice([256, ]),
                'act_layer': random.choice(['GELU', 'GELU', 'GELU', 'PReLU', 'CELU']),
                'dropout': random.choice([0.1, 0.15, 0.2, 0.25, 0.3]),
                'focal_alpha': random.choice([0.1, 0.25, 0.25]),
                'focal_gamma': random.choice([1.5, 1.5, 2., 2.5 ]),
                'patch_act': random.choice(['Identity', 'Identity', 'Identity', 'Identity',                                             
                                            'PReLU', 'PReLU', 'PReLU',
                                            'GELU', 'GELU', 'GELU', 'CELU', 
                                            'Tanh', 'Tanh', 
                                            'LeakyReLU', 'LeakyReLU', 'LeakyReLU',
                                              ]),
                'rnn': random.choice([None, ] + ['GRU'] * 5 +['LSTM'] + ['GRU']),
                'se_dims': random.choice([0, 8, 16]),
                'frac_se': False, #random.choice([True, False]),
                'len_se': False, # random.choice([False,]),
                'm_se': True, # random.choice([True, ]),
                'se_dropout': random.choice([0.2, 0.25, 0.3, ]),
                'se_pact': random.choice([0., ]),
                # encodes only defog vs tdcsfog


                'fast_mult': random.choice([1, 1, 1, 1, 0.5, 0.3, ]),
                'final_mult': random.choice([2, 4, 4, 6 ]),
                'pre_norm': random.choice([True, False]),

                # Keys starting with '0x' are picked up by the downsample cell:
                # the rest of the key is matched against subject ids, the first
                # digit sets the kept share of recordings and the second the id_frac.
                '0x2d57c2': random.choice(['22', '21', '12', ]),
                '0xe86b6e': random.choice(['12', '12', '11']),
                'fix_final': random.choice([True, True, False, ]),
                'mae_divisor': random.choice([1, 2, 5, 10, ]),

                'aux_wt': random.choice([0.]),
                'v_wt': random.choice([0.03, ]),                
                'min_wt': random.choice([3e-3, 0.01,]),
                
                'frac_adj': random.choice([True, False]),
                'm_adj': random.choice([True, True, True, False]),
                
                'adj_gn': random.choice([0.3, 0.5,]),
                'm_adj_gn': random.choice([0.1, 0.2]),

                'len_adj': random.choice([True, False, ]),                

                # Selects the stratification scheme used when building the folds.
                'folds': random.choice(['A', 'B',]),# 'C', 'D'])
                'patch_dropout': random.choice([0, 0, 0.,  
                                                0.05, 0.1, 0.15, 0.2, 0.25]),
                # 'frac_gn': random.choice([0., 0.03, 0.1]), 

                'expanded': random.choice([False, False, False]),
            })
    

In [None]:
# Deterministic overrides applied on top of the random draw.
for p in all_params:
    # Pre-norm configurations always use the same initialisation settings.
    if p['pre_norm']:
        p['xformer_init_1'] = 1.
        p['xformer_init_2'] = 1.
        p['xformer_init_scale'] = 0.7
        
    # if random.random() < 1/10 and p['rnn'] is None:
    #     p['xformer_layers'] = 0
    
    # random.random() is always < 1, so this branch currently applies to every config.
    if random.random() < 1:
        p['dims'] = 384
        p['nheads'] = 12
        p['final_mult'] = 4

    # Likewise always taken: attention dropout and drop-path are set to 0.
    if random.random() < 1:
        p['xformer_attn_drop_rate'] = 0.
        p['xformer_drop_path_rate'] = 0.

In [None]:
# When RELABEL is on, several of the randomly drawn values are overridden with
# fixed settings and a 'steps' / 'neg_mult' value is drawn for each config.
RELABEL = True
if RELABEL:
    for p in all_params:
        p['relabel'] = True
        p['batch_size'] = 12
        p['steps'] = random.choice([40000, 30000 ])
        p['focal_alpha'] = 0.25
        p['focal_gamma'] = 1.
        p['neg_mult'] = random.choice([0.01, 0.03, 0.1, ])
        # The shortest sequence length (192) is doubled to 384.
        if p['seq'] == 192:
            p['seq'] *= 2

In [None]:
# import random
# random.seed(datetime.datetime.now().microsecond)
# random.shuffle(all_params)
# # sqs.purge_queue(QueueUrl = queue_url)
# N_FOLDS = 4
# MAX = 2
# base_seed = random.randint(0, 100000)
# for i, p in enumerate(all_params[::-1][:MAX]):
#     for s in range(1):
#         for fold in range(N_FOLDS):
#             p['n_folds'] = N_FOLDS
#             p['fold'] = fold
#             p['seed'] = base_seed + i 
#             sqs.send_message(QueueUrl = queue_url, MessageBody = json.dumps(p))


In [None]:
# # s3 delete all objects beggiing with a certain prefix
# s3r = boto3.resource('s3')
# bucket = s3r.Bucket(BUCKET)
# bucket.objects.filter(Prefix = PREFIX + 'spreds/').delete()

In [None]:
# s3.list_objects(Bucket = BUCKET, Prefix = PREFIX + 'spreds/')


In [None]:

# sqs.delete_message(QueueUrl = queue_url, ReceiptHandle = rh)
# sqs.send_message(QueueUrl = queue_url, MessageBody = json.dumps(eparams))


In [None]:
# eparams = random.choice(all_params)
# eparams['patch']
# eparams['seed'] = random.randint(0, 100000)
# eparams['n_folds'] = 4
# eparams['fold'] = 0#random.randint(0, 2)
# eparams['folds'] = 'A'

In [None]:
# Pull one job (a JSON-encoded parameter dict) off the queue. The message stays
# invisible to other consumers for 90 minutes; `rh` is its receipt handle.
# Failures are reported but do not stop the notebook (best effort, as before).
try:
    resp = sqs.receive_message(QueueUrl = queue_url, WaitTimeSeconds = 3,
                                VisibilityTimeout = 90 * 60,                            
                              )
    msgs = resp.get('Messages', [])
    if msgs:
        msg = msgs[0]
        rh, body = [msg[e] for e in ['ReceiptHandle', 'Body']]
        eparams = json.loads(body)
    else:
        # receive_message omits 'Messages' when there is nothing to deliver; say so
        # explicitly instead of printing a bare KeyError.
        print('No messages available on queue {!r}; eparams not updated'.format(QUEUE_NAME))
except Exception as e:
    print(e)

In [None]:
# Work on a shallow copy so the dict decoded from the queue message stays untouched.
params = eparams.copy()
display(params)


In [None]:
# params = results[0]['params']

In [None]:

# Fold index and seed are mandatory in the job; the fold count defaults to 3.
FOLD = params['fold']
SEED = params['seed']
N_FOLDS = params.get('n_folds', 3)

In [None]:
def prep_metadata(defog_metadata, tdcsfog_metadata, daily_metadata, subjects, full = True,
                    expanded = True):
    """Build the per-recording metadata table used as model input.

    defog and tdcsfog metadata are each merged with `subjects` and concatenated
    (the daily table is prepared separately and not included in the
    concatenation). Medication and Sex are turned into 0/1 indicators.

    Args:
        defog_metadata, tdcsfog_metadata, daily_metadata, subjects: DataFrames
            as loaded from the corresponding CSV files.
        full: when True, add null indicators for the UPDRSIII columns, fill
            their nulls with the column mean, add 'Test_Nonzero', standardise
            every column after the first one, clip to [-3, 3] and cast the
            result to float32.
        expanded: when True, add 'num_tests', 'max_visit', 'visit_medications'
            and 'UPDRS_On_vs_Off' columns.

    Returns:
        (metadata, msubject, m3): the metadata frame indexed by Id with the
        Subject column removed, the removed Subject series, and the daily
        metadata frame indexed by Id and padded to the same number of columns.
    """
    m1 = defog_metadata.copy()
    # defog has no 'Test' column; insert a constant 0 at position 3.
    m1.insert(3, 'Test', 0)
    m1 = m1.merge(subjects, on = ['Subject', 'Visit', ], how = 'inner')
    # The inner merge must neither drop nor duplicate rows.
    assert len(m1) == len(defog_metadata)

    m2 = tdcsfog_metadata.copy()
    # tdcsfog is joined on Subject only, so the subjects' Visit column is dropped first.
    m2 = m2.merge(subjects.drop(columns = 'Visit'), on = ['Subject', ], how = 'inner')
    assert len(m2) == len(tdcsfog_metadata)

    m3 = daily_metadata.copy()
    m3 = m3.merge(subjects, on = ['Subject', 'Visit' ], how = 'inner')
    m3.insert(3, 'Test', 0)
    # Daily recordings get the share of 'on' rows in defog_metadata as Medication value.
    m3.insert(4, 'Medication', (defog_metadata.Medication == 'on').mean())
    m3.drop(columns = [c for c in m3.columns if 'recording' in c], inplace = True)
    assert len(m3) == len(daily_metadata)

    metadata = pd.concat([m1, m2, #m3
                            ], axis = 0)
    metadata.Medication = 1 * (metadata.Medication == 'on')
    metadata.Sex = 1 * (metadata.Sex == 'M')
    
    if expanded:
        # Per-subject / per-visit aggregates broadcast back onto every row.
        metadata['num_tests'] = metadata.groupby('Subject').transform(lambda x: x.nunique()).Test
        metadata['max_visit'] = metadata.groupby('Subject').transform(lambda x: x.max()).Visit
        metadata['visit_medications'] = metadata.groupby(['Subject', 'Visit']).transform('nunique').Medication 
        metadata['UPDRS_On_vs_Off'] = metadata.UPDRSIII_On - metadata.UPDRSIII_Off
        # add 4 columns to dmetadata
    

    if full:
        # null fix
        # Record which UPDRSIII values were missing before filling them with the mean.
        metadata['Uon_null'] = 1 * (metadata.UPDRSIII_On.isnull())
        metadata['Uoff_null'] = 1 * (metadata.UPDRSIII_Off.isnull())
        metadata.UPDRSIII_On = metadata.UPDRSIII_On.fillna(metadata.UPDRSIII_On.mean())
        metadata.UPDRSIII_Off = metadata.UPDRSIII_Off.fillna(metadata.UPDRSIII_Off.mean())
        if expanded:
            metadata.UPDRS_On_vs_Off = metadata.UPDRS_On_vs_Off.fillna(metadata.UPDRS_On_vs_Off.mean())   

    metadata.set_index('Id', inplace = True)
    
    if full:
        metadata['Test_Nonzero'] = 1. * (metadata.Test > 0)
        # for i in range(1):
        #     metadata['Test{}'.format(i)] = metadata.Test == i
        # Column 0 is skipped here and dropped as 'Subject' below; the remaining
        # columns are z-scored and clipped to [-3, 3].
        metadata.iloc[:, 1:] = metadata.iloc[:, 1:].astype(np.float32)
        metadata.iloc[:, 1:] = (metadata.iloc[:, 1:] - metadata.iloc[:, 1:].mean(0)) / metadata.iloc[:, 1:].std(0)  
        metadata.iloc[:, 1:] = metadata.iloc[:, 1:].clip(-3, 3)

    # Keep the subject id per recording separately; it is not a model feature.
    msubject = metadata.Subject
    metadata.drop(columns = 'Subject', inplace = True)
    if full:
        metadata = metadata.astype(np.float32)
    
    m3 = m3.set_index('Id')
    # Pad the daily table with zero-filled 'dummy_<i>' columns until it has as many
    # columns as `metadata`.
    assert m3.shape[1] <= metadata.shape[1]; i = 0
    while m3.shape[1] < metadata.shape[1]:
        m3.insert(m3.shape[1], 'dummy_{}'.format(i), 0); i += 1
    # The last column is set to the minimum of metadata's last column.
    m3.iloc[:, -1] = metadata.iloc[:, -1].min() # yes, hack, for default_metadata in dataset.py 

    return metadata, msubject, m3

In [None]:
# Build the recording-level metadata, the subject lookup and the padded daily metadata.
metadata, msubject, dmetadata = prep_metadata(defog_metadata, tdcsfog_metadata, daily_metadata, subjects,
                                   expanded = params['expanded'])

In [None]:
# Both metadata tables must expose the same number of feature columns.
assert metadata.shape[1] == dmetadata.shape[1]

In [None]:
# Parameters whose key starts with '0x' describe per-subject down-sampling:
# the key without the '0x' prefix is compared against subject ids and the value
# is a two-character string of digits.
downsample = {k[2:]: v for k, v in params.items() if k.startswith('0x')}
display(downsample)

drop_ids = []; id_frac = {}
for k, v in downsample.items():
    print(k, v)
    # All recordings of subject k.
    df = metadata[msubject.loc[metadata.index] == k]
    # First digit d: a random share 1 - 1/d of the subject's recordings is dropped.
    ids = df.sample(frac = 1 - 1 / int(v[0]), random_state = SEED).index.tolist()
    drop_ids.extend(ids); print(ids)

    # Second digit d: every recording of the subject is assigned the fraction 1/d.
    ids = {i: 1 / int(v[1]) for i in df.index}
    id_frac.update(ids); print(ids)
    print()
# common_
# COMMON = ['2d57c2', 'e86b6e'][:params['sans']]; print(COMMON)
# common_events = msubject.reindex(events.Id).isin(COMMON ).values 
# common_events.sum()

In [None]:
# # test_downsample = {k[2:]: v for k, v in params.items() if k.startswith('0x')}
# # display(downsample)

# test_id_frac = {}
# for k, v in {'2d57c2': '18', 'e86b6e': '12'}.items():
#     print(k, v)
#     df = metadata[msubject.loc[metadata.index] == k]
#     ids = {i: 1 / int(v[1]) for i in df.index}
#     test_id_frac.update(ids); print(ids)
#     print()
# # test_id_frac

In [None]:
# Boolean mask of events that belong to dropped recordings, and its count.
drop_events = events.Id.isin(drop_ids)
print(drop_events.sum())
# Per-event fraction looked up by recording id (1 where none is configured),
# followed by the number of events with a fraction below 1.
event_frac = events.Id.map(id_frac).fillna(1)
print((event_frac < 1).sum())

In [None]:
# Add 'Length' (Completion - Init) and a length bin 'Q' to `events`; Q is later
# used when building the stratification labels. `random` is seeded with SEED
# while the bin edges are drawn and re-seeded from the clock afterwards.
if params['folds'] in ['A', 'D']:
    # pd cut -- size
    events['Length'] = (events.Completion - events.Init)
    random.seed(SEED)
    # Bin edges are powers of a base (10 for 'A', a random integer in 5..15 for 'D'),
    # applied per event Type; lengths outside every bin get Q = 0.
    events['Q'] = events.groupby('Type')['Length'].transform( 
        lambda x: pd.cut(x, (10 if params['folds'] == 'A'
                                else random.randint(5, 15)
                             ) ** np.arange(0, 10), labels = False)).fillna(0)
    assert events.Q.isnull().sum() == 0
    random.seed(datetime.datetime.now().microsecond)
    display(events.groupby(['Type', 'Q']).Length.agg(['mean', 'count', 'sum']))

elif params['folds'] in ['B', 'C']:
    # pd cut -- binary
    events['Length'] = (events.Completion - events.Init) #* (~drop_events) * event_frac
    random.seed(SEED)
    # Two bins split at a random threshold drawn from [5, 15).
    events['Q'] = events['Length'].transform( 
        lambda x: pd.cut(x, [-1, 5 + 10 * random.random(), 1000000], labels = False) )#.fillna(0)
    assert events.Q.isnull().sum() == 0
    random.seed(datetime.datetime.now().microsecond)
    display(events.groupby(['Type', 'Q']).Length.agg(['mean', 'count', 'sum']))

In [None]:
# Build `etable`: a Series of integer stratification labels indexed by recording id.
ev = sorted(events.Type.dropna().unique())[::]; print(ev)
# Keep typed, non-'Turn' events longer than 0.5 that are not in dropped recordings.
f = ~events.Type.isnull() & (events.Type != 'Turn') & (events.Length > 0.5) & ~drop_events
ef = events[f].set_index('Id')
etables = []
# Label per event: 100 * (1-based index of the event type) + 10 * length bin.
s = ef.Type.map(dict(zip(ev, range(1, 1 + len(ev))))) * 100 + 10 * ef.Q

# if params['folds'] in ['B', 'C', ]:
# For folds 'B'/'C'/'D' each event is repeated according to its rounded length
# (square-rooted for 'C'/'D'); otherwise it is entered once.
for t, v in zip(ef.itertuples(), s):
    etables.extend([(t.Index, v)] * (
        int(round(t.Length) ** (0.5 if params['folds'] in ['C', 'D'] else 1))
                if params['folds'] in ['B', 'C', 'D'] else 1
                ))   

# zip(*etables) gives (ids, labels); reversed so labels become the data and ids the index.
etable = pd.Series(*list(zip(*etables))[::-1])
# Recordings without any retained event are added with label 0.
etable = pd.concat((etable, #*([etable[etable % 100 > 0]] * 3), *([etable[etable % 100  > 10]] * 10), 
                    pd.Series(0, list(set(metadata.index) - set(etable.index)))))
# Add 1 for recordings listed in tdcsfog_metadata so the two data sets get distinct labels.
etable = (etable + 1 * etable.index.isin(tdcsfog_metadata.Id)).astype(int)
# Subject of every etable row, used as the grouping key for the split.
esubject = msubject.reindex(etable.index)
etable.value_counts()


In [None]:
from sklearn.model_selection import StratifiedGroupKFold

# Stratify on the etable labels while keeping all rows of a subject in one fold.
folds = list(StratifiedGroupKFold(n_splits = N_FOLDS, 
                                shuffle = True, random_state = SEED
        ).split(np.zeros(len(etable)), etable, groups = esubject))
train_fold, test_fold = folds[FOLD]
train_ids = etable.iloc[train_fold].index
test_ids = etable.iloc[test_fold].index
# No subject may appear on both sides of the split.
assert set(msubject.loc[train_ids]) & set(msubject.loc[test_ids]) == set()


In [None]:
# Row counts of the two sides of the split (etable rows, so ids can repeat).
print(len(train_ids), len(test_ids))

# Distinct subjects on each side; they must be disjoint.
train_subjects = msubject.loc[train_ids].unique()
test_subjects = msubject.loc[test_ids].unique()
assert set(train_subjects) & set(test_subjects) == set()
print(len(train_subjects), len(test_subjects)) 

# ( etable[etable.index.isin(train_ids)].value_counts(), 
#  etable[etable.index.isin(test_ids)].value_counts())

In [None]:
# Daily recordings follow the subject split: subjects in the training fold go to
# train, every other subject (including ones absent from the split) goes to test.
_in_train = daily_metadata.Subject.isin(list(train_subjects))
train_df = daily_metadata[_in_train]
test_df = daily_metadata[~_in_train]

train_daily_ids = train_df.Id.tolist()
test_daily_ids = test_df.Id.tolist()
train_daily_subjects = train_df.Subject
test_daily_subjects = test_df.Subject
assert set(train_daily_ids) & set(test_daily_ids) == set()
assert set(train_daily_subjects) & set(test_daily_subjects) == set()
print(len(train_daily_ids), len(test_daily_ids))
print(len(train_daily_subjects), len(test_daily_subjects))

In [None]:
# Subjects that have daily recordings and an NFOGQ of 0 must not be among the
# training daily subjects.
s2 = subjects[subjects.Subject.isin(daily_metadata.Subject) & (subjects.NFOGQ == 0)].Subject.unique()
assert len(set(s2) & set(train_daily_subjects)) == 0

In [None]:
# Labeled recordings: names containing both 'train/' and 'fog'.
fog_files = [name for name in files if 'train/' in name and 'fog' in name]
print(len(fog_files))

# Unlabeled recordings: names containing 'unlabeled/'.
unlabeled_files = [name for name in files if 'unlabeled/' in name]
print(len(unlabeled_files))

In [None]:
# Map every task name (alphabetical order) to a 1-based integer code.
_task_names = sorted(tasks.Task.unique())
task_dict = dict(zip(_task_names, np.arange(1, 1 + len(_task_names))))
len(task_dict)
# task_dict

In [None]:
# load and process data
def load(f):
    """Return the cached array for data file `f`, read from
    DATA_PATH + 'cache/' + f + '.npy'."""
    npy_path = DATA_PATH + 'cache/' + f + '.npy'
    return np.load(npy_path)

def process(f):
    """Convert one labeled CSV recording into a cached float32 array.

    The output has one row per sample and 12 columns: the CSV columns after
    'Time' in columns 0.., defaults of 1 in columns 6-7 when the CSV lacks
    'Valid'/'Task', event flags in columns 8-10 and the task code in column 11.
    The array is saved to DATA_PATH + 'cache/' + f + '.npy'; an existing
    non-empty cache file is reused.

    Returns:
        (shape, size): the array shape and the cache file size in bytes.
    """
    # if exists, return stats
    cache_file = DATA_PATH + 'cache/' + f + '.npy'
    if os.path.exists(cache_file) and os.path.getsize(cache_file) > 0: 
        return load(f).shape, os.path.getsize(cache_file)
    
    # if not, load array,
    df = pd.read_csv(getFile(f, ), )

    # verify
    # Time must simply count rows, and the columns must be a prefix of the expected layout.
    assert (df.Time == df.index).all()
    assert all(df.columns == ['Time',
                           'AccV', 'AccML', 'AccAP',
                           'StartHesitation', 'Turn', 'Walking',
                            'Valid', 'Task'][:len(df.columns)])
    assert len(df.columns) in [7, 9]
    
    v = np.zeros((len(df), 12), dtype = np.float32)
    # Columns 6-7 default to 1; they are overwritten below when the CSV has 9 columns.
    v[:, 6:8] = 1
    v[:, :df.shape[1] - 1] = df.iloc[:, 1:]

    fid = f.split('/')[-1].split('.')[0]
    # Multiplier converting event/task times into row indices: 128 for files with
    # 'tdcs' in the name, 100 otherwise.
    mult = 100 if 'tdcs' not in f else 128
    for e in events[events.Id == fid].itertuples():
        # Column 8: inside any event; 9: the event's Kinetic value; 10: 1 - Kinetic.
        v[int(round(e.Init * mult)): int(round(e.Completion * mult)), 8] = 1
        v[int(round(e.Init * mult)): int(round(e.Completion * mult)), 9] = e.Kinetic
        v[int(round(e.Init * mult)): int(round(e.Completion * mult)), 10] = 1 - e.Kinetic
    for e in tasks[tasks.Id == fid].itertuples():
        # Column 11: integer task code (0 where no task is annotated).
        v[int(round(e.Begin * mult)): int(round(e.End * mult)), 11] = task_dict[e.Task]

    # store as compresssed;
    assert v.dtype == np.float32
    # zc = zstd.ZstdCompressor()
    # compr = zc.compress(pickle.dumps(v))
    os.makedirs(os.path.dirname(cache_file), exist_ok = True)
    np.save(cache_file, v)
    # with open(cache_file, 'wb') as f:
    #     f.write(compr)

    return v.shape, os.path.getsize(cache_file)#len(compr)


In [None]:
from collections import defaultdict
sz = 0
# Names (key basename up to the first '.') of all objects under <PREFIX>epreds/ in BUCKET.
epreds = []#defaultdict(list)
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket = BUCKET, Prefix = PREFIX + 'epreds/'):
    # A page has no 'Contents' entry when nothing matches the prefix.
    for obj in page.get('Contents', []):
        epreds.append(obj['Key'].split('/')[-1].split('.')[0])
len(epreds)

In [None]:
def clip_ereds(x):
    """Shift the last column down by a baseline and clip everything at 0.

    The baseline is the 15% quantile of a centred 200-sample rolling mean of the
    last column, but never less than 0.2. After subtracting it from the last
    column, all values of the array are clipped to be >= 0.

    Args:
        x: 2-D NumPy array; it is not modified.

    Returns:
        A new array of the same shape and dtype.
    """
    # Work on a copy so the caller's array is left untouched.
    x = x.copy()
    # x = np.concatenate([x, np.zeros((x.shape[0], 1))], 1)
    savg = pd.Series(x[:, -1]).rolling(200, center = True, min_periods = 1).mean()
    x[:, -1:] -= max(0.2, np.quantile(savg, 0.15))
    x = x.clip(0, None) #+ 0.01
    return x


In [None]:
%%time
# Visual spot check: keep loading randomly chosen prediction files until one is
# accepted, then plot its first 5000 rows. A file is accepted with probability
# srate = 10 * mean of its clipped last column, so the loop only ends once a
# file with a non-zero mean has been drawn.
while True:
    x = pickle.loads(zstd.decompress(
        s3.get_object(Bucket = BUCKET, 
            Key = PREFIX + 'epreds/' + random.choice(epreds) + '.pkl.zstd')['Body'].read()
    ))
    x = clip_ereds(x)
    srate = 10 * x[:, -1].mean()
    
    if random.random() > srate: continue;    
    print(100 * srate); 
    plt.plot(x[:5000, ], alpha = 0.5, linewidth = 2);
    plt.ylim(0, 1.)
    break;


In [None]:
def load_epreds(ep):
    s3 = boto3.client('s3')
    return pickle.loads(zstd.decompress(
        s3.get_object(Bucket = DATA_BUCKET, 
            Key = PREFIX + 'epreds/' + random.choice(epreds) + '.pkl.zstd')['Body'].read()
    ))

In [None]:
DAILY_SPLIT = 100000 #if not RELABEL else 50000

def processDaily(f, relabel = False, ):
    cache_file = DATA_PATH + 'cache/{}_{:05d}'.format(f, 100) + '.npy'
    if os.path.exists(cache_file) and os.path.getsize(cache_file) > 0: 
        return os.path.getsize(cache_file)
    
    df = pd.read_parquet(getFile(f), columns  = ['AccV', 'AccML', 'AccAP']
                                ).astype(np.float32).round(3)
    os.remove(DATA_PATH + 'data/' + f + '.zstd')
    
    assert all(df.columns == ['AccV', 'AccML', 'AccAP',])
    assert df[::100].std().mean() > 0.1
    v = df.values

    maxlen = DAILY_SPLIT
    zc = zstd.ZstdCompressor()
    os.makedirs(os.path.dirname(cache_file), exist_ok = True)
    klast = None; plast = None    
    for i in range(0, math.ceil(len(v) / maxlen)):
        vsplit = v[i * maxlen:(i + 1) * maxlen]
        if relabel:
            k = '{}_{:05d}'.format(unlabeled_files[0].split('/')[-1].split('.')[0], i // 10) 
            if k != klast:
                # print(k)
                labels = load_epreds(k)
                labels = clip_ereds(labels)
                labels = cv2.resize(labels, None, fx = 1, fy = 10)
                klast = k; plast = labels
            vsave = np.concatenate([vsplit, plast[i % 10 * (maxlen)
                                                    : (i % 10 + 1) * (maxlen )]
                                                [:len(vsplit)]], 1)
            # print(vsave.shape, vsave.std(0)[3:])
        cache_file = DATA_PATH + 'cache/{}_{:05d}'.format(f, i) + '.npy'
        # compr = zc.compress(pickle.dumps(vsave))
        np.save(cache_file, vsave.astype(np.float16))

        # with open(cache_file, 'wb') as fc:
            # fc.write(compr)
    return v.shape

In [None]:
def loadDaily(f, i0, i1, verbose = False):
    """Load the cached chunks of an unlabeled recording that cover rows [i0, i1).

    `f` is normalised to 'unlabeled/<name>.parquet' (a 'cache/' part is removed,
    the 'unlabeled/' prefix and '.parquet' suffix are added when missing) and the
    chunk files DATA_PATH + 'cache/<f>_<index, 5 digits>.npy' are read for every
    chunk index touched by the row range. Reading stops at the first missing chunk.

    Args:
        f: recording name or cache key.
        i0, i1: row range; only used to select chunk indices — whole chunks are
            returned, not the exact row slice.
        verbose: print every chunk path that is considered.

    Returns:
        float32 array: the selected chunks concatenated along the first axis.

    Raises:
        ValueError: (from np.concatenate) when none of the chunks exist.
    """
    vs = []
    if 'unlabeled/' not in f: f = 'unlabeled/' + f
    if 'cache/' in f: f = f.replace('cache/', '')  
    if '.parquet' not in f: f = f + '.parquet'
    cmin, cmax = i0 // DAILY_SPLIT, (i1 - 1) // DAILY_SPLIT
    for i in range(cmin, cmax + 1):
        cache_file = DATA_PATH + 'cache/{}_{:05d}'.format(f, i) + '.npy'
        if verbose: print(cache_file)
        if not os.path.exists(cache_file): break
        # np.load opens the file by path itself; no separate open() is needed.
        vs.append(np.load(cache_file))
    v = (vs[0] if len(vs) == 1 else np.concatenate(vs)).astype(np.float32)
    return v

In [None]:
# !rm -r /data/cache

In [None]:
# %%timeit -n 1
# loadDaily(unlabeled_files[0], 0, 100000).shape

In [None]:
# %%timeit -n 10
# loadDaily(unlabeled_files[0], 0, 1000)

In [None]:
# # processDaily(unlabeled_files[0], RELABEL)
# x = loadDaily(unlabeled_files[0], 2100000, 2190000)
# plt.plot(x[:, :3], alpha = 0.5, linewidth = 0.3);
# plt.plot(x[:, 3:] * 5);

In [None]:
# # compile dailies
# # paginator for s3, list_objects_v2, call
# from collections import defaultdict
# sz = 0
# label_objs = defaultdict(list)
# paginator = s3.get_paginator('list_objects_v2')
# for page in paginator.paginate(Bucket = BUCKET, Prefix = PREFIX + 'spreds/'):
#     for obj in page['Contents']:
#         if (obj['Key'].endswith('.pkl.zstd')
#             and obj['LastModified'] > #datetime.datetime(2023, 6, 8, 12, 0)):
#                 datetime.datetime(2023, 6, 8, 13, 0, tzinfo=datetime.timezone.utc)):
#             label_objs[obj['Key'].split('/')[-1].split('.')[0]].append(obj['Key'])
#             sz += obj['Size']
#         else:
#             # delete
#             # s3.delete_object(Bucket = BUCKET, Key = obj['Key'])
#             print(obj['LastModified'])#x')
#     # break;

In [None]:
# lens = [len(v) for v in label_objs.values()]
# len(lens), sum(lens), max(lens), min(lens), np.mean(lens), np.median(lens)

In [None]:
# max([len(set([e.split('/')[-2] for e in o]))
#         for o in label_objs.values()])

In [None]:
# full_groups = [(k, v) for k, v in label_objs.items() 
#                if len(v) >= 72/4
#                and k not in epreds
#                ]
# len(full_groups)

In [None]:
# def load_obj(k):
#     s3 = boto3.client('s3')
#     return pickle.loads(zstd.decompress(
#         s3.get_object(Bucket = BUCKET, Key = k)['Body'].read()
#     ))

# def compile_preds(k, v):
#     s3 = boto3.client('s3')
#     r = [load_obj(k) for k in v]
#     assert len(r) >= 72/4
#     avg = np.stack(r).mean(0).round(3)
#     s3.put_object(Bucket = BUCKET, Key = PREFIX + 'epreds/' + k + '.pkl.zstd',
#                 Body = zstd.compress(pickle.dumps(avg)))
#     return len(r)

In [None]:
# r = Parallel(n_jobs = os.cpu_count() * 3)(
#     delayed(compile_preds)(k, v) for k, v in full_groups[:])

In [None]:
# !sudo apt-get install zip
# !zip -r backup/backup-inprogress2.zip *.py *.ipynb

In [None]:
%%time
# process cache
r = Parallel(n_jobs = 3)(delayed(processDaily)(f, RELABEL) 
    for f in unlabeled_files 
    if any([z in f for z in ( train_daily_ids if RELABEL
            else random.sample(train_daily_ids, k = min(10, len(train_daily_ids))) )
                             + test_daily_ids])
    ) 

In [None]:
# get the last file (sorted), based on prefix prior to ., for a list
uc = 'cache/unlabeled/'
f = sorted([f for f in os.listdir(DATA_PATH + uc)
                        # if any([z in f for z in daily_ids])
                ], )#key = lambda x: x.split('.')[0].split('_')[-1])
# ucount maps 'cache/unlabeled/<id>' to (chunk index + 1) * DAILY_SPLIT. Files are
# visited in sorted order, so the highest chunk index of each recording wins and
# the value is an upper bound on its row count.
ucount = {}
for e in f: ucount[uc + e.split('.')[0]] = (int(e.split('_')[-1].split('.')[0]) + 1) * DAILY_SPLIT
if RELABEL: assert len(ucount) == len(unlabeled_files) 
# Total (upper-bound) row count in millions.
sum(ucount.values()) / 1e6

In [None]:
# Disk usage of the unlabeled cache.
!du -sh {DATA_PATH}cache/unlabeled*

In [None]:

# Run process() over every labeled recording in parallel, one job per CPU.
r = Parallel(os.cpu_count())(delayed(process)(path) for path in fog_files[:])

# Rows per file: first entry of the shape returned by process().
lcount = {path: shape[0] for path, (shape, _) in zip(fog_files, r)}
# Total rows, in millions, for each of the two labeled data sets.
[sum(rows for path, rows in lcount.items() if tag in path) / 1e6
        for tag in ('/defog/', '/tdcsfog/')]

In [None]:
# Disk usage of the labeled (train) cache.
!du -sh {DATA_PATH}cache/train*

In [None]:
# Patch size and sequence length for this run, with defaults when absent.
PATCH = params.get('patch', 12)
SEQ = params.get('seq', 256)
# Displayed value: PATCH * SEQ / 100 (presumably the window length in seconds at 100 Hz — TODO confirm).
PATCH * SEQ / 100

In [None]:
# -i executes dataset.py inside this notebook's namespace, so it sees the
# variables defined above and its definitions become available here
# (WalkDataset and getParams below are presumably defined there — TODO confirm).
%run -i dataset.py

In [None]:
# Labeled training set: files whose id is in the training fold and not dropped.
train_data = WalkDataset(    
    {k: v for k, v in lcount.items()
                if k.split('/')[-1].split('.')[0] in train_ids
                and k.split('/')[-1].split('.')[0] not in drop_ids
                }, 
                metadata, load, loadDaily, -1, id_frac = id_frac,
                test = False, **getParams(WalkDataset, params))
# Keep a handle on the labeled-only training set; train_data is rebound later.
pure_train_data = train_data

# Labeled validation set: files whose id is in the test fold.
test_data = WalkDataset(
    {k: v for k, v in lcount.items()
                if k.split('/')[-1].split('.')[0] in test_ids},
                  metadata, load, loadDaily, -1,
                  test = True,# id_frac = test_id_frac,
                  **getParams(WalkDataset, params))
# Unlabeled (daily) recordings of training subjects; uses the padded daily
# metadata when RELABEL is on.
train_daily_data = WalkDataset(
    {k: v for k, v in ucount.items()
                if k.split('/')[-1] in train_daily_ids},
                dmetadata if RELABEL else metadata, load, loadDaily, DAILY_SPLIT,
                test = False,
                **getParams(WalkDataset, params))
# Unlabeled (daily) recordings of the remaining subjects.
# NOTE(review): this one always receives `metadata`, unlike train_daily_data — confirm intended.
test_daily_data = WalkDataset(
    {k: v for k, v in ucount.items()
                if k.split('/')[-1] in test_daily_ids},
                metadata, load, loadDaily, DAILY_SPLIT,
                test = True,
                **getParams(WalkDataset, params)) 

len(train_data), len(test_data), len(train_daily_data), len(test_daily_data)

In [None]:
class ComboDataset(Dataset):
    ''' Combines two datasets: all of the first one plus a random subset of the
        second one, whose size is given as a fraction of the first one's length.

        Indices 0 .. len(d1) - 1 map to d1; the remaining indices map to items of
        d2 chosen at random once, at construction time.

        d1, d2:   indexable objects supporting len()
        d2_frac:  size of the d2 subset as a fraction of len(d1)
    '''
    def __init__(self, d1, d2, d2_frac = 0.5):
        self.d1 = d1
        self.d2 = d2
        self.d2_frac = d2_frac
        self.d1_len = len(d1)
        self.d2_len = int(self.d1_len * d2_frac)
        self.len = self.d1_len + self.d2_len
        # Draw distinct items whenever d2 is large enough; only when more items
        # are requested than d2 holds, fall back to sampling with replacement
        # (sampling without replacement would raise in that case).
        self.d2_idxs = np.random.choice(len(d2), self.d2_len,
                                        replace = self.d2_len > len(d2))

    def __len__(self):
        return self.len
    
    def __getitem__(self, idx):
        if idx < self.d1_len:
            return self.d1[idx]
        else:
            return self.d2[self.d2_idxs[idx - self.d1_len]]
        

In [None]:
class CosineMixDataset(Dataset):
    ''' Mixes two datasets with a cosine schedule over the index.

        Item idx is taken from datasetA with probability
        0.5 * (1 + cos(pi * min(1, idx / n))) and from datasetB otherwise, so the
        chance of A falls from 1 at idx 0 to 0 at idx >= n. Each dataset is read
        through its own permutation (shuffled once at construction), wrapping
        around when idx exceeds its length.

        n: reported length; defaults to the longer of the two datasets.
    '''
    def __init__(self, datasetA, datasetB, n = None):
        self.datasetA, self.datasetB = datasetA, datasetB
        self.n = n if n else max(len(datasetA), len(datasetB))
        # Independent random visiting orders for the two sources.
        self.aidxs = np.arange(len(datasetA))
        self.bidxs = np.arange(len(datasetB))
        np.random.shuffle(self.aidxs)
        np.random.shuffle(self.bidxs)

    def __len__(self):
        return self.n
    
    def __getitem__(self, idx):
        progress = min(1, idx / self.n)
        prob_a = 0.5 * (1 + np.cos(progress * np.pi))
        take_a = random.random() < prob_a
        source = self.datasetA if take_a else self.datasetB
        order = self.aidxs if take_a else self.bidxs
        return source[order[idx % len(source)]]

In [None]:
# Final training set. With RELABEL: a cosine-scheduled mix that starts on the
# daily data and ends on the labeled data, with one item per training sample
# (batch_size * steps). Otherwise: the labeled data plus daily items amounting
# to 20% of its size.
if RELABEL:
    train_data = CosineMixDataset(train_daily_data, pure_train_data,
                                    params['batch_size'] * params['steps']) 

else:
    train_data = ComboDataset(train_data, train_daily_data, 0.2)

len(train_data)

In [None]:
# Smoke test: fetch one random item and unpack its first four fields.
x, y, s, frac = train_data[random.choice(range(len(train_data)))][:4]


In [None]:
# -i executes model.py inside this notebook's namespace
# (WalkModule and `device` used below are presumably defined there — TODO confirm).
%run -i model.py

In [None]:
# Instantiate the model from the run parameters and move it to the target device.
model = WalkModule(params, **getParams(WalkModule, params)).to(device)

In [None]:
# Smoke test: one forward and backward pass on a batch of 32 random items.
print(params)
model.train()
from torch.utils.data._utils.collate import default_collate
# Note: this rebinds the notebook-level names x, y, s, frac, m, f, i and flen.
x, y, s, frac, m, f, i, flen = default_collate(
    [train_data[random.choice(range(len(train_data)))] for i in range(32)])

yp = model(x.to(device),
            m.to(device),
            frac.to(device),
            flen.to(device),
            # adjust = False
            )[0]
# The mean of the first output stands in for a loss, only to exercise backward().
loss = yp.mean()
loss.backward()
print(yp.mean().item(), yp.shape)

In [None]:
# Show every param whose name contains both 'wt' and an underscore.
{k: v for k,v in params.items() if ('wt' in k) and '_' in k}

In [None]:
# Show params whose name contains 'se', 'frac', 'adj' or 'm_' (and an underscore).
{k: v for k,v in params.items() if ('se' in k or 'frac' in k or 'adj' in k or 'm_' in k) and '_' in k}

In [None]:
n_workers = os.cpu_count()

# RELABEL keeps the dataset order (no shuffling); CosineMixDataset picks its
# source from the sample index, so the order carries the mixing schedule.
train_loader = DataLoader(train_data,
                          batch_size = params['batch_size'],
                          shuffle = not RELABEL,
                          drop_last = True,
                          num_workers = n_workers)

eval_loader_kwargs = dict(batch_size = 32, num_workers = n_workers)
test_loader = DataLoader(test_data, **eval_loader_kwargs)
test_daily_loader = DataLoader(test_daily_data, **eval_loader_kwargs)

len(train_loader), len(test_loader), len(test_daily_loader)

In [None]:
# RELABEL: exactly one pass over the loader. Otherwise: step_mult times the
# geometric mean of the sample count and the batch count.
model.params.steps = (
    len(train_loader) if RELABEL
    else int(params['step_mult'] * (len(train_data) * len(train_loader)) ** 0.5)
)

model.params.steps

In [None]:
# Random 6-hex-character id for this run; it names the S3 objects written below.
m = secrets.token_hex(3)

# Step-based training: validation is scheduled at step `steps`, which is also
# max_steps. The progress bar refreshes every 5 steps.
trainer = pl.Trainer(
    accelerator = 'auto',
    logger = False,
    precision = 16,
    gradient_clip_val = 1,
    enable_checkpointing = False,
    val_check_interval = model.params.steps,
    check_val_every_n_epoch = None,
    max_steps = model.params.steps,
    callbacks = [pl.callbacks.TQDMProgressBar(refresh_rate = 5)],
)


In [None]:
# Train on train_loader; test_loader is passed as the validation dataloader.
trainer.fit(model, train_loader, test_loader) 

In [None]:
N_TARGETS = 4
STORE_SUBSAMPLE = 10
STORE_SPLIT = 100000

from collections import defaultdict

def stack_store(yps, fs, idxs):
    '''
    Concatenate per-batch buffers into flat numpy arrays.

    yps / idxs are lists of tensors and fs a list of per-batch sequences;
    all three are joined along their first axis. Channel 3 of the predictions
    is replaced by the mean of channels 3 and 4, then only the first
    N_TARGETS channels are kept.

    Returns (predictions, files, start_indices), all of the same length.
    '''
    files = np.concatenate(fs)
    starts = torch.cat(idxs).cpu().numpy()
    preds = torch.cat(yps).cpu().numpy()
    preds[..., 3] = preds[..., 3:5].mean(-1)
    preds = preds[..., :N_TARGETS]

    n_rows = len(preds)
    assert len(files) == n_rows and len(starts) == n_rows
    return preds, files, starts

In [None]:
def process_store(_yps, _fs, _idxs, pred_dict, ct_dict, age_dict):
    '''
    Accumulate stacked predictions into per-(file, chunk) buffers.

    Row i of `_yps` covers positions _idxs[i] .. _idxs[i] + len(_yps[i]) - 1
    of file _fs[i]. Positions are grouped into chunks of STORE_SPLIT rows,
    keyed by (file, chunk number).

    Args:
        _yps, _fs, _idxs: stacked predictions, file names and start positions.
        pred_dict: key -> running sum of predictions, (STORE_SPLIT, N_TARGETS).
        ct_dict: key -> number of predictions added per cell, same shape.
        age_dict: key -> age counter, reset to 0 for every chunk touched here.

    All three dicts are updated in place; nothing is returned.
    '''
    for i in range(len(_yps)):
        ridxs = np.arange(_idxs[i], _idxs[i] + len(_yps[i]))
        chunk_ids = ridxs // STORE_SPLIT
        for ig in np.unique(chunk_ids):
            k = (_fs[i], ig)
            in_chunk = chunk_ids == ig
            aidx = ridxs[in_chunk] % STORE_SPLIT
            # Allocate only for new keys: setdefault() would build (and throw
            # away) a full STORE_SPLIT x N_TARGETS array on every iteration.
            if k not in pred_dict:
                pred_dict[k] = np.zeros((STORE_SPLIT, N_TARGETS), dtype = np.float32)
            if k not in ct_dict:
                ct_dict[k] = np.zeros((STORE_SPLIT, N_TARGETS), dtype = np.float32)
            pred_dict[k][aidx] += _yps[i][in_chunk]
            ct_dict[k][aidx] += 1
            age_dict[k] = 0

In [None]:
from threading import Thread

def s3_push(b, k, body):
    '''Upload `body` to bucket `b` under key `k` using the module-level s3 client.'''
    s3.put_object(Bucket = b,
                  Key = k,
                  Body = body)
    
def flush_store(pred_dict, ct_dict, age_dict,
                min_coverage = 0.1, flush_age = 10, drop_age = 20):
    '''
    Upload finished prediction chunks to S3 and age out the rest.

    A chunk is uploaded (and removed from the dicts) when every cell has at
    least one prediction, or when at least `min_coverage` of its cells do and
    it has gone more than `flush_age` calls without being touched. Any other
    chunk ages by one per call and is discarded, without upload, once its age
    exceeds `drop_age`. process_store() resets the age of chunks it writes to.

    Previously the age was only incremented inside a branch that required it
    to already exceed 20, so it never left 0 and partly covered chunks were
    neither uploaded nor dropped.

    Args:
        pred_dict, ct_dict, age_dict: the stores filled by process_store();
            all three are modified in place.
        min_coverage: covered-cell fraction needed for an age-based flush.
        flush_age: age above which a sufficiently covered chunk is uploaded.
        drop_age: age above which a still-unflushed chunk is discarded.
    '''
    for k in list(ct_dict.keys()):
        coverage = (ct_dict[k] >= 1).mean()
        age = age_dict.get(k, 0)
        if coverage >= 1. or (coverage >= min_coverage and age > flush_age):
            # Mean prediction per cell; the epsilon guards cells with no count.
            x = pred_dict[k] / (ct_dict[k] + 1e-5)
            compr = zc.compress(pickle.dumps(x.astype(np.float32).round(3)))
            key = (PREFIX + 'spreds{}/'.format(2 if RELABEL else '') +  '{}/'.format(m)
                 + '{}_{:05d}'.format(k[0].split('/')[-1], k[1],) + '.pkl' + '.zstd')
            # Upload in the background; the thread is not joined here.
            Thread(target = s3_push, args = (BUCKET, key, compr)).start()

            print(key, len(compr), ct_dict[k].mean(), age, x.std())
            pred_dict.pop(k, None), ct_dict.pop(k, None), age_dict.pop(k, None)

        else:
            age_dict[k] = age + 1
            if age_dict[k] > drop_age:
                pred_dict.pop(k, None), ct_dict.pop(k, None), age_dict.pop(k, None)
                print('del', k)

In [None]:
# Plot one random stored prediction (sum / count) against its target.
k = random.choice(list(model.pred_dict))
mean_pred = model.pred_dict[k] / model.ct_dict[k]
plt.plot(mean_pred)
plt.plot(model.target_dict[k])
plt.ylim(0, 1.03);

In [None]:
# Best-effort upload of the model's prediction stores; a failure is printed, not raised.
try:
    preds_blob = zc.compress(pickle.dumps((model.pred_dict, model.target_dict, model.ct_dict)))
    s3.put_object(Bucket = BUCKET,
                  Key = PREFIX + 'preds/' + m + '.pkl' + '.zstd',
                  Body = preds_blob)
except Exception as e:
    print(e)

In [None]:
# Best-effort upload of the weights. The object is a pickled state_dict (not a
# torch.save archive) despite the '.pt' suffix.
try:
    weights_blob = pickle.dumps(model.model.state_dict())
    s3.put_object(Bucket = BUCKET,
                  Key = PREFIX + 'models/' + m + '.pt',
                  Body = weights_blob)
except Exception as e:
    print(e)

In [None]:
# Store this run's params and final metrics as JSON under results/<run id>.json.
r = {
    'params': params,
    'results': {k: v.item() if torch.is_tensor(v) else v
                for k, v in trainer.callback_metrics.items()},
}
k = PREFIX + 'results/{}.json'.format(m)
s3.put_object(Bucket = BUCKET, Key = k, Body = json.dumps(r))
k, r

In [None]:
# pred_dict, ct_dict, age_dict = {}, {}, defaultdict(int)
# # model = net
# ct = 0
# model.to(device)
# model.eval();

# xs, yps, fs, idxs = [], [], [], []
# iprior = None; fprior = None
# for batch in test_daily_loader:
#     x, y, s, frac, m_, f, i, flen = batch
#     with torch.no_grad():
#         yp, ypae, ypm = model(*[e.to(device)
#                                  for e in [x, m_, frac, flen]], adjust = False)
    
#     yps.append(yp[:, ::STORE_SUBSAMPLE].cpu())
#     # xs.append(x.cpu()); 
#     fs.append(f)
#     idxs.append(i // STORE_SUBSAMPLE)
#     if len(yps) >= 100: break;

#     if ((iprior is not None and not ( iprior == i // (STORE_SPLIT * STORE_SUBSAMPLE)).all())
#         or (fprior is not None and not all([fprior == e for e in f]))):
    
#         _yps, _fs, _idxs = stack_store(yps, fs, idxs)
#         process_store(_yps, _fs, _idxs, pred_dict, ct_dict, age_dict)
#         flush_store(pred_dict, ct_dict, age_dict)
#         yps, fs, idxs = [], [], []
#         ct += 1
#         # break;
#         # if fprior is not None and not all([fprior == e for e in f]):
#         #     break;
    
#     iprior = i[-1].item() // (STORE_SPLIT * STORE_SUBSAMPLE)
#     fprior = f[-1] 

# if len(yps) > 0:
#     _yps, _fs, _idxs = stack_store(yps, fs, idxs)
#     yps, fs, idxs = [], [], []
#     process_store(_yps, _fs, _idxs, pred_dict, ct_dict, age_dict)
    
# for i in range(100): flush_store(pred_dict, ct_dict, age_dict)

In [None]:
# Remove the SQS message identified by receipt handle `rh`; best effort.
try:
    sqs.delete_message(QueueUrl = queue_url, ReceiptHandle = rh)
except Exception as e:
    print(e)

In [None]:
# Power the machine off; cells below this one only run in a later session.
# NOTE(review): any upload thread still running would be killed -- confirm none are pending.
!sudo shutdown now

In [None]:
import datetime, json

# Only results uploaded within this many hours are collected.
RESULTS_MAX_AGE_HOURS = 4.3
# LastModified comes back as a timezone-aware UTC datetime, so the cutoff is
# built in UTC as well. Stripping tzinfo and comparing against a naive local
# now() is only right when the machine clock runs on UTC.
cutoff = (datetime.datetime.now(datetime.timezone.utc)
          - datetime.timedelta(hours = RESULTS_MAX_AGE_HOURS))

paginator = s3.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket = BUCKET, Prefix = PREFIX + 'results/')
def flatten(l): return [item for sublist in l for item in sublist]
# A page has no 'Contents' entry when nothing matches the prefix.
keys = flatten([[k['Key'] for k in page.get('Contents', [])
                    if k['LastModified'] > cutoff
                    # and k['LastModified']
                    #         < datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(hours = 0.2)
                    # if k['LastModified']
                    #         >= datetime.datetime(2023, 6, 6, 5, 12, 18, tzinfo = datetime.timezone.utc) # pseudos were here;
                    # if k['LastModified']
                    #         <= datetime.datetime(2023, 6, 6, 10, 12, 18, tzinfo = datetime.timezone.utc)
                 ] for page in page_iterator])
def loadObj(k):
    '''
    Download one results JSON and tag it with its run id and upload time.

    NOTE: a later cell defines another loadObj (pickle/zstd loader) that
    replaces this one; re-run this cell before calling it for JSON results.
    '''
    # Separate client per call: this function runs inside joblib workers.
    s3 = boto3.client('s3')
    o = s3.get_object(Bucket = BUCKET, Key = k)
    r = json.loads(o['Body'].read())
    r['m'] = k.split('/')[-1].split('.')[0]
    r['t'] = o['LastModified']
    return r
results = Parallel(os.cpu_count() * 3)(delayed(loadObj)(k) for k in keys)
len(results)

In [None]:
from collections import defaultdict

# Group runs that share a configuration: params minus fold / seed / n_folds,
# serialised to JSON so it can serve as a dict key.
IGNORED_PARAMS = ['fold', 'seed', 'n_folds']
presults = defaultdict(list)   # config -> list of val_ap
pms = defaultdict(list)        # config -> list of run ids
ms = {}                        # run id -> full params
# f = random.choice(results)['params']['frac_pwr_mult']
for r in results:
    p = r['params']

    if 'seg' in p and p['seed'] < 400:
        continue
    # if p['seed'] //100 in [4, 5]: continue;
    # if p['seq'] < 224: continue;
    # if p['dims'] < 384: continue;
    # if p.get('xformer_layers', 2) > 3: continue;
    # if p['frac_pwr_mult'] != f: continue
    # if p['se_dims'] > 0 and p.get('se_act', '') == 'PReLU': continue;
    r_ = {k: v for k, v in p.items() if k not in IGNORED_PARAMS}
    config_key = json.dumps(r_)
    presults[config_key].append(r['results']['val_ap'])
    pms[config_key].append(r['m'])
    ms[r['m']] = p

len(ms)


In [None]:
# dict to df, each element is row
# One row per configuration (index = its JSON string) with the mean / min / max
# of val_ap and the run count, sorted by mean, best first.
df = pd.DataFrame({k: (np.mean(v), np.min(v), np.max(v), len(v))
      for k, v in presults.items()}, index = ['mean', 'min', 'max', 'ct']).T.sort_values('mean')[::-1]
# df = df.loc[[e for e in df.index 
#              if not any( [z in e for z in [ 'steps', 'h0', 'patch', 'mult', 'seq', 'layers' ]] ) ]]
# Keep only configurations that have at least 4 runs.
df = df[df.ct >= 4 ]#params['n_folds'] == 0]
# df -= df.loc['{}']#max()
df.round(3)

In [None]:
def flatten(l):
    '''Concatenate a list of lists into one flat list.'''
    return [item for sublist in l for item in sublist]

# Run ids belonging to the 10 best configurations in df.
select_ms = [run_id
             for config_key in df.head(10).index
             # if 'frac_adj": false' in config_key
             for run_id in pms[config_key]]

In [None]:
# Selected run ids as one comma-separated string.
', '.join(select_ms)

In [None]:
# df1: one row per configuration with its params as columns.
# df2: matching val_ap statistics, in the same row order as df1.
stat_cols = ['mean', 'min', 'max', 'ct']
df1 = pd.DataFrame([json.loads(config_key) for config_key in presults.keys()])
df2 = pd.DataFrame(
    [(np.mean(v), np.min(v), np.max(v), len(v)) for v in presults.values()],
    columns = stat_cols)

In [None]:
pd.set_option('display.max_columns', 200)
# Params and stats side by side, best mean first, restricted to configurations
# with at least 3 runs; only columns from position 50 onward are displayed.
# NOTE: the boolean mask comes from the unsorted df2, so the filter relies on
# pandas aligning it to the sorted frame by index label.
pd.concat((df1, df2),axis = 1).sort_values('mean')[::-1][df2.ct >= 3 #df2.ct.max()
                                                         ].round(6).iloc[:,50:
                                                      # ][[c for c in df1.columns if '_' not  in c]
                                                          ]

In [None]:
# numpy add two arrays, expanding the smaller one on axis 0
def eadd(a, b):
    '''
    Add two arrays whose first axes may differ in length.

    The shorter array is zero-padded at the end of axis 0 before adding.
    Arrays of any rank >= 1 are accepted; trailing axes must already be
    compatible. An `int` for `a` is treated as the "nothing accumulated yet"
    sentinel (e.g. dict.get(k, 0)) and a copy of `b` is returned.
    '''
    if isinstance(a, int):
        return b.copy()

    def _pad_rows(arr, rows):
        # Pad axis 0 only; every trailing axis gets a (0, 0) pad width.
        widths = ((0, rows - arr.shape[0]),) + ((0, 0),) * (arr.ndim - 1)
        return np.pad(arr, widths, mode = 'constant')

    n_a, n_b = a.shape[0], b.shape[0]
    if n_a < n_b:
        a = _pad_rows(a, n_b)
    elif n_a > n_b:
        b = _pad_rows(b, n_a)
    return a + b

In [None]:
import pickle
zd = zstd.ZstdDecompressor()
def loadObj(k):
    '''
    Return the unpickled, zstd-decompressed object stored at S3 key `k`.

    The raw object is cached on disk under DATA_PATH + 'obj_cache/' + k and
    only downloaded when that file does not exist yet.

    NOTE: this replaces the JSON loadObj defined in the results-listing cell.
    '''
    file_path = DATA_PATH + 'obj_cache/' + k
    if not os.path.exists(file_path):
        s3 = boto3.client('s3')
        o = s3.get_object(Bucket = BUCKET, Key = k)
        os.makedirs(os.path.dirname(file_path), exist_ok = True)
        # Write to a temporary name and rename afterwards, so an interrupted
        # download never leaves a truncated file in the cache.
        tmp_path = '{}.part{}'.format(file_path, os.getpid())
        with open(tmp_path, 'wb') as f:
            f.write(o['Body'].read())
        os.replace(tmp_path, file_path)
    with open(file_path, 'rb') as f:
        payload = f.read()
    # NOTE: pickle can execute arbitrary code; only load objects this project wrote.
    r = pickle.loads(zd.decompress(payload))
    return r

In [None]:
# Ids of all recordings from the COMMON subjects, across both metadata tables.
COMMON = ['2d57c2', 'e86b6e']
common_files = [file_id
                for meta in (tdcsfog_metadata, defog_metadata)
                for file_id in meta.Id[meta.Subject.isin(COMMON)].tolist()]
len(common_files)

In [None]:
def logit(x):
    '''Logistic sigmoid 1 / (1 + exp(-x)). Despite the name, this maps a logit to a probability.'''
    denom = 1 + np.exp(-x)
    return 1 / denom
def rlogit(x):
    '''
    Log-odds of a probability (the inverse of `logit` above).

    x is first squeezed into [5e-5, 0.99995] so that 0 and 1 stay finite.
    '''
    squeezed = x * 0.9999 + 1e-4/2
    return -np.log(1/squeezed - 1)

In [None]:
# Ensemble the stored predictions of the selected runs.
# Each *_totals dict maps a prediction key to an array summed over runs:
pred_totals = {}     # sum of rescaled mean predictions
target_totals = {}   # sum of targets
ct_totals = {}       # number of runs with a prediction at each position
t_ct_totals = {}     # number of runs with a usable target at each position
scales = []          # per-run rescaling vectors, kept for plotting
# NOTE: the loop rebinds the globals `m` (run id) and `p` (params).
for m, p in ms.items():
    try:
        if m not in select_ms: continue
        r = loadObj(PREFIX + 'preds/' + m + '.pkl.zstd')
        mult = 1#/8 if p.get('seg') else 1; #print(mult)

        pred, target, ct = r
        # print(len(pred))
        # Drop every key that matches one of the common_files ids.
        # (the comprehension's own `p` does not leak into the loop variable)
        pred, target, ct = [{k: v for k, v in p.items()
                              if not any([c in k for c in  common_files])}
                                    for p in [pred, target, ct]]
        # print(len(pred))
        # Mean prediction per position for this run.
        spred = {k: pred[k] / (ct[k] + 1e-5) for k in pred.keys() }
        # Rescale so that this run's summed predictions match the summed targets.
        pred_total = np.stack([e.sum(0) for e in spred.values()]).sum(0)
        total = np.stack([e.sum(0) for e in target.values()]).sum(0)        
        scale = total / pred_total * mult
        scales.append(scale)
        print(scale.round(2))

        # print(m, pred_total, total, scale.round(2)); #break; 
        # eadd pads the shorter array, so runs may differ in stored length.
        for k, v in spred.items():
            pred_totals[k] = eadd( pred_totals.get(k, 0), v * scale ) #
                                        # * logit(rlogit(v) + np.log(scale)) )
        for k, v in target.items():
            target_totals[k] = eadd(target_totals.get(k, 0), v)
            # (v > -np.inf) is False for NaN and -inf, so only real values count.
            t_ct_totals[k] = eadd(t_ct_totals.get(k, 0), 1 * (v > -np.inf))
        for k, v in ct.items():
            ct_totals[k] = eadd(ct_totals.get(k, 0), (v > 0) * mult)
    except Exception as e:
        # raise e
        # A failing run is reported and skipped instead of aborting the ensemble.
        print('error', m, e)
    # break;        
# All four accumulators must cover exactly the same keys.
assert ( set(pred_totals.keys()) == set(target_totals.keys()) 
                == set(ct_totals.keys()) == set(t_ct_totals.keys()) )

In [None]:
# One line per target column: the rescaling factor of each ensembled run, in load order.
plt.plot(np.stack(scales))

In [None]:
# Plot the ensembled mean prediction against the mean target for one random key.
# k = random.choice(list(hcommon_files))#.keys()))
k = random.choice(list(pred_totals.keys()))
plt.plot(pred_totals[k] / ct_totals[k])
plt.plot(target_totals[k] / t_ct_totals[k])
plt.ylim(0, 1.05);

In [None]:
# HCOMMON = ['2d57c2', 'e86b6e']
# hcommon_files = (tdcsfog_metadata.Id[tdcsfog_metadata.Subject.isin(HCOMMON)].tolist()
#                      + defog_metadata.Id[defog_metadata.Subject.isin(HCOMMON)].tolist())
# hcommon_files = [f for f in fog_files if any([c in f for c in hcommon_files])]

In [None]:
# Evaluation keys: every ensembled key that matches none of the common_files ids.
keys = [name for name in ct_totals
        if all(z not in name for z in common_files)]
print('{} of {}'.format(len(keys), len(ct_totals)))

In [None]:
from sklearn.metrics import average_precision_score

# Average precision of the ensemble, restricted to positions that at least
# one run predicted.
ct_dict, pred_dict, target_dict = ct_totals, pred_totals, target_totals
final_ys, final_yps = [], []
for k in keys:
    minlen = min(arr.shape[0] for arr in (ct_dict[k], pred_dict[k], target_dict[k]))
    ct_k = ct_dict[k][:minlen]
    f = ct_k[:, 0] > 0
    mean_target = target_dict[k][:minlen][f] / t_ct_totals[k][:minlen][f]
    final_ys.append(1 * (mean_target > 0.5))
    final_yps.append(pred_dict[k][:minlen][f] / ct_k[f])
    # Counts must be identical across target columns and >= 1 where selected.
    assert (ct_dict[k].std(1) < 1e-5).all()
    assert ct_k[f].min() >= 1

final_ys = np.concatenate(final_ys)
final_yps = np.concatenate(final_yps)

# N is the row stride used when scoring (1 = every row).
N = 1
aps = []
labels = 'htw'
for i in range(3):
    score = average_precision_score(final_ys[::N, i], final_yps[::N, i])
    aps.append(score)


In [None]:
# Per-column AP, their mean, and the number of runs in `ms`.
aps, np.mean(aps), len(ms)

In [None]:
# Start from empty code/models and code/params directories.
!rm -rf code/models code/params
!mkdir -p code/models code/params
def downloadModel(m, p):
    '''
    Download the weights of run `m` to code/models/<m>.pt and write its
    params `p` to code/params/<m>.json.
    '''
    # Separate client per call: this function runs inside joblib workers.
    s3 = boto3.client('s3')
    s3.download_file(Bucket = BUCKET, Key = PREFIX + 'models/' + m + '.pt', 
                     Filename = 'code/' + 'models/' + m + '.pt')
    # Close the file explicitly so the JSON is flushed before it is packaged.
    with open('code/' + 'params/' + m + '.json', 'w') as f:
        json.dump(p, f)
    
# Fetch every run listed in `ms` in parallel (3 jobs per CPU).
r = Parallel(os.cpu_count() * 3)(delayed(downloadModel)(m, p) for m, p in ms.items());
# total size of folds
!du -sh code/models


In [None]:
# !ls -lh code/models 

In [None]:
# !mkdir -p day2main
# !cp *py* day2main

In [None]:
# !kaggle datasets download stochoshi/walkdata -p code1 

In [None]:
# !ls code1

In [None]:
# Copy the whl/ and smp/ directories from code1/ into code/.
# NOTE(review): code1/ is only created by the commented-out download cell above -- confirm it exists.
!cp -r code1/whl code
!cp -r code1/smp code

In [None]:

# kaggle create dataset in code/
# move .py files there
# submit as dataset

!mkdir -p code
json.dump({'title': 'Walk Dataset', 
           'id': 'stochoshi/walkdata4',
           'licenses': [{'name': 'CC0-1.0'}]}, 
          open('code/dataset-metadata.json', 'w'), indent = 4)
# !kaggle datasets init -p code
!cp *.py code
# !kaggle datasets create -p code  --dir-mode tar
!kaggle datasets version -p code -m "Walk Dataset" --dir-mode tar

In [None]:
# !sudo apt-get install zip
# !zip -r backup/Final.zip *.py *.ipynb