In [7]:
from pytorch_lightning import Trainer
from catinous.CatsinomModelGramCache import CatsinomModelGramCache
import catinous.CatsinomModelGramCache as catsmodel
from catinous import utils as cutils
from catinous import CatsinomDataset

import matplotlib.pyplot as plt
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader
import os
import torchvision.models as models
import torch.nn as nn
import torch
import torch.nn.functional as F
import argparse
import pytorch_lightning as pl
import sklearn 
from sklearn.metrics import confusion_matrix, auc, roc_curve
import torch
import pandas as pd
import seaborn as sns
import pickle
from py_jotools import mut, slurm
import numpy as np
import gc
import hashlib
import dill
import sys

%load_ext autoreload
%autoreload 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### default parameters for training

In [8]:
cutils.TRAINED_MODELS_FOLDER

'/project/catinous/trained_models/'

In [9]:
CatsinomModelGramCache.get_default_hparams()

{'root_dir': '/project/catinous/cat_data/',
 'datasetfile': 'catsinom_combined_dataset.csv',
 'batch_size': 8,
 'training_batch_size': 8,
 'transition_phase_after': 0.7,
 'cachemaximum': 128,
 'use_cache': True,
 'random_cache': True,
 'balance_cache': True,
 'force_misclassified': False,
 'direction': 'lr->hr',
 'continous': True,
 'noncontinous_steps': 3000,
 'noncontinous_train_splits': ['train', 'base_train'],
 'EWC': False,
 'EWC_dataset': None,
 'EWC_lambda': 1000,
 'EWC_bn_off': False,
 'val_check_interval': 100,
 'base_model': None,
 'run_postfix': '1'}

### Parameters for the slurm scheduler

In [10]:
# Settings handed to slurm.srun (via params=sparams) for every job submitted from this notebook.
sparams = dict(
    binary='/home/cir/jhofmanninger/env/candid/bin/python',  # Python executable path
    cwd='/home/cir/jhofmanninger/Projects/catinous/',
    gpu=1,
    partition='full',
    memory=25000,  # presumably MB -- TODO confirm against py_jotools.slurm
    jobname='catinous_base',
    outputpath='/home/cir/jhofmanninger/slurmoutput/',
    mailuser='j.hofmanninger@gmail.com',
    minutes='300',  # given as a string, like the 'minutes' entry in slurm.slurm_params_default
)

# Base training

In [11]:
# Switch used by this and later cells: True submits runs through slurm.srun,
# False calls catsmodel.trained_model directly in this kernel.
schedule = False

# Non-continuous training restricted to the 'base_train' split.
hparams = {
    'continous': False,
    'datasetfile': 'catsinom_lr_dataset.csv',
    'noncontinous_train_splits': ['base_train'],
    'noncontinous_steps': 3000,
}

# Base model on the lr dataset.
# NOTE: basemodel_lr / basemodel_hr are only assigned when schedule is False;
# later cells that reference them need this cell to have run in that mode.
if not schedule:
    model, logs, df_cache, basemodel_lr = catsmodel.trained_model(hparams)
else:
    slurm.srun(catsmodel.trained_model, [hparams], params=sparams, remote=True)

# Base model on the hr dataset: same settings, different dataset file.
hparams['datasetfile'] = 'catsinom_hr_dataset.csv'
if not schedule:
    model, logs, df_cache, basemodel_hr = catsmodel.trained_model(hparams)
else:
    slurm.srun(catsmodel.trained_model, [hparams], params=sparams, remote=True)

Read: /project/catinous/trained_models/batch_lr_base_train_1_2d20289ac9.pt
Read: /project/catinous/trained_models/batch_hr_base_train_1_98bf44d0f0.pt


# Continuous training

#### lr->hr

In [12]:
# Cache sizes of interest (not referenced by any cell visible in this part of the notebook).
cache_sizes = [
    16, 32, 48, 64, 80,
    128, 192, 256, 320,
]

# Dataset file used by the continuous-training cells below.
# Alternatives that were tried before:
#   'catsinom_combined_dataset.csv'
#   'catsinom_combined_hrlowshift_dataset.csv'
dataset = 'catsinom_combined_dsts3_dataset.csv'

In [22]:
# Continuous training from the lr base model with a cache of 64 and forced misclassified samples.
hparams = {
    'continous': True,
    'force_misclassified': True,
    'datasetfile': dataset,
    'base_model': basemodel_lr,
    'val_check_interval': 30,
    'cachemaximum': 64,
}

# One parameter set per repetition; only 'run_postfix' (1..5) differs.
base_lr_continous_params = []
for i in range(5):
    run_params = dict(hparams, run_postfix=i + 1)
    base_lr_continous_params.append(run_params)
    # Submission is switched off in this cell, so runs are only reported:
    # '.' = no cached result yet, 'done' = cached result exists.
    #   to submit:  slurm.srun(catsmodel.trained_model, [run_params], params=sparams, remote=True)
    #   to discard: os.remove(catsmodel.cached_path(run_params))
    print('done' if catsmodel.is_cached(run_params) else '.')

INFO:root:Gram hooks and cache initialized. Cachesize: 64


done


INFO:root:Gram hooks and cache initialized. Cachesize: 64


.


INFO:root:Gram hooks and cache initialized. Cachesize: 64


done


INFO:root:Gram hooks and cache initialized. Cachesize: 64


done


INFO:root:Gram hooks and cache initialized. Cachesize: 64


done


### gram weights

In [29]:
# Same continuous setup as above, repeated for two 'gram_weights' settings.
hparams = {
    'continous': True,
    'force_misclassified': True,
    'datasetfile': dataset,
    'base_model': basemodel_lr,
    'val_check_interval': 30,
    'cachemaximum': 64,
    'gram_weights': [0, 0, 0, 1],
}

# Five repetitions per weight setting; uncached runs are submitted to slurm.
# hparams keeps the last weight setting afterwards, as before.
base_lr_continous_params = []
for gram_weights in ([0, 0, 0, 1], [1, 0, 0, 0]):
    hparams['gram_weights'] = gram_weights
    for i in range(5):
        run_params = hparams.copy()
        run_params['run_postfix'] = i + 1
        base_lr_continous_params.append(run_params)
        if catsmodel.is_cached(run_params):
            # to discard a cached result: os.remove(catsmodel.cached_path(run_params))
            print('done')
        else:
            slurm.srun(catsmodel.trained_model, [run_params], params=sparams, remote=True)
            print('.')

INFO:root:Gram hooks and cache initialized. Cachesize: 64


15833185273995917
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833185273995917.job
Submitted batch job 2778088

.


INFO:root:Gram hooks and cache initialized. Cachesize: 64


1583318529781936
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/1583318529781936.job
Submitted batch job 2778089

.


INFO:root:Gram hooks and cache initialized. Cachesize: 64


15833185315398552
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833185315398552.job
Submitted batch job 2778090

.


INFO:root:Gram hooks and cache initialized. Cachesize: 64


15833185333279798
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833185333279798.job
Submitted batch job 2778091

.


INFO:root:Gram hooks and cache initialized. Cachesize: 64


15833185354717176
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833185354717176.job
Submitted batch job 2778092

.


INFO:root:Gram hooks and cache initialized. Cachesize: 64


15833185372595823
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833185372595823.job
Submitted batch job 2778093

.


INFO:root:Gram hooks and cache initialized. Cachesize: 64


15833185393046706
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833185393046706.job
Submitted batch job 2778094

.


INFO:root:Gram hooks and cache initialized. Cachesize: 64


15833185414643734
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833185414643734.job
Submitted batch job 2778095

.


INFO:root:Gram hooks and cache initialized. Cachesize: 64


15833185432342792
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833185432342792.job
Submitted batch job 2778096

.


INFO:root:Gram hooks and cache initialized. Cachesize: 64


15833185452118936
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833185452118936.job
Submitted batch job 2778097

.


In [87]:
# Continuous training from the lr base model with the cache disabled.
hparams = {
    'continous': True,
    'use_cache': False,
    'datasetfile': dataset,
    'base_model': basemodel_lr,
    'val_check_interval': 30,
}

# One parameter set per repetition; only 'run_postfix' (1..5) differs.
base_lr_continous_nocache_params = []
for i in range(5):
    run_params = dict(hparams, run_postfix=i + 1)
    base_lr_continous_nocache_params.append(run_params)
    # Submission is switched off in this cell; only the cache status is reported.
    #   to submit: slurm.srun(catsmodel.trained_model, [run_params], params=sparams, remote=True)
    print('done' if catsmodel.is_cached(run_params) else '.')

done
done
done
done
done


#### ewc

In [106]:
# EWC runs without cache, starting from the lr base model.
# lambdas: 1, 1000, 10000, 100000
# Alternative dataset files:
#   'catsinom_combined_hrlowshift_dataset.csv'
#   'catsinom_combined_dataset.csv'
dataset = 'catsinom_combined_dsts3_dataset.csv'
hparams = {
    'continous': True,
    'use_cache': False,
    'datasetfile': dataset,
    'base_model': basemodel_lr,
    'EWC': True,
    'EWC_dataset': 'catsinom_lr_dataset.csv',
    'EWC_lambda': 100000,
    'EWC_bn_off': True,
    'val_check_interval': 30,
}

# Five repetitions; each uncached run is submitted to slurm.
params = []
for i in range(5):
    run_params = dict(hparams, run_postfix=i + 1)
    params.append(run_params)
    if catsmodel.is_cached(run_params):
        print('done')
        continue
    slurm.srun(catsmodel.trained_model, [run_params], params=sparams, remote=True)
    print('.')

15831570656883543
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15831570656883543.job
.
15831570681237369
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15831570681237369.job
.
15831570702278643
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15831570702278643.job
.
1583157072508678
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/1583157072508678.job
.
15831570746035147
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15831570746035147.job
.


### Full Training

In [6]:
# Non-continuous training on both the 'base_train' and 'train' splits for 10000 steps.
hparams = {
    'continous': False,
    'datasetfile': 'catsinom_combined_dsts3_dataset.csv',
    'noncontinous_train_splits': ['base_train', 'train'],
    'noncontinous_steps': 10000,
}

# Five repetitions; each uncached run is submitted to slurm.
params = []
for i in range(5):
    run_params = dict(hparams, run_postfix=i + 1)
    params.append(run_params)
    if catsmodel.is_cached(run_params):
        print('done')
        continue
    slurm.srun(catsmodel.trained_model, [run_params], params=sparams, remote=True)
    print('.')

15833124578081148
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833124578081148.job
Submitted batch job 2777774

.
15833124606165512
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833124606165512.job
Submitted batch job 2777776

.
15833124627257304
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833124627257304.job
Submitted batch job 2777777

.
15833124649307358
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833124649307358.job
Submitted batch job 2777778

.
15833124670935056
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15833124670935056.job
Submitted batch job 2777779

.


In [25]:
# Run one gram-weight configuration (run_postfix 1) directly in this kernel.
hparams = {
    'continous': True,
    'force_misclassified': True,
    'datasetfile': dataset,
    'base_model': basemodel_lr,
    'val_check_interval': 30,
    'cachemaximum': 64,
    'run_postfix': 1,
    'gram_weights': [0, 0, 0, 1],
}
# Last expression of the cell: the return value of trained_model is displayed.
catsmodel.trained_model(hparams)

INFO:root:Gram hooks and cache initialized. Cachesize: 64
INFO:root:GPU available: True, used: True
INFO:root:VISIBLE GPUS: 0
INFO:root:
    | Name                        | Type              | Params
--------------------------------------------------------------
0   | model                       | ResNet            | 24 M  
1   | model.conv1                 | Conv2d            | 9 K   
2   | model.bn1                   | BatchNorm2d       | 128   
3   | model.relu                  | ReLU              | 0     
4   | model.maxpool               | MaxPool2d         | 0     
5   | model.layer1                | Sequential        | 215 K 
6   | model.layer1.0              | Bottleneck        | 75 K  
7   | model.layer1.0.conv1        | Conv2d            | 4 K   
8   | model.layer1.0.bn1          | BatchNorm2d       | 128   
9   | model.layer1.0.conv2        | Conv2d            | 36 K  
10  | model.layer1.0.bn2          | BatchNorm2d       | 128   
11  | model.layer1.0.conv3        | Conv2d  

KeyboardInterrupt: 

In [None]:
catsmodel.trained_model(params[-1])

#### hr->lr

In [19]:
# Reverse direction: reuse the current hparams, but start from the hr base model.
# The original assignment ended in a trailing comma, which stored the 1-tuple
# (basemodel_hr,) instead of the value itself.
hparams['base_model'] = basemodel_hr
hparams['direction'] = 'hr->lr'

# One parameter set per repetition ('run_postfix' 1..5); submitted only when `schedule` is set.
# The list keeps its name because the following cell reads base_lr_continous_params[0].
base_lr_continous_params = []
for i in range(5):
    base_lr_continous_params.append(hparams.copy())
    base_lr_continous_params[-1]['run_postfix'] = i+1
    if schedule:
        slurm.srun(catsmodel.trained_model, [base_lr_continous_params[-1]], params=sparams, remote=True)

1581931605124311
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/1581931605124311.job
15819316076048906
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15819316076048906.job
15819316091844318
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15819316091844318.job
15819316107077065
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15819316107077065.job
15819316125707395
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15819316125707395.job


In [None]:
# Call trained_model for the first hr->lr configuration in this kernel.
# The original `if schedule` had no colon and no indented body (SyntaxError).
# NOTE(review): the condition is kept as written; confirm whether `if not schedule:` was
# intended, mirroring the schedule/else split in the base-training cell.
if schedule:
    model, logs, df_cache, modelpath = catsmodel.trained_model(base_lr_continous_params[0])

In [None]:
model

In [18]:
# Test submission with a reduced settings dict.
# NOTE: this rebinds the notebook-wide `sparams`; cells run afterwards use these values
# (no outputpath / mailuser / minutes entries, less memory).
sparams = dict(
    binary='/home/cir/jhofmanninger/env/candid/bin/python',
    cwd='/home/cir/jhofmanninger/Projects/catinous',
    gpu=1,
    partition='full',
    memory=15000,
    jobname='catinous_test',
)
# Last expression of the cell: the value returned by slurm.srun is displayed.
slurm.srun(catsmodel.trained_model, [hparams], params=sparams, remote=True)

15815110948576212
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15815110948576212.job


'/scratch/15815110948576212_res.dll'

In [None]:
# Load a serialized job description and call its 'function' entry with the current hparams.
# WARNING: dill.load executes arbitrary code from the file; only load files you created yourself.
# The file is opened in a `with` block so the handle is closed after loading.
with open('/scratch/158150528649969.dll', 'rb') as dill_file:
    job = dill.load(dill_file)
job['function'](hparams)

In [None]:
# Path of a slurm result file, kept as a string so the cell is valid Python
# (the bare, unquoted path was a SyntaxError).
'/scratch/15815045002145245_res.dll'

In [None]:
# Train one model in this kernel, then store its weights and the contents of its training cache.
# NOTE: relies on `expname`, `i` and `save_cache_to_csv` already existing in the kernel.
model = CatsinomModelGramCache(hparams=hparams, device=torch.device('cuda'))
logger = cutils.pllogger(model.hparams)
trainer = Trainer(
    gpus=1,
    max_epochs=1,
    early_stop_callback=False,
    logger=logger,
    val_check_interval=3000,
    show_progress_bar=True,
    checkpoint_callback=False,
)
trainer.fit(model)

# Both output files share the same '<expname>_run_<i>' stem.
run_tag = expname + '_run_' + str(i)
torch.save(model.state_dict(), '/project/catinous/trained_models/' + run_tag + '.pt')
save_cache_to_csv(model.trainingscache.cachelist, '/project/catinous/trained_cache/' + run_tag + '.csv')

In [None]:
# Submit trainer.fit with `model` as its argument through slurm, using the settings in sparams.
# NOTE(review): the other slurm.srun calls in this notebook pass the call arguments as a list
# (e.g. [hparams]); here `model` is passed bare -- confirm slurm.srun accepts that form.
slurm.srun(trainer.fit, model, params=sparams, remote=True)

In [7]:
slurm.slurm_params_default

{'binary': '/home/jhofmanninger/anaconda3/envs/candid/bin/python',
 'outputpath': '/home/cir/jhofmanninger/slurmoutput/',
 'jobname': '',
 'days': 0,
 'hours': '0',
 'minutes': '10',
 'memory': 2000,
 'ntasks': 1,
 'cpusptask': 4,
 'qos': 'normal',
 'mailuser': 'johannes.hofmanninger@meduniwien.ac.at',
 'cwd': '/home/jhofmanninger/Projects/catinous',
 'dillfile': '',
 'gpu': 0,
 'partition': 'cir',
 'paths': []}

In [4]:
def save_cache_to_csv(cache, savepath):
    """Write one CSV row per cache item to `savepath`.

    Columns, in order: 'filepath', 'label', 'res', 'traincounter'. Each value is read
    from the attribute of the same name on the item; 'label' is the first element of
    ``item.label.cpu().numpy()``. The row index is not written.
    """
    columns = {'filepath': [], 'label': [], 'res': [], 'traincounter': []}
    for item in cache:
        columns['filepath'].append(item.filepath)
        columns['label'].append(item.label.cpu().numpy()[0])
        columns['res'].append(item.res)
        columns['traincounter'].append(item.traincounter)
    pd.DataFrame(columns).to_csv(savepath, index=False, index_label=False)

In [None]:
# Single non-continuous run with forced misclassified samples and a cache of 128,
# initialised from stored base-model weights.
# Defaults come from the class: a bare get_default_hparams() is not among the notebook's imports.
hparams = CatsinomModelGramCache.get_default_hparams()
hparams['force_misclassified'] = True
hparams['cachemaximum'] = 128
hparams['continous'] = False
expname = 'continous_random_cache_transphase_force_misclassified_bsize123'
model = CatsinomModelGramCache(argparse.Namespace(**hparams), device = torch.device('cuda'))

# `basemodel` is also read by the sweep cells further down.
basemodel = '/project/catinous/trained_models/lrbase_iterations.pt'
model.load_state_dict(torch.load(basemodel))

# NOTE(review): `pllogging` is not imported in the import cell; this line needs the matching
# pytorch_lightning logging module bound to that name before the cell can run.
logger = pllogging.TestTubeLogger( 'catinous_log_iterations', name=expname)
trainer = Trainer(gpus=1, max_epochs=1, early_stop_callback=False, logger=logger, val_check_interval=10, show_progress_bar=False, checkpoint_callback=False)
trainer.fit(model)

INFO:root:gpu available: True, used: True
INFO:root:VISIBLE GPUS: 0
INFO:root:
              Name               Type Params
0            model             ResNet   24 M
1      model.conv1             Conv2d    9 K
2        model.bn1        BatchNorm2d  128  
3       model.relu               ReLU    0  
4    model.maxpool          MaxPool2d    0  
..             ...                ...    ...
150       model.fc         Sequential    1 M
151     model.fc.0             Linear    1 M
152     model.fc.1        BatchNorm1d    1 K
153     model.fc.2             Linear  513  
154           loss  BCEWithLogitsLoss    0  

[155 rows x 3 columns]


In [None]:
# Testing different transition phases: two runs per 'transition_phase_after' value, each
# initialised from the weights at `basemodel` (set in an earlier cell), saving the trained
# weights and the training cache per run.

transitionphase = [1.0]

for tp in transitionphase:
    for i in range(2):
        # Defaults come from the class: a bare get_default_hparams() is not among the notebook's imports.
        hparams = CatsinomModelGramCache.get_default_hparams()
        hparams['transition_phase_after'] = tp

        expname = 'continous_random_cache_transphase_' + str(tp)

        model = CatsinomModelGramCache(argparse.Namespace(**hparams), device = torch.device('cuda'))

        model.load_state_dict(torch.load(basemodel))

        # NOTE(review): `pllogging` is not imported in the import cell; bind the matching
        # pytorch_lightning logging module to that name before running this cell.
        logger = pllogging.TestTubeLogger('catinous_log_iterations', name=expname)
        trainer = Trainer(gpus=1, max_epochs=1, early_stop_callback=False, logger=logger, val_check_interval=10, show_progress_bar=False)
        trainer.fit(model)
        torch.save(model.state_dict(), '/project/catinous/trained_models/' + expname + '_run_'+str(i)+'.pt')

        save_cache_to_csv(model.trainingscache.cachelist, '/project/catinous/trained_cache/' + expname + '_run_'+str(i)+'.csv')

        # Drop the references before the next run, then release cached GPU memory
        # (the original reset `trainer` twice).
        trainer = None
        model = None
        torch.cuda.empty_cache()
        gc.collect()

INFO:root:gpu available: True, used: True
INFO:root:VISIBLE GPUS: 0
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
INFO:root:
              Name               Type Params
0            model             ResNet   24 M
1      model.conv1             Conv2d    9 K
2        model.bn1        BatchNorm2d  128  
3       model.relu               ReLU    0  
4    model.maxpool          MaxPool2d    0  
..             ...                ...    ...
150       model.fc         Sequential    1 M
151     model.fc.0             Linear    1 M
152     model.fc.1        BatchNorm1d    1 K
153     model.fc.2             Linear  513  
154           loss  BCEWithLogitsLoss    0  

[155 rows x 3 columns]


In [None]:
#testing different cache sizes
cache_size = [16, 128, 256]

for cs in cache_size:
    for i in range(3):
        hparams = get_default_hparams()
        hparams['cachemaximum'] = cs
        
        expname = 'continous_random_cache_cachesize_' + str(cs)
        
        model = CatsinomModelGramCache(argparse.Namespace(**hparams), device = torch.device('cuda'))

        model.load_state_dict(torch.load(basemodel))

        logger = pllogging.TestTubeLogger('catinous_log_iterations', name=expname)
        trainer = Trainer(gpus=1, max_epochs=1, early_stop_callback=False, logger=logger, val_check_interval=10, show_progress_bar=False)
        trainer.fit(model)
        torch.save(model.state_dict(), '/project/catinous/trained_models/' + expname + '_run_'+str(i)+'.pt')

        save_cache_to_csv(model.trainingscache.cachelist, '/project/catinous/trained_cache/' + expname + '_run_'+str(i)+'.csv')

        trainer = None
        model = None
        torch.cuda.empty_cache()