In [39]:
from pytorch_lightning import Trainer
from catinous.CatsinomModelGramCache import CatsinomModelGramCache
import catinous.CatsinomModelGramCache as catsmodel
from catinous import utils as cutils
from catinous import CatsinomDataset

import matplotlib.pyplot as plt
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader
import os
import torchvision.models as models
import torch.nn as nn
import torch
import torch.nn.functional as F
import argparse
import pytorch_lightning as pl
import sklearn 
from sklearn.metrics import confusion_matrix, auc, roc_curve
import torch
import pandas as pd
import seaborn as sns
import pickle
from py_jotools import mut, slurm
import numpy as np
import gc
import hashlib
import dill

%load_ext autoreload
%autoreload 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### default parameters for training

In [40]:
# Show the model's default hyper-parameter dict for reference.
CatsinomModelGramCache.get_default_hparams()

{'root_dir': '/project/catinous/cat_data/',
 'datasetfile': 'catsinom_combined_dataset.csv',
 'batch_size': 8,
 'training_batch_size': 8,
 'transition_phase_after': 0.7,
 'cachemaximum': 128,
 'use_cache': True,
 'random_cache': True,
 'balance_cache': True,
 'force_misclassified': False,
 'direction': 'lr->hr',
 'continous': True,
 'noncontinous_steps': 3000,
 'noncontinous_train_splits': ['train', 'base_train'],
 'val_check_interval': 100,
 'base_model': None,
 'run_postfix': '1'}

### parameters for slurm scheduler

In [41]:
# Settings handed to slurm.srun through its `params` argument whenever a
# training job is scheduled from this notebook.
sparams = dict(
    binary='/home/cir/jhofmanninger/env/candid/bin/python',
    cwd='/home/cir/jhofmanninger/Projects/catinous/',
    gpu=1,
    partition='full',
    memory=15000,
    jobname='catinous_base',
    outputpath='/home/cir/jhofmanninger/slurmoutput/',
    mailuser='matthias.perkonigg@meduniwien.ac.at',
    minutes='240',
)

# Base training

In [42]:
# schedule=True submits the runs through slurm.srun; schedule=False calls
# catsmodel.trained_model directly in this kernel.
# NOTE(review): with schedule=True neither basemodel_lr nor basemodel_hr is
# assigned here, although later cells read them.
schedule = False

hparams = {
    'continous': False,
    'datasetfile': 'catsinom_lr_dataset.csv',
    'noncontinous_train_splits': ['base_train'],
    'noncontinous_steps': 3000,
}

# Base model on the lr dataset.
if not schedule:
    model, logs, df_cache, basemodel_lr = catsmodel.trained_model(hparams)
else:
    slurm.srun(catsmodel.trained_model, [hparams], params=sparams, remote=True)

# Same configuration, switched to the hr dataset.
hparams['datasetfile'] = 'catsinom_hr_dataset.csv'
if not schedule:
    model, logs, df_cache, basemodel_hr = catsmodel.trained_model(hparams)
else:
    slurm.srun(catsmodel.trained_model, [hparams], params=sparams, remote=True)

Read: /project/catinous/trained_models/batch_lr_base_train_1_2d20289ac9.pt
Read: /project/catinous/trained_models/batch_hr_base_train_1_98bf44d0f0.pt


# Continuous training

#### lr->hr

In [64]:
# Continuous training starting from the lr base model, with a cache of 256
# and force_misclassified enabled. Five runs that differ only in run_postfix.
hparams = {
    'continous': True,
    'force_misclassified': True,
    'datasetfile': 'catsinom_combined_dataset.csv',
    'base_model': basemodel_lr,
    'val_check_interval': 30,
    'cachemaximum': 256,
}
base_lr_continous_params = []
for i in range(5):
    run_params = {**hparams, 'run_postfix': i + 1}
    base_lr_continous_params.append(run_params)
    # Only submit a slurm job for runs that catsmodel does not report as cached.
    if catsmodel.is_cached(run_params):
        print('done')
    else:
        slurm.srun(catsmodel.trained_model, [run_params], params=sparams, remote=True)

INFO:root:Gram hooks and cache initialized. Cachesize: 256


1582018235661091
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/1582018235661091.job


INFO:root:Gram hooks and cache initialized. Cachesize: 256


15820182394876163
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15820182394876163.job


INFO:root:Gram hooks and cache initialized. Cachesize: 256


15820182415981078
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15820182415981078.job


INFO:root:Gram hooks and cache initialized. Cachesize: 256


15820182434725463
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15820182434725463.job


INFO:root:Gram hooks and cache initialized. Cachesize: 256


15820182465091124
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15820182465091124.job


In [62]:
# Continuous training starting from the lr base model with the cache switched
# off (use_cache=False). Five runs that differ only in run_postfix.
hparams = {
    'continous': True,
    'use_cache': False,
    'datasetfile': 'catsinom_combined_dataset.csv',
    'base_model': basemodel_lr,
    'val_check_interval': 30,
}
base_lr_continous_nocache_params = []
for i in range(5):
    run_params = hparams.copy()
    run_params['run_postfix'] = i + 1
    base_lr_continous_nocache_params.append(run_params)

    already_trained = catsmodel.is_cached(run_params)
    if already_trained:
        print('done')
    else:
        # Not cached yet: hand the run over to the slurm scheduler.
        slurm.srun(catsmodel.trained_model, [run_params], params=sparams, remote=True)

INFO:root:Gram hooks and cache initialized. Cachesize: 128


done


INFO:root:Gram hooks and cache initialized. Cachesize: 128


done


INFO:root:Gram hooks and cache initialized. Cachesize: 128


done


INFO:root:Gram hooks and cache initialized. Cachesize: 128


done


INFO:root:Gram hooks and cache initialized. Cachesize: 128


done


In [46]:
# Run the last no-cache configuration in this kernel with a progress bar.
# The fourth return value is stored in `modelpath`, not `basemodel_lr`:
# `basemodel_lr` must keep the path of the lr base model, because the
# continuous-training cells pass it as 'base_model'.
model, logs, df_cache, modelpath = catsmodel.trained_model(base_lr_continous_nocache_params[-1], show_progress=True)

INFO:root:Gram hooks and cache initialized. Cachesize: 128
INFO:root:GPU available: True, used: True
INFO:root:VISIBLE GPUS: 0
INFO:root:
    | Name                        | Type              | Params
--------------------------------------------------------------
0   | model                       | ResNet            | 24 M  
1   | model.conv1                 | Conv2d            | 9 K   
2   | model.bn1                   | BatchNorm2d       | 128   
3   | model.relu                  | ReLU              | 0     
4   | model.maxpool               | MaxPool2d         | 0     
5   | model.layer1                | Sequential        | 215 K 
6   | model.layer1.0              | Bottleneck        | 75 K  
7   | model.layer1.0.conv1        | Conv2d            | 4 K   
8   | model.layer1.0.bn1          | BatchNorm2d       | 128   
9   | model.layer1.0.conv2        | Conv2d            | 36 K  
10  | model.layer1.0.bn2          | BatchNorm2d       | 128   
11  | model.layer1.0.conv3        | Conv2d 

HBox(children=(IntProgress(value=0, description='Validation sanity check', layout=Layout(flex='2'), max=5, sty…



HBox(children=(IntProgress(value=1, bar_style='info', layout=Layout(flex='2'), max=1), HTML(value='')), layout…



HBox(children=(IntProgress(value=0, description='Validating', layout=Layout(flex='2'), max=201, style=Progress…

HBox(children=(IntProgress(value=0, description='Validating', layout=Layout(flex='2'), max=201, style=Progress…

HBox(children=(IntProgress(value=0, description='Validating', layout=Layout(flex='2'), max=201, style=Progress…

HBox(children=(IntProgress(value=0, description='Validating', layout=Layout(flex='2'), max=201, style=Progress…

HBox(children=(IntProgress(value=0, description='Validating', layout=Layout(flex='2'), max=201, style=Progress…

HBox(children=(IntProgress(value=0, description='Validating', layout=Layout(flex='2'), max=201, style=Progress…

HBox(children=(IntProgress(value=0, description='Validating', layout=Layout(flex='2'), max=201, style=Progress…

HBox(children=(IntProgress(value=0, description='Validating', layout=Layout(flex='2'), max=201, style=Progress…

HBox(children=(IntProgress(value=0, description='Validating', layout=Layout(flex='2'), max=201, style=Progress…

HBox(children=(IntProgress(value=0, description='Validating', layout=Layout(flex='2'), max=201, style=Progress…

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

INFO:root:
Unfortunately, your original traceback can not be constructed.



Traceback (most recent call last):
  File "/home/jhofmanninger/anaconda3/envs/candid/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-46-a9cfe90d3685>", line 1, in <module>
    model, logs, df_cache, basemodel_lr = catsmodel.trained_model(base_lr_continous_nocache_params[-1], show_progress=True)
  File "/home/jhofmanninger/Projects/catinous/catinous/CatsinomModelGramCache.py", line 370, in trained_model
    torch.save(model.state_dict(), weights_path)
  File "/home/jhofmanninger/anaconda3/envs/candid/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 766, in fit
    self.single_gpu_train(model)
  File "/home/jhofmanninger/anaconda3/envs/candid/lib/python3.6/site-packages/pytorch_lightning/trainer/distrib_parts.py", line 441, in single_gpu_train
    self.run_pretrain_routine(model)
  File "/home/jhofmanninger/anaconda3/envs/candid/lib/python3.6/site-pack

KeyboardInterrupt: 

In [19]:
# Inspect the log table returned by the most recent trained_model call.
logs

Unnamed: 0,created_at,epoch,train_loss,val_acc_hr,val_acc_lr,val_loss_hr,val_loss_lr
0,2020-02-13 11:50:32.981780,0,0.758581,,,,
1,2020-02-13 11:50:37.581619,0,0.670475,,,,
2,2020-02-13 11:50:43.870400,0,0.653133,,,,
3,2020-02-13 11:50:50.094833,0,0.827174,,,,
4,2020-02-13 11:50:55.395019,0,1.230527,,,,
...,...,...,...,...,...,...,...
325,2020-02-13 12:32:26.755774,0,0.163441,,,,
326,2020-02-13 12:32:31.734975,0,0.461115,,,,
327,2020-02-13 12:32:37.083267,0,0.055282,,,,
328,2020-02-13 12:32:42.182645,0,0.121794,,,,


#### hr->lr

In [19]:
# hr->lr: reuse the hparams dict left by the previously executed continuous
# cell, switching only the base model and the direction.
# NOTE(review): whichever continuous cell ran last decides the remaining keys
# (e.g. use_cache / force_misclassified / cachemaximum) — confirm the intended
# configuration.
hparams['base_model'] = basemodel_hr  # no trailing comma: it used to store a 1-tuple instead of the path
hparams['direction'] = 'hr->lr'
# NOTE(review): the list name says "lr" although these are the hr->lr runs; it
# is kept because the following cell reads base_lr_continous_params[0].
base_lr_continous_params = []
for i in range(5):
    base_lr_continous_params.append(hparams.copy())
    base_lr_continous_params[-1]['run_postfix'] = i+1
    if schedule:
        slurm.srun(catsmodel.trained_model, [base_lr_continous_params[-1]], params=sparams, remote=True)

1581931605124311
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/1581931605124311.job
15819316076048906
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15819316076048906.job
15819316091844318
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15819316091844318.job
15819316107077065
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15819316107077065.job
15819316125707395
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15819316125707395.job


In [8]:
# NOTE(review): the original `if schedule` had neither a colon nor an indented
# body, which is a SyntaxError. The condition is kept exactly as typed; confirm
# whether this call is meant for schedule=True or for the non-scheduled case.
if schedule:
    model, logs, df_cache, modelpath = catsmodel.trained_model(base_lr_continous_params[0])

INFO:root:Gram hooks and cache initialized. Cachesize: 128
INFO:root:GPU available: True, used: True
INFO:root:VISIBLE GPUS: 0
INFO:root:
    | Name                        | Type              | Params
--------------------------------------------------------------
0   | model                       | ResNet            | 24 M  
1   | model.conv1                 | Conv2d            | 9 K   
2   | model.bn1                   | BatchNorm2d       | 128   
3   | model.relu                  | ReLU              | 0     
4   | model.maxpool               | MaxPool2d         | 0     
5   | model.layer1                | Sequential        | 215 K 
6   | model.layer1.0              | Bottleneck        | 75 K  
7   | model.layer1.0.conv1        | Conv2d            | 4 K   
8   | model.layer1.0.bn1          | BatchNorm2d       | 128   
9   | model.layer1.0.conv2        | Conv2d            | 36 K  
10  | model.layer1.0.bn2          | BatchNorm2d       | 128   
11  | model.layer1.0.conv3        | Conv2d 

KeyboardInterrupt: 

In [21]:
# Print the module structure of the current model.
model

CatsinomModelGramCache(
  (model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequentia

In [18]:
# Reduced scheduler settings for a test job.
# NOTE(review): this rebinds the notebook-wide `sparams`, so scheduling cells
# executed afterwards use jobname 'catinous_test' and these settings.
sparams = dict(
    binary='/home/cir/jhofmanninger/env/candid/bin/python',
    cwd='/home/cir/jhofmanninger/Projects/catinous',
    gpu=1,
    partition='full',
    memory=15000,
    jobname='catinous_test',
)
slurm.srun(catsmodel.trained_model, [hparams], params=sparams, remote=True)

15815110948576212
sshpass -f ~/.ssh/pass ssh cn1.cir.meduniwien.ac.at sbatch --export=NONE --partition full /scratch/15815110948576212.job


'/scratch/15815110948576212_res.dll'

In [None]:
# Load a serialized job file and call its stored function with hparams.
# NOTE: dill.load can execute arbitrary code contained in the file; only load
# job files that were written by this project.
# The `with` block closes the file handle, which the one-line form leaked.
with open('/scratch/158150528649969.dll', 'rb') as jobfile:
    job = dill.load(jobfile)
job['function'](hparams)

In [None]:
# A bare path is not valid Python (SyntaxError), so it is kept as a comment.
# NOTE(review): presumably the result file of an earlier slurm job — confirm or delete this cell.
# /scratch/15815045002145245_res.dll

In [None]:
# Manual training run without catsmodel.trained_model: build the model, fit it
# for a single epoch, then save the weights and the cache contents.
# NOTE(review): `expname` and `save_cache_to_csv` are only defined in cells
# further down, and `i` is whatever an earlier loop left behind — this cell
# does not run top-to-bottom on a fresh kernel.
model = CatsinomModelGramCache(hparams=hparams, device=torch.device('cuda'))
logger = cutils.pllogger(model.hparams)
trainer = Trainer(gpus=1, max_epochs=1, early_stop_callback=False, logger=logger, val_check_interval=3000, show_progress_bar=True, checkpoint_callback=False)
trainer.fit(model)
torch.save(model.state_dict(), '/project/catinous/trained_models/' + expname + '_run_'+str(i)+'.pt')
save_cache_to_csv(model.trainingscache.cachelist, '/project/catinous/trained_cache/' + expname + '_run_'+str(i)+'.csv')

In [None]:
# Submit trainer.fit as a slurm job.
# NOTE(review): every other slurm.srun call in this notebook passes the
# function arguments as a list (e.g. [hparams]); here `model` is passed bare —
# confirm whether this should be [model].
slurm.srun(trainer.fit, model, params=sparams, remote=True)

In [7]:
# Show the default slurm parameters of py_jotools for comparison with sparams.
slurm.slurm_params_default

{'binary': '/home/jhofmanninger/anaconda3/envs/candid/bin/python',
 'outputpath': '/home/cir/jhofmanninger/slurmoutput/',
 'jobname': '',
 'days': 0,
 'hours': '0',
 'minutes': '10',
 'memory': 2000,
 'ntasks': 1,
 'cpusptask': 4,
 'qos': 'normal',
 'mailuser': 'johannes.hofmanninger@meduniwien.ac.at',
 'cwd': '/home/jhofmanninger/Projects/catinous',
 'dillfile': '',
 'gpu': 0,
 'partition': 'cir',
 'paths': []}

In [4]:
def save_cache_to_csv(cache, savepath):
    """Write the items of a training cache to a CSV file.

    One row is written per item, with the columns ``filepath``, ``label``,
    ``res`` and ``traincounter``. No index column is written.

    Args:
        cache: Sequence of cache items exposing ``filepath``, ``label``,
            ``res`` and ``traincounter``. ``label`` must support
            ``.cpu().numpy()``; only its first element is stored.
        savepath: Destination passed to ``DataFrame.to_csv``.
    """
    def collect(extract):
        # One list per column, built with its own pass over the cache.
        return [extract(entry) for entry in cache]

    columns = {
        'filepath': collect(lambda entry: entry.filepath),
        'label': collect(lambda entry: entry.label.cpu().numpy()[0]),
        'res': collect(lambda entry: entry.res),
        'traincounter': collect(lambda entry: entry.traincounter),
    }
    table = pd.DataFrame(columns)
    table.to_csv(savepath, index=False, index_label=False)

In [None]:
# Start from the model's default hyper-parameters and override a few keys.
# The defaults are fetched through the class: a bare get_default_hparams() is
# not imported anywhere in this notebook and raises NameError on a fresh kernel.
hparams = CatsinomModelGramCache.get_default_hparams()
hparams['force_misclassified'] = True
hparams['cachemaximum'] = 128
hparams['continous'] = False
expname = 'continous_random_cache_transphase_force_misclassified_bsize123'
model = CatsinomModelGramCache(argparse.Namespace(**hparams), device = torch.device('cuda'))

# Initialise the network from the saved base-model weights before training.
basemodel = '/project/catinous/trained_models/lrbase_iterations.pt'
model.load_state_dict(torch.load(basemodel))

# NOTE(review): `pllogging` is not imported in the notebook's import cell —
# presumably pytorch_lightning's logging module; confirm and add the import.
logger = pllogging.TestTubeLogger( 'catinous_log_iterations', name=expname)
trainer = Trainer(gpus=1, max_epochs=1, early_stop_callback=False, logger=logger, val_check_interval=10, show_progress_bar=False, checkpoint_callback=False)
trainer.fit(model)

INFO:root:gpu available: True, used: True
INFO:root:VISIBLE GPUS: 0
INFO:root:
              Name               Type Params
0            model             ResNet   24 M
1      model.conv1             Conv2d    9 K
2        model.bn1        BatchNorm2d  128  
3       model.relu               ReLU    0  
4    model.maxpool          MaxPool2d    0  
..             ...                ...    ...
150       model.fc         Sequential    1 M
151     model.fc.0             Linear    1 M
152     model.fc.1        BatchNorm1d    1 K
153     model.fc.2             Linear  513  
154           loss  BCEWithLogitsLoss    0  

[155 rows x 3 columns]


In [None]:
#testing different transition phases
# For every transition phase value, train two runs from the base-model weights
# and store the resulting weights and cache contents per run.
# NOTE(review): relies on `basemodel`, `save_cache_to_csv` and `pllogging`
# from other cells; `pllogging` is not imported in the notebook's import cell.

transitionphase = [1.0]

for tp in transitionphase:
    for i in range(2):
        # Fetched through the class: a bare get_default_hparams() is not
        # imported in this notebook and raises NameError on a fresh kernel.
        hparams = CatsinomModelGramCache.get_default_hparams()
        hparams['transition_phase_after'] = tp

        expname = 'continous_random_cache_transphase_' + str(tp)

        model = CatsinomModelGramCache(argparse.Namespace(**hparams), device = torch.device('cuda'))

        model.load_state_dict(torch.load(basemodel))

        logger = pllogging.TestTubeLogger('catinous_log_iterations', name=expname)
        trainer = Trainer(gpus=1, max_epochs=1, early_stop_callback=False, logger=logger, val_check_interval=10, show_progress_bar=False)
        trainer.fit(model)
        torch.save(model.state_dict(), '/project/catinous/trained_models/' + expname + '_run_'+str(i)+'.pt')

        save_cache_to_csv(model.trainingscache.cachelist, '/project/catinous/trained_cache/' + expname + '_run_'+str(i)+'.csv')

        # Drop the references before the next run, then release cached GPU
        # memory and collect garbage (the duplicated `trainer = None` is removed).
        trainer = None
        model = None
        torch.cuda.empty_cache()
        gc.collect()

INFO:root:gpu available: True, used: True
INFO:root:VISIBLE GPUS: 0
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
INFO:root:
              Name               Type Params
0            model             ResNet   24 M
1      model.conv1             Conv2d    9 K
2        model.bn1        BatchNorm2d  128  
3       model.relu               ReLU    0  
4    model.maxpool          MaxPool2d    0  
..             ...                ...    ...
150       model.fc         Sequential    1 M
151     model.fc.0             Linear    1 M
152     model.fc.1        BatchNorm1d    1 K
153     model.fc.2             Linear  513  
154           loss  BCEWithLogitsLoss    0  

[155 rows x 3 columns]


In [None]:
#testing different cache sizes
# For every cache size, train three runs from the base-model weights and store
# the resulting weights and cache contents per run.
# NOTE(review): relies on `basemodel`, `save_cache_to_csv` and `pllogging`
# from other cells; `pllogging` is not imported in the notebook's import cell.
cache_size = [16, 128, 256]

for cs in cache_size:
    for i in range(3):
        # Fetched through the class: a bare get_default_hparams() is not
        # imported in this notebook and raises NameError on a fresh kernel.
        hparams = CatsinomModelGramCache.get_default_hparams()
        hparams['cachemaximum'] = cs

        expname = 'continous_random_cache_cachesize_' + str(cs)

        model = CatsinomModelGramCache(argparse.Namespace(**hparams), device = torch.device('cuda'))

        model.load_state_dict(torch.load(basemodel))

        logger = pllogging.TestTubeLogger('catinous_log_iterations', name=expname)
        trainer = Trainer(gpus=1, max_epochs=1, early_stop_callback=False, logger=logger, val_check_interval=10, show_progress_bar=False)
        trainer.fit(model)
        torch.save(model.state_dict(), '/project/catinous/trained_models/' + expname + '_run_'+str(i)+'.pt')

        save_cache_to_csv(model.trainingscache.cachelist, '/project/catinous/trained_cache/' + expname + '_run_'+str(i)+'.csv')

        # Drop the references before the next run and release cached GPU memory.
        trainer = None
        model = None
        torch.cuda.empty_cache()