In [79]:
import numpy as np
import torch
from torch.nn import MSELoss
import torch.optim as optim
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from model import Attention, Feature_attention
import time
from torch import nn
import pickle
from sklearn.metrics import mean_squared_error as MSE
from collections import Counter
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import itertools

In [87]:
# create ensemble

models = os.listdir('./trained_models/')

model_names = np.array([i for i in models if 'biopsy_model_4_bracs_' in i])
scores = np.array(['0.' + j.split('_')[-1].split('.')[0] for j in model_names]).astype(float)

threshold1 = 0.54

threshold2 = 0.552

selected_scores =  np.logical_and(scores < threshold2, scores > threshold1)  

selected_models = model_names[selected_scores]

In [88]:
len(selected_models)

20

In [89]:
def inference(x, model, device):
    
    
    bag = np.load("/home/ngsci/project/save_resnet_embeddings_level4_biopsy_bags_bracs_float16/"+ x["biopsy_id"] + ".npy")
    # create a batch of 1
    if bag.shape[0] == 0: print('nulla')
    bag = torch.tensor(np.expand_dims(bag,axis=0))
    prediction = model(bag.float().to(device))
    
    return prediction.item()

In [90]:
def run_inference_on_model(model_name, csv_name="./final_splits/test_biopsy_unbalenced.csv"):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') #device config
    
    model = torch.load(f"./trained_models/{model_name}", map_location=torch.device('cpu'))
    model = model.module.to(device)
    
    test_biopsy_unbalenced = pd.read_csv(csv_name)
    
    #test_biopsy_unbalenced["stage_pred"] = test_biopsy_unbalenced.apply(lambda x: inference(x, model, device),axis=1)
    
    return test_biopsy_unbalenced.apply(lambda x: inference(x, model, device),axis=1).values

In [91]:
preds = []

for m in tqdm(selected_models):
    
    preds.append(run_inference_on_model(m))

100%|██████████| 20/20 [2:59:38<00:00, 538.90s/it]  


In [92]:
len(preds)

20

In [93]:
# saving preds between 54 and 55
test_biopsy_unbalenced = pd.read_csv("./final_splits/test_biopsy_unbalenced.csv")

preds_between_54_55 = pd.DataFrame()
preds_between_54_55['biopsy_id'] = test_biopsy_unbalenced['biopsy_id'].values
preds_between_54_55['stage'] = test_biopsy_unbalenced['stage'].values

preds_between_54_55[selected_models] = np.array(preds).T

preds_between_54_55.to_csv('preds_between_54_55.csv')

In [36]:
pred_nr = np.arange(7)

c = list(itertools.combinations(pred_nr, 3))

subsets = np.array([list(i) for i in c])

In [58]:
subsets[:3]

array([[0, 1, 2],
       [0, 1, 3],
       [0, 1, 4]])

In [67]:
best_MSE = 1

for s in range(len(subsets)):
    
    current_MSE = MSE(np.mean(np.array(preds)[[subsets[s]]], axis=1)[0], test_biopsy_unbalenced.stage.values)
    
    if current_MSE < best_MSE:
        best_MSE = current_MSE
        best_s = s
        
print(best_s, best_MSE)    

32 0.4993289455835525


In [60]:
test_biopsy_unbalenced = pd.read_csv("./final_splits/test_biopsy_unbalenced.csv")

ensemble = pd.DataFrame()
ensemble['biopsy_id'] = test_biopsy_unbalenced['biopsy_id'].values
ensemble['stage'] = test_biopsy_unbalenced['stage'].values
ensemble['stage_pred'] = np.mean(np.array(preds), axis=0)

In [61]:
MSE(ensemble.stage_pred, ensemble.stage)

0.5058676191366585

### inference on holdout set

In [9]:
preds_holdout = []

for m in tqdm(selected_models):
    
    preds_holdout.append(run_inference_on_model(m, csv_name="./final_splits/HOLD_OUT.csv"))

100%|██████████| 7/7 [02:44<00:00, 23.55s/it]


In [10]:
pred_holdout_ensemble = pd.read_csv("./final_splits/HOLD_OUT.csv", index_col=0)
pred_holdout_ensemble['stage_pred'] =  np.mean(np.array(preds_holdout), axis=0)

filt_4higher = pred_holdout_ensemble['stage_pred'] >= 4.0
pred_holdout_ensemble["stage_pred"][filt_4higher] = 4.0

filt_0lower = pred_holdout_ensemble['stage_pred'] < 0.0
pred_holdout_ensemble["stage_pred"][filt_0lower] = 0.0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_holdout_ensemble["stage_pred"][filt_4higher] = 4.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_holdout_ensemble["stage_pred"][filt_0lower] = 0.0


In [11]:
pred_holdout_ensemble.to_csv(  f'./preds/pred_{len(preds_holdout)}_ensemble_under_{threshold}'.replace('0.','0_')+'.csv', index=False)

In [10]:
import ngsci

ngsci.submit_contest_entry(
    "preds/pred_7_ensemble_under_0_54.csv", description="csabAIbio ensemble 7"
)

(<Result.SUCCESS: 1>, 'Success')