In [None]:
from fastai.tabular.all import *
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fastai.callback import *
from tqdm.notebook import tqdm
import sys
sys.path.append('/kaggle/input/iterative-stratification/iterative-stratification-master')
#print(sys.path)
#!ls ../input/iterative-stratification/iterative-stratification-master
#from iterstrat import ml_stratifiers
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
#from ml_stratifiers import MultilabelStratifiedKFold
import copy
from torch.distributions.beta import Beta
from sklearn.preprocessing import QuantileTransformer

In [None]:
# Cap how much of a DataFrame gets rendered in cell output.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

# Run switches. PATH is only defined for the Kaggle directory layout.
KAGGLE, TRAIN, INFERENCE = True, True, True
PATH = None if not KAGGLE else '../input/lish-moa/'

print(PATH)

# Competition tables; the four feature/target files are read relative to PATH.
(test_features,
 train_features,
 train_targets_scored,
 train_targets_nonscored) = [
    pd.read_csv(PATH + csv_name)
    for csv_name in ('test_features.csv',
                     'train_features.csv',
                     'train_targets_scored.csv',
                     'train_targets_nonscored.csv')
]
# The sample submission is read from an absolute path rather than through PATH.
sample_submission = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')
drug_ids = pd.read_csv(PATH + 'train_drug.csv')

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator the notebook draws from.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU plus
    every CUDA device), exports PYTHONHASHSEED, and puts cuDNN into
    deterministic mode with kernel autotuning switched off.

    Args:
        seed: integer applied to every generator.
    """
    # Imported locally so the function does not depend on a wildcard import
    # happening to expose these two stdlib modules.
    import os
    import random

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds all visible GPUs; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmarking on, cuDNN may choose different algorithms from run to
    # run, which undoes the determinism requested on the line above.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=42)

In [None]:
# Partition the raw feature columns by their name prefix ('g-' vs 'c-').
genes_cols = list(filter(lambda name: name.startswith('g-'), train_features.columns))
cells_cols = list(filter(lambda name: name.startswith('c-'), train_features.columns))


In [None]:
# PCA feature engineering: fit one PCA per column family on train and test
# rows stacked together, then append the projections to both feature frames
# as new 'pca_genes_*' / 'pca_cells_*' columns.
train_and_test_genes_features = pd.concat([train_features[genes_cols],test_features[genes_cols]])
train_and_test_cell_features = pd.concat([train_features[cells_cols],test_features[cells_cols]])
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# The gene PCA is fitted with 600 components, but only the first 300 are named
# and kept below (see the [:, :300] slices).
n_comp_genes = 600
pca_genes = PCA(n_components=n_comp_genes)
pca_names_genes = ['pca_genes_' + str(i) for i in range(300)]#range(n_comp_genes)]
pca_genes.fit(train_and_test_genes_features)
# Variance share captured by the 300 gene components that are actually used.
print(pca_genes.explained_variance_ratio_[:300].sum())
# All 60 fitted cell components are kept.
n_comp_cells = 60
pca_cells = PCA(n_components=n_comp_cells)
pca_names_cells = ['pca_cells_' + str(i) for i in range(n_comp_cells)]
pca_cells.fit(train_and_test_cell_features)
print(pca_cells.explained_variance_ratio_.sum())

#np.sum(pca.explained_variance_ratio_),pca.explained_variance_ratio_
# Project train and test separately and write the results back in place;
# both frames gain the new columns after this point.
train_pca_features_genes = pca_genes.transform(train_features[genes_cols])[:, :300]
test_pca_features_genes = pca_genes.transform(test_features[genes_cols])[:,:300]
train_features[pca_names_genes] = train_pca_features_genes
test_features[pca_names_genes] = test_pca_features_genes

train_pca_features_cells= pca_cells.transform(train_features[cells_cols])
test_pca_features_cells = pca_cells.transform(test_features[cells_cols])
train_features[pca_names_cells] = train_pca_features_cells
test_features[pca_names_cells] = test_pca_features_cells

#tmp_df = pd.DataFrame(columns = pca_names)
#tmp_df[pca_names] = train_pca_features
#train_features.info()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold

# Target column names: everything after the leading 'sig_id' column.
target_scored_cols = train_targets_scored.columns.tolist()[1:]
target_nonscored_cols = train_targets_nonscored.columns.tolist()[1:]

# One training frame: features + scored targets + non-scored targets + drug ids.
train_df = (train_features
            .merge(train_targets_scored, on='sig_id', how='left')
            .merge(train_targets_nonscored, on='sig_id', how='left')
            .merge(drug_ids, on='sig_id', how='left'))

# Shuffle once with a fixed seed. The index is reset because `kf.split` yields
# row *positions* while `.loc` selects by index *label*; on the shuffled,
# un-reset index the two disagree and the fold columns would not describe the
# splits the stratifier actually produced.
df = train_df.sample(frac=1., random_state=2020).reset_index(drop=True)

# NOTE(review): 'drug_id' is passed to the stratifier next to the binary
# targets; confirm the stratifier uses a non-binary column the way intended.
for fold_col, label_cols in (
        ('kfold_scored', target_scored_cols + ['drug_id']),
        ('kfold_nonscored', target_nonscored_cols + target_scored_cols + ['drug_id'])):
    df[fold_col] = -1
    kf = MultilabelStratifiedKFold(n_splits=5)
    y = df[label_cols].values
    for fold, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, fold_col] = fold
    # Every row must have landed in exactly one validation fold.
    assert (df[fold_col] >= 0).all(), fold_col + ' has unassigned rows'
    
#???df.to_csv    

In [None]:
# sig_ids of the test rows whose cp_type is the control vehicle.
test_sig_ids = test_features.loc[test_features['cp_type'] == 'ctl_vehicle', 'sig_id'].values

# Categorical inputs; every other train column except the two id columns is
# treated as continuous.
cat_names = ['cp_type', 'cp_time', 'cp_dose']
cont_names = [column for column in train_features.columns
              if column not in (*cat_names, 'sig_id', 'drug_id')]

In [None]:
# Per-column mean and standard deviation of the continuous features, computed
# over train and test rows together and kept as NumPy arrays.
all_features = pd.concat([train_features[cont_names], test_features[cont_names]])
mean, std = all_features.mean().values, all_features.std().values

In [None]:
from torch.nn import BCELoss


class LabelSmoothingCrossEntropyEgm(nn.Module):
    """Binary cross-entropy on smoothed targets, with an optional mixup term.

    Targets are smoothed as ``t * (1 - smoothing) + prior * smoothing``.
    When ``y1`` and ``lam`` have been set on the instance, the element-wise
    loss against ``target`` is interpolated with the element-wise loss against
    ``y1`` using weight ``lam`` before the reduction is applied.

    Args:
        smoothing: smoothing strength.
        reduction: 'mean' (default), 'sum' or 'none'.
        prior: value the targets are pulled towards by the smoothing.
    """
    # Mixup state, written from outside before a forward pass.
    # Both stay None for plain (non-mixup) batches.
    y1 = None
    lam = None

    def __init__(self, smoothing:float=0.005, reduction='mean', prior:float=0.08):
        super().__init__()
        self.smoothing,self.reduction,self.prior = smoothing,reduction,prior
        # Keep the element-wise loss; reduction happens after the mixup blend.
        self.f = BCELoss(reduction = 'none')

    def _smooth(self, target):
        """Return `target` as float with label smoothing applied (no gradient)."""
        with torch.no_grad():
            return target.float() * (1.0 - self.smoothing) + self.prior * self.smoothing

    def _reduce(self, loss):
        """Apply the configured reduction to an element-wise loss tensor."""
        if self.reduction == 'mean':
            return loss.mean()
        if self.reduction == 'sum':
            return loss.sum()
        if self.reduction == 'none':
            return loss
        raise ValueError(f"Unsupported reduction: {self.reduction!r}")

    def forward(self, output, target):
        """Compute the loss of probabilities `output` against `target`.

        Args:
            output: predicted probabilities (the inner loss is BCELoss).
            target: targets with the same shape as `output`.

        Returns:
            The reduced loss, or the element-wise loss for reduction='none'.

        Raises:
            ValueError: if `reduction` is not 'mean', 'sum' or 'none'.
        """
        bce_loss = self.f(output, self._smooth(target))
        # Identity check: `== None` on a tensor goes through tensor comparison.
        if self.y1 is not None:
            bce_loss1 = self.f(output, self._smooth(self.y1))
            bce_loss = torch.lerp(bce_loss, bce_loss1, self.lam)
        return self._reduce(bce_loss)
    
    
 #0.001 0.05   
 #0.005 0.08
class ChangeLoss(Callback):
    """Swap the learner's loss function before every batch.

    Batches drawn from the validation loader (`dls[1]`) are scored with
    `valid_loss`; every other batch uses `train_loss`. Both loss objects are
    class attributes, so all instances of this callback share them.
    """
    # NOTE(review): confirm that the installed fastai version reads `_order`;
    # if it does not, this callback's position is decided by the order of the
    # callback list it is passed in.
    _order = 90 #Runs after normalization and cuda

    valid_loss = BCELossFlat()
    train_loss = LabelSmoothingCrossEntropyEgm()

    def before_batch(self, **kwargs):
        """Select the loss matching the loader that produced the current batch."""
        on_validation = self.learn.dls[1] == self.learn.dl
        self.learn.loss_func = self.valid_loss if on_validation else self.train_loss
class NormalizeCallback(Callback):
    """Standardise the continuous inputs of each batch with the module-level `mean` / `std`."""
    def before_batch(self, **kwargs):
        """Replace `learn.xb` with (categorical, standardised continuous)."""
        cat_part, cont_part = self.learn.xb
        # The arithmetic runs on a CPU copy; the result is cast to float32 and
        # moved back to the device the batch came from.
        scaled = (cont_part.cpu() - mean) / std
        self.learn.xb = (cat_part, scaled.float().to(cont_part.device))
a, b = None,None
class CatMixUp(Callback):
    """Mixup for tabular batches that only pairs rows with identical categories.

    For each training batch, every row is paired with a randomly chosen row
    that shares the same categorical code. Continuous features and targets are
    interpolated with a per-row weight drawn from Beta(alpha, alpha), and the
    PCA feature columns are recomputed from the mixed raw columns. The partner
    targets and the weights are also stored on the learner's loss function as
    `y1` and `lam`.

    The continuous block is assumed to be laid out as
    [genes_cols | cells_cols | pca_names_genes | pca_names_cells].
    """
    #run_after,run_valid = [Normalize],False
    # NOTE(review): `Normalize` comes from the fastai wildcard import; confirm
    # this constraint has any effect on callback ordering.
    run_before = [Normalize]
    def __init__(self, alpha=0.4): self.distrib = Beta(tensor(alpha), tensor(alpha))

    def before_batch(self, **kwargs):
        """Mix the current batch in place; does nothing outside the training loader."""
        if self.learn.dls[0] != self.learn.dl:
            return
        x_cat, x_cont = self.learn.xb
        targets, = self.learn.yb
        device = x_cont.device

        # One interpolation weight per row, shaped (batch, 1) for broadcasting.
        lam = self.distrib.sample((self.y.size(0),)).unsqueeze(-1).to(device)

        # Build the partner batch by permuting rows inside each category group.
        partner_cont = x_cont.detach().clone()
        partner_y = targets.detach().clone()
        codes = x_cat[:,0]*16 + x_cat[:,1]*4 + x_cat[:,2]  # joint code of the three categoricals
        for code in torch.unique(codes):
            rows = (codes==code).nonzero().view(-1)
            shuffled = rows[torch.randperm(len(rows))]
            partner_cont[rows] = partner_cont[shuffled]
            partner_y[rows] = partner_y[shuffled]

        out_cont = torch.lerp(x_cont, partner_cont, lam)

        # Column offsets come from the feature-name lists instead of literals.
        n_genes, n_cells = len(genes_cols), len(cells_cols)
        n_pca_genes, n_pca_cells = len(pca_names_genes), len(pca_names_cells)
        pca_start = n_genes + n_cells

        # The PCA columns of a mixed row must be the projection of its mixed raw
        # columns. Going through NumPy (float64, the same values the list
        # round-trip produced) avoids building Python lists for every batch.
        raw_genes = out_cont[:, :n_genes].detach().cpu().numpy().astype(np.float64)
        raw_cells = out_cont[:, n_genes:pca_start].detach().cpu().numpy().astype(np.float64)
        out_cont[:, pca_start:pca_start + n_pca_genes] = torch.tensor(
            pca_genes.transform(raw_genes)[:, :n_pca_genes])
        out_cont[:, -n_pca_cells:] = torch.tensor(pca_cells.transform(raw_cells))

        out_y = torch.lerp(targets.float(), partner_y.float(), lam)

        # Hand the partner targets and weights to the loss for its mixup term.
        self.learn.loss_func.lam = lam
        self.learn.loss_func.y1 = partner_y
        self.learn.xb = (x_cat, out_cont)
        self.learn.yb = (out_y,)

# One seed per training iteration. They must be unique: the fold is chosen as
# `ind % 5`, so a seed repeated five positions apart (37 used to sit at
# indices 4 and 9) retrains the same fold with the same seed and adds a
# duplicate member to the ensemble.
seeds = [42, 7, 9, 13, 37, 11, 5, 29, 31, 43, 41, 53]
BCE_LOSS = BCELoss(reduction='mean')#????

In [None]:
def get_data(fold, target_names, 
             procs = [Categorify, FillMissing],
             cat_names = cat_names,
             cont_names = cont_names):
    """Build the tabular DataLoaders with `fold` held out for validation.

    Args:
        fold: value of `df.kfold_nonscored` that marks the validation rows.
        target_names: target columns passed as `y_names`.
        procs: tabular preprocessing steps.
        cat_names: categorical input columns.
        cont_names: continuous input columns.

    Returns:
        The DataLoaders built from the module-level `df` with batch size 64.
    """
    held_out = df.kfold_nonscored == fold
    # NOTE(review): these are index labels of `df`; confirm the splitter
    # interprets `valid_idx` the same way before changing how `df` is indexed.
    val_idx = df[held_out].index
    return TabularDataLoaders.from_df(
        df, path=PATH,
        y_names=target_names,
        cat_names=cat_names,
        cont_names=cont_names,
        procs=procs,
        valid_idx=val_idx,
        bs=64,
    )
def get_cbs(do_mixup):
    """Return the callback list for a run, with or without categorical mixup.

    Args:
        do_mixup: when truthy, a `CatMixUp` is placed between the loss switch
            and the normalisation callback.

    Returns:
        A list of freshly constructed callbacks. The element order differs
        between the two variants and is kept exactly as the author wrote it.
    """
    normalize_cb, loss_switch_cb = NormalizeCallback(), ChangeLoss()
    if not do_mixup:
        return [normalize_cb, loss_switch_cb]
    # Author's note: "return [ch_loss_cb,ncb] <- when I talked about scores
    # I've changed exactly this line...."
    return [loss_switch_cb, CatMixUp(), normalize_cb]

# Accumulators filled by do_train_and_inf across calls.
test_scores = []   # one array of test predictions per inference iteration
results = []       # one recorded loss value per training iteration

def do_train_and_inf(num_iters=1, do_train=True, do_inference=False, cbs=get_cbs(False), 
                     lr=9e-3, epochs=5, target_names=target_scored_cols, 
                     use_pretr=False, pretr_model=False, file_name = 'something'):
    """Train and/or run test inference for `num_iters` seed/fold combinations.

    Iteration `ind` is seeded with `seeds[ind]` and validates on fold `ind % 5`.
    Each iteration builds a new tabular learner. With `use_pretr` the
    learner's network is replaced by a deep copy of `pretr_model`, keeping the
    new learner's own `layers[-2]` block so the output size matches
    `target_names`.

    Args:
        num_iters: number of iterations (must not exceed `len(seeds)`).
        do_train: fit with a one-cycle schedule, checkpointing the best
            `valid_loss`, and append the recorder's loss value to `results`.
        do_inference: load the checkpoint for this iteration and append its
            predictions on `test_features` to `test_scores`.
        cbs: callbacks handed to every learner created here.
        lr: maximum learning rate; the schedule uses slice(lr / 2.6**4, lr).
        epochs: epochs per iteration.
        target_names: target columns to train on.
        use_pretr: start from `pretr_model` instead of a fresh network.
        pretr_model: network to copy when `use_pretr` is set.
        file_name: checkpoint name prefix; the iteration index is appended.

    Returns:
        The network used by the last iteration (None when num_iters is 0).
    """
    global test_scores
    global results
    model = None
    model_dir = '/kaggle/working/' if TRAIN else '/kaggle/input/fastai-egm'
    for ind in tqdm(range(num_iters)):
        seed_everything(seeds[ind])
        dls = get_data(ind % 5, target_names = target_names) # Data
        config = tabular_config(ps=0.2)
        learn = tabular_learner(dls , y_range=(0,1), 
                                layers = [1024, 512, 512, 256],                                
                                loss_func = LabelSmoothingCrossEntropyEgm(),
                                config=config,
                                model_dir=model_dir,
                                cbs=cbs
                               ) # Model

        if use_pretr:
            print("Will change model")
            new_head = learn.model.layers[-2]
            learn.model = copy.deepcopy(pretr_model)
            learn.model.layers[-2] = new_head

        # Captured after the optional swap so the caller gets the network that
        # is actually trained, not the fresh one that was just discarded.
        model = learn.model

        name = file_name + str(ind)

        if do_train:
            save_best = SaveModelCallback(monitor='valid_loss', fname=name, mode='min')
            # fastai v2 names the one-cycle peak learning rate `lr_max`.
            learn.fit_one_cycle(epochs, lr_max=slice(lr/(2.6**4), lr), cbs=save_best) # Training
            results.append(learn.recorder.loss.value.item())

        if do_inference:
            learn.load(name) # Load best model
            test_dl = learn.dls.test_dl(test_features)
            preds = learn.get_preds(dl=test_dl) # prediction
            test_scores.append(preds[0].numpy())

    return model
    
    

In [None]:
# Stage 1: one 5-epoch run without mixup on the non-scored and scored targets
# together; the returned network seeds the next stage.
model = do_train_and_inf(
    num_iters=1,
    do_train=True,
    do_inference=False,
    cbs=get_cbs(False),
    lr=9e-3,
    epochs=5,
    target_names=target_nonscored_cols + target_scored_cols,
    use_pretr=False,
    pretr_model=None,
    file_name='pretrain_',
)


In [None]:
# Stage 2: ten 10-epoch runs with mixup on the scored targets only, each one
# starting from the stage-1 network and predicting the test set.
do_train_and_inf(
    num_iters=10,
    do_train=True,
    do_inference=True,
    cbs=get_cbs(True),
    lr=9e-3,
    epochs=10,
    target_names=target_scored_cols,
    use_pretr=True,
    pretr_model=model,
    file_name='pretrain_',
)
# Stack the per-iteration test predictions into one array.
test_sc = np.array(test_scores)

In [None]:
# Print every recorded loss with a decimal comma instead of a decimal point.
for r in results:
    as_text = str(r)
    print(as_text.replace('.', ','))

In [None]:
# Average the predictions over all inference iterations.
avg_prds = test_sc.mean(axis=0)

# Fill a copy of the sample submission with the averaged predictions.
submission = sample_submission.copy()
submission[target_scored_cols] = avg_prds

# Rows whose cp_type is the control vehicle get 0 for every scored target.
control_ids = test_features.loc[test_features['cp_type'] =='ctl_vehicle', 'sig_id']
is_control = submission['sig_id'].isin(control_ids)
submission.loc[is_control, train_targets_scored.columns[1:]] = 0
#submission['atp-sensitive_potassium_channel_antagonist'] = 0
#submission['erbb2_inhibitor'] = 0
submission.to_csv('submission.csv', index=False)