In [1]:
import sys

from google.colab import drive

# Mount Google Drive so that files stored on it are reachable from the runtime.
drive.mount("/content/drive")

# Put the deeplens project directory first on the import path so modules stored
# there can be imported. The membership check keeps the cell idempotent:
# re-running it no longer stacks duplicate entries in sys.path.
DEEPLENS_DIR = '/content/drive/My Drive/Colab Notebooks/deeplens/'
if DEEPLENS_DIR not in sys.path:
    sys.path.insert(0, DEEPLENS_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, 
and then re-execute this cell.
Your runtime has 13.6 gigabytes of available RAM

To enable a high-RAM runtime, select the Runtime > "Change runtime type"
menu, and then select High-RAM in the Runtime shape dropdown. Then, 
re-execute this cell.


In [3]:
from IPython.display import clear_output
!pip3 install torch==1.2.0+cu92 torchvision==0.4.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html
!pip3 install lenstronomy
!pip install h5py
!pip install --upgrade tables
!pip install --upgrade pandas tables
!pip install -U PyYAML
!conda install numba & conda install cudatoolkit

%cd  drive/MyDrive/'Colab Notebooks'/deeplens/fastell4py/
! python setup.py install
%cd ..
clear_output
!pip install scikit-multilearn

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Requirement already up-to-date: tables in /usr/local/lib/python3.7/dist-packages (3.6.1)
Requirement already up-to-date: pandas in /usr/local/lib/python3.7/dist-packages (1.2.4)
Requirement already up-to-date: tables in /usr/local/lib/python3.7/dist-packages (3.6.1)
Requirement already up-to-date: PyYAML in /usr/local/lib/python3.7/dist-packages (5.4.1)
/bin/bash: conda: command not found
/bin/bash: conda: command not found
/content/drive/MyDrive/Colab Notebooks/deeplens/fastell4py
[39mrunning install[0m
[39mrunning bdist_egg[0m
[39mrunning egg_info[0m
[39mrunning build_src[0m
[39mbuild_src[0m
[39mbuilding extension "fastell4py._fastell" sources[0m
[39mf2py options: [][0m
[39m  adding 'build/src.linux-x86_64-3.7/build/src.linux-x86_64-3.7/fortranobject.c' to sources.[0m
[39m  adding 'build/src.linux-x86_64-3.7/build/src.linux-x86_64-3.7' to include_dirs.[0m
[39m  adding 'build/src.linux-x86_64-3.7/_

# Ensemble Machine learning

In this notebook, we try to improve predictions through ensemble machine learning methods. Combining uncorrelated models yields less biased estimates, which improves the metrics. Increasing the number of models should bring better results in future applications.

    * Problem transformation :
        Binary relevance
    * Classifiers :
        Ridge Regression 
            AUC : 
            AUC mass : - AUC source :
        Random forest
            AUC : 
            AUC mass : - AUC source :
        Naive Bayes
            AUC : 
            AUC mass : - AUC source :

## 0. Import


In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader
import torch.nn as nn


from helpers.data_generation.file_management import read_hdf5
from helpers.data_generation.error_generation_chi2 import Residual, CombineDataset
from helpers.model.helpers_model import NeuralNet

import warnings
warnings.filterwarnings('ignore')


from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import cross_val_score, RepeatedKFold, KFold

## 1. Predictions Generation

In [5]:
# Settings of the data set; `size` and `ratio` are also used to build the
# identifier under which the data set is read back in the next cell.
size = 10000
ratio = 0.75
# Passed to Residual.build as `per_error` — presumably one error percentage per
# perturbed quantity; confirm against helpers.data_generation.error_generation_chi2.
percent = np.array([0.005, 0.015, 0.005])

# Batch size of the test DataLoader.
batch_size = 64

# NOTE(review): Residual.build presumably generates and stores the residual
# maps for these settings — confirm in the helper module.
res = Residual()
res.build(size, ratio=ratio, per_error=percent)

In [6]:
# Identifier built from the configuration, e.g. "S10000R75" for size=10000, ratio=0.75.
str_ID = f"S{size}R{int(ratio * 100)}"
final_array, metadata = read_hdf5(str_ID)

# Add a running integer 'ID' column (0 .. n_samples-1); it is handed to
# CombineDataset together with the 'class' column name.
metadata['ID'] = np.arange(final_array.shape[0])
data_set = CombineDataset(metadata, 'ID', 'class', final_array)

# Keep only the 15 % held-out part; the 85 % training part is discarded.
# The fixed random_state makes the split reproducible.
_, data_test = train_test_split(data_set, train_size=0.85, random_state=42)
loader_test = DataLoader(
    data_test,
    batch_size=batch_size,
    num_workers=8,
    drop_last=True,   # incomplete final batch is dropped
    pin_memory=True,
)

In [7]:
models_name = ['BasicCNN', 'SqueezeNet', 'AlexNet', 'ResNet18', 'GoogleNet', 'VGG11', 'DenseNet121']

# One column of predictions per network; the 'label' column is added after the loop.
mass_pred = pd.DataFrame()
source_pred = pd.DataFrame()

# Zero-padding of 80 pixels on each side, applied to the inputs of every
# network except BasicCNN. The layer has no parameters, so one instance is
# shared by all batches instead of being rebuilt inside the loop.
pad_images = nn.ZeroPad2d(80)

for model_i in models_name:
    netbasic = NeuralNet(model_i, 'SGD/momentum')
    netbasic.load_checkpoint('_optimal')

    predictions = []; targets = []
    with torch.no_grad():
        for images, _, labels in loader_test:
            # Names are compared by value (`!=` / `==`). The previous `is not`
            # tested object identity, which only works by accident of string
            # interning and raises a SyntaxWarning on Python >= 3.8.
            if model_i != 'BasicCNN':
                images = pad_images(images)

            if model_i == 'GoogleNet':
                # This network returns three outputs; only the first one is kept.
                outputs, _, _ = netbasic.net(images)
            else:
                outputs = netbasic.net(images)

            predictions.extend(outputs.cpu().numpy())
            targets.extend(labels.cpu().numpy())

    # Column 0 is stored as the mass prediction, column 1 as the source prediction.
    pred_array = np.asarray(predictions)
    mass_pred[model_i] = pred_array[:, 0].tolist()
    source_pred[model_i] = pred_array[:, 1].tolist()

# Labels come from the last pass over loader_test; the loader is created
# without shuffling, so the sample order is the same for every network.
target_array = np.asarray(targets)
mass_pred['label'] = target_array[:, 0].tolist()
source_pred['label'] = target_array[:, 1].tolist()



## 1. Source Error

### 1.1 Parameters selection

##### Random Forest


    depth : 
    number of estimator : 
    maximum features : 

In [None]:
# Coordinate-wise search of the random-forest hyper-parameters on the source
# predictions. Every sweep tunes max_depth, then n_estimators, then
# max_features; a candidate value is kept only when it raises the mean
# 10-fold cross-validated ROC AUC above the best value seen so far.
best_auc = 0  # named best_auc so the builtin `max` is not shadowed
rf_params = {'max_depth': 1, 'n_estimators': 100, 'max_features': 2}
search_space = {
    'max_depth': range(1, 200),
    'n_estimators': range(1, 200),
    # The forest sees one input column per network, so values above
    # len(models_name) are either rejected by scikit-learn or equivalent to
    # using every column (depending on the version); they are not searched.
    'max_features': range(1, len(models_name) + 1),
}
cv = KFold(n_splits=10)
for sweep in range(20):  # at most 20 sweeps over the three parameters
    improved = False
    for param_name, candidates in search_space.items():
        for value in candidates:
            trial_params = dict(rf_params, **{param_name: value})
            # Fixed seed: score differences then come from the parameters
            # rather than from the randomness of the forest.
            clf = RandomForestClassifier(random_state=42, **trial_params)
            # Mean ROC AUC over the 10 folds
            scores = cross_val_score(clf, source_pred[models_name], source_pred['label'], scoring='roc_auc', cv=cv, n_jobs=-1)
            mean_auc = np.mean(scores)
            if best_auc < mean_auc:
                best_auc = mean_auc
                rf_params[param_name] = value
                improved = True
    # With a fixed seed, a sweep that changed nothing would be repeated
    # identically by every following sweep, so the search can stop here.
    if not improved:
        break

# Names read by the final-results cell.
max_depth = rf_params['max_depth']
max_estimator = rf_params['n_estimators']
max_features = rf_params['max_features']

print('Optimal max depth : ', max_depth, '- Optimal number of estimators :', max_estimator, ' - Optimal maximum number of features: ', max_features)
# The score is a ROC AUC (scoring='roc_auc'), so it is labelled as such.
print('AUC', best_auc)

##### Ridge Regression


    lambda : 
    weights :

In [None]:
# Choose the ridge penalty by 10-fold cross-validation scored with ROC AUC.
# Candidates: 1000 values log-spaced between 1e-2 and 1e1; no intercept is fitted.
cv_ridge = KFold(n_splits=10)
gs_ridge = RidgeCV(
    alphas=list(np.logspace(-2, 1, 1000)),
    fit_intercept=False,
    scoring='roc_auc',
    cv=cv_ridge,
)

# Fit on the per-network source predictions against the source label.
gs_ridge.fit(source_pred[models_name], source_pred['label'])

print("Best lambda :", gs_ridge.alpha_, "\n")
print("Optimal weight:", gs_ridge.coef_, "\n")

### 1.2 Final results

    Ridge : 
        AUC :
    RandomForest :
        AUC :
    Naive Bayes :
        AUC :


In [None]:
# Second-level models evaluated on the per-network source predictions.
classifiers = {
    # fit_intercept=False matches the RidgeCV search that selected alpha;
    # with the default intercept the tuned alpha would be applied to a
    # different model than the one it was chosen for.
    'Ridge': Ridge(alpha=gs_ridge.alpha_, fit_intercept=False),
    # Fixed seed so the reported AUC is reproducible from run to run.
    'RandomForest': RandomForestClassifier(max_depth=max_depth, n_estimators=max_estimator,
                                           max_features=max_features, random_state=42),
    'GaussianNB': GaussianNB(),
}

cv = KFold(n_splits=10)
for key, classifier in classifiers.items():
    # Mean ROC AUC over the 10 folds
    scores = cross_val_score(classifier, source_pred[models_name], source_pred['label'], scoring='roc_auc', cv=cv, n_jobs=-1)
    AUC = np.mean(scores)
    print('Binary Relevance - ', key , ': ', AUC)

## 2. Mass error

### 2.1 Parameters selection

##### Random Forest


    depth : 
    number of estimator : 
    maximum features : 

In [None]:
# Coordinate-wise search of the random-forest hyper-parameters on the mass
# predictions. Every sweep tunes max_depth, then n_estimators, then
# max_features; a candidate value is kept only when it raises the mean
# 10-fold cross-validated ROC AUC above the best value seen so far.
best_auc = 0  # named best_auc so the builtin `max` is not shadowed
rf_params = {'max_depth': 1, 'n_estimators': 100, 'max_features': 2}
search_space = {
    'max_depth': range(1, 200),
    'n_estimators': range(1, 200),
    # The forest sees one input column per network, so values above
    # len(models_name) are either rejected by scikit-learn or equivalent to
    # using every column (depending on the version); they are not searched.
    'max_features': range(1, len(models_name) + 1),
}
cv = KFold(n_splits=10)
for sweep in range(20):  # at most 20 sweeps over the three parameters
    improved = False
    for param_name, candidates in search_space.items():
        for value in candidates:
            trial_params = dict(rf_params, **{param_name: value})
            # Fixed seed: score differences then come from the parameters
            # rather than from the randomness of the forest.
            clf = RandomForestClassifier(random_state=42, **trial_params)
            # Mean ROC AUC over the 10 folds
            scores = cross_val_score(clf, mass_pred[models_name], mass_pred['label'], scoring='roc_auc', cv=cv, n_jobs=-1)
            mean_auc = np.mean(scores)
            if best_auc < mean_auc:
                best_auc = mean_auc
                rf_params[param_name] = value
                improved = True
    # With a fixed seed, a sweep that changed nothing would be repeated
    # identically by every following sweep, so the search can stop here.
    if not improved:
        break

# Names read by the final-results cell.
max_depth = rf_params['max_depth']
max_estimator = rf_params['n_estimators']
max_features = rf_params['max_features']

print('Optimal max depth : ', max_depth, '- Optimal number of estimators :', max_estimator, ' - Optimal maximum number of features: ', max_features)
# The score is a ROC AUC (scoring='roc_auc'), so it is labelled as such.
print('AUC', best_auc)

##### Ridge Regression

    lambda : 
    weights :

In [None]:
# Choose the ridge penalty by 10-fold cross-validation scored with ROC AUC.
# Candidates: 1000 values log-spaced between 1e-2 and 1e1; no intercept is fitted.
cv_ridge = KFold(n_splits=10)
gs_ridge = RidgeCV(alphas = [i for i in np.logspace(-2,1, 1000)], fit_intercept = False, scoring = 'roc_auc', cv = cv_ridge)

# Fit on the per-network mass predictions against the mass label.
gs_ridge.fit( mass_pred[models_name], mass_pred['label'])

print("Best lambda :", gs_ridge.alpha_, "\n")
# `print` was misspelled as `prin`, which raised a NameError after the fit.
print("Optimal weight:", gs_ridge.coef_, "\n")

### 2.2 Final results
    Ridge : 
        AUC :
    RandomForest :
        AUC :
    Naive Bayes :
        AUC :

In [None]:
# Second-level models evaluated on the per-network mass predictions.
classifiers = {
    # fit_intercept=False matches the RidgeCV search that selected alpha;
    # with the default intercept the tuned alpha would be applied to a
    # different model than the one it was chosen for.
    'Ridge': Ridge(alpha=gs_ridge.alpha_, fit_intercept=False),
    # Fixed seed so the reported AUC is reproducible from run to run.
    'RandomForest': RandomForestClassifier(max_depth=max_depth, n_estimators=max_estimator,
                                           max_features=max_features, random_state=42),
    'GaussianNB': GaussianNB(),
}

cv = KFold(n_splits=10)
for key, classifier in classifiers.items():
    # Mean ROC AUC over the 10 folds
    scores = cross_val_score(classifier, mass_pred[models_name], mass_pred['label'], scoring='roc_auc', cv=cv, n_jobs=-1)
    AUC = np.mean(scores)
    print('Binary Relevance - ', key , ': ', AUC)

### 4. Label Power Set