# Modular Blend

This notebook offers some minor improvements over existing blend kernels and has been refactored for readability.

## Improvements

* Easy weighting of predictions
* Easy modification of blend technique

# Credit

Thanks to these kernels:

* [https://www.kaggle.com/saurabh502/why-no-blend](https://www.kaggle.com/saurabh502/why-no-blend)
* [https://www.kaggle.com/roydatascience/new-blend](https://www.kaggle.com/roydatascience/new-blend)
* [https://www.kaggle.com/ashishpatel26/two-style-of-blending-and-double-blend](https://www.kaggle.com/ashishpatel26/two-style-of-blending-and-double-blend)

In [None]:
import numpy as np
import pandas as pd
import os
from scipy.stats import rankdata

In [None]:
# Name(s) of the prediction column(s) read from each submission file and
# written to the final submission.
LABELS = ["HasDetections"]

In [None]:
# IPython shell escape: list the contents of the input directory
!ls ../input

In [None]:
# Choose the prediction files to blend; each entry is (file path, weight).
# Comment an entry out to exclude it from the blend.
_blend_inputs = [
    ('../input/detecting-malwares-with-ftrl-proximal/submission.csv', 1),
    ('../input/hung-the-nguyen/submission_lgbm_5.csv', 0.3),
    ('../input/hung-the-nguyen/submission_lgbm_6.csv', 0.3),
    ('../input/hung-the-nguyen/submission_lgbm_7.csv', 0.3),
    ('../input/hung-the-nguyen/submission_lgbm_8.csv', 0.3),
    ('../input/hung-the-nguyen/submission_lgbm_9.csv', 0.8),
    #('../input/is-this-malware-eda-fe-and-lgb-updated/lgb_rank.csv', 1),
    ('../input/malware-predictions-3500-trees/submission0.72968.csv', 1),
    ('../input/outputs-for-microsoft/submission_ashish_kfold.csv', 1),
]
preds_df = pd.DataFrame(_blend_inputs, columns=['fn', 'weight'])

preds_df.head(10)

In [None]:
# Load every listed submission file and keep only its LABELS column(s)
# as a numpy array in the new 'data' column.
def _load_label_values(path):
    """Read the CSV at ``path`` and return its ``LABELS`` columns as an array."""
    frame = pd.read_csv(path)
    return frame[LABELS].values

preds_df['data'] = preds_df['fn'].apply(_load_label_values)
preds_df.head()

In [None]:
# blend the predictions using the specified function foo
def blend(preds_df, foo, save_as=None):
    """Blend several prediction arrays into one weighted average.

    Each row's ``data`` array is transformed with ``foo``, scaled by the
    row's ``weight`` and accumulated; the sum is then divided by the total
    weight.

    Args:
        preds_df: DataFrame with the columns ``fn`` (label printed while
            blending), ``weight`` (numeric weight) and ``data`` (array of
            predictions).
        foo: Callable applied to each ``data`` array before weighting. Its
            result must broadcast to the shape of the first row's ``data``.
        save_as: Optional output path. When given, the blended predictions
            are passed to ``save_submission``.

    Returns:
        A float64 ``numpy.ndarray`` shaped like the first row's ``data``.

    Raises:
        ValueError: If ``preds_df`` has no rows.
    """
    if preds_df.empty:
        raise ValueError('preds_df must contain at least one prediction to blend')

    # Positional access keeps this working on filtered / re-indexed frames.
    first = preds_df['data'].iloc[0]
    # Always accumulate in float: an integer accumulator would make the
    # in-place add/divide below fail on integer-valued predictions.
    result = np.zeros(np.shape(first), dtype=np.float64)
    for _, row in preds_df.iterrows():
        print(f'Blending: {row["fn"]}...')
        result += row['weight'] * foo(row['data'])
    result /= preds_df['weight'].sum()

    if save_as is not None:
        save_submission(result, save_as)
    return result

def save_submission(predictions, save_as):
    """Write ``predictions`` to ``save_as`` in the sample-submission layout.

    The sample submission CSV is loaded as a template, its ``LABELS``
    column(s) are overwritten with ``predictions``, and the frame is written
    to ``save_as`` without the index column.
    """
    template_path = '../input/microsoft-malware-prediction/sample_submission.csv'
    template = pd.read_csv(template_path)
    template[LABELS] = predictions
    template.to_csv(save_as, index=False)
    print(f'Saved submission file: {save_as}')

In [None]:
# these functions can be passed as an argument to blend

def norm_rank(preds):
    """Return the ranks of ``preds`` divided by ``len(preds)`` as a column.

    Ranks come from ``scipy.stats.rankdata`` (ties share their average rank);
    the result is a 2-D array with a single column.
    """
    ranks = rankdata(preds)
    scaled = ranks / len(preds)
    return scaled.reshape(-1, 1)

# Add any other blending functions you want to try, e.g.:

def pass_pred(preds):
    """Identity transform: hand the predictions back untouched.

    Passing this to ``blend`` yields a plain weighted average of the raw
    predictions.
    """
    unchanged = preds
    return unchanged

def avg_pred_rank(preds):
    """Return the element-wise mean of ``preds`` and ``norm_rank(preds)``."""
    rank_component = norm_rank(preds)
    combined = preds + rank_component
    return combined / 2

In [None]:
# Produce one blended submission per blending strategy.
# You can swap out norm_rank for any other implemented method.
blend_norm_rank = blend(preds_df, norm_rank, save_as='blend_norm_rank.csv')
print(blend_norm_rank[:5])

# Plain weighted average of the raw predictions.
blend_avg_pred = blend(preds_df, pass_pred, save_as='blend_avg_pred.csv')
print(blend_avg_pred[:5])

# Weighted average of (prediction + normalized rank) / 2.
blend_avg_pred_rank = blend(preds_df, avg_pred_rank, save_as='blend_avg_pred_rank.csv')
print(blend_avg_pred_rank[:5])