# Ensemble

In [16]:
from __future__      import division
from IPython.display import display
from matplotlib      import pyplot as plt
%matplotlib inline

import numpy  as np
import pandas as pd
import random, sys, os, re

# The test set has duplicates so we get the list of IDs in the sample file in order

In [17]:
id_list = []
with open('../submissions/Submission_Format.csv', 'r') as f:
    lines = f.read().splitlines()
    for line in lines:
        ID,prob = line.split(',')
        if ID == '': continue
        id_list.append(ID)

In [18]:
def get_filepaths(directory):
    """
    This function will generate the file names in a directory 
    tree by walking the tree either top-down or bottom-up. For each 
    directory in the tree rooted at directory top (including top itself), 
    it yields a 3-tuple list (dirpath, dirnames, filenames).
    """
    import os
    
    file_paths = []  # List which will store all of the full filepaths.

    # Walk the tree.
    for root, directories, files in os.walk(directory):
        for filename in files:
            # Join the two strings in order to form the full filepath.
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)  # Add it to the list.

    return file_paths 

# Get the list of submission files 

* ## remove the example file 
* ## and all ensembles

In [19]:
file_list = get_filepaths('../submissions')
file_list

['../submissions/submission_0.4457_LogisticRegressionCV.csv',
 '../submissions/submission_0.5670_SGDClassifier.csv',
 '../submissions/submission_1.7907_RandomForestClassifier.csv',
 '../submissions/submission_voting_ensemble_soft.csv',
 '../submissions/Submission_Format.csv',
 '../submissions/submission_EnsembleOfAveragesBEST.csv',
 '../submissions/submission_0.4885_BaggingClassifier.csv',
 '../submissions/submission_boosted_svc.csv',
 '../submissions/submission_0.4566_nolearn.csv',
 '../submissions/submission_voting_ensemble_hard.csv',
 '../submissions/submission_0.4851_XGBClassifier_vanilla.csv',
 '../submissions/submission_EnsembleOfAveragesALL.csv',
 '../submissions/submission_0.6642_AdaBoostClassifier.csv',
 '../submissions/submission_0.4411_LogisticRegression.csv',
 '../submissions/submission_0.5336_SVC.csv',
 '../submissions/submission_0.5732_cosine_similarity.csv',
 '../submissions/submission_0.4648_GradientBoostingClassifier_engineering.csv',
 '../submissions/submission_0.4608

In [20]:
# why do it more than once? For some reason it doesn't work if only run once. Who knows?
# ======================================================================================
for i in range(3):
    for file_name in file_list:
        if 'Format'   in file_name: file_list.remove(file_name)
        if 'Ensemble' in file_name: file_list.remove(file_name)
        if 'ensemble' in file_name: file_list.remove(file_name)
    
file_list

['../submissions/submission_0.4457_LogisticRegressionCV.csv',
 '../submissions/submission_0.5670_SGDClassifier.csv',
 '../submissions/submission_1.7907_RandomForestClassifier.csv',
 '../submissions/submission_0.4885_BaggingClassifier.csv',
 '../submissions/submission_boosted_svc.csv',
 '../submissions/submission_0.4566_nolearn.csv',
 '../submissions/submission_0.4851_XGBClassifier_vanilla.csv',
 '../submissions/submission_0.6642_AdaBoostClassifier.csv',
 '../submissions/submission_0.4411_LogisticRegression.csv',
 '../submissions/submission_0.5336_SVC.csv',
 '../submissions/submission_0.5732_cosine_similarity.csv',
 '../submissions/submission_0.4648_GradientBoostingClassifier_engineering.csv',
 '../submissions/submission_0.4608_GradientBoostingClassifier.csv',
 '../submissions/submission_bagged_gbc.csv',
 '../submissions/submission_0.4452_GradientBoostingClassifier_exponential.csv',
 '../submissions/submission_0.6289_KMeans.csv',
 '../submissions/submission_0.4729_ExtraTreesClassifier.c

# -------------- Ensemble ALL the submissions --------------

# Find the average probability for all IDs

In [21]:
from collections import defaultdict

aggregates = defaultdict(list)
averages   = defaultdict(list)


# 1. collect the probabilities for each ID from all the submission files
# ======================================================================
for file_name in file_list:
    with open(file_name, 'r') as f:
        lines = f.read().splitlines()
        for line in lines:
            ID,prob = line.split(',')
            if ID == '': continue
            aggregates[ID].append(prob)
            
        
            
# 2. find the average of all the probabilities for each ID
# ========================================================
averages.update((ID, np.mean(map(float, probs))) for ID, probs in aggregates.items())

aggregates['1'],averages['1']

(['0.355096586523',
  '0.35',
  '0.1',
  '0.252661172161',
  '0.229568971977',
  '0.458084821701',
  '0.173246413469',
  '0.494101309394',
  '0.308123096572',
  '0.201544480802',
  '0.35',
  '0.360423053959',
  '0.40501948695',
  '0.344323032548',
  '0.296876234606',
  '0.35',
  '0.184227189057',
  '0.222222222222'],
 0.30197322621894451)

In [22]:
len(aggregates),len(averages)

(172, 172)

# Create a submission file of the ensemble of averages

In [23]:
f = open("../submissions/submission_EnsembleOfAveragesALL.csv", "w")

f.write(",Made Donation in March 2007\n")
for ID in id_list:
    f.write("{},{}\n".format(ID, averages[ID]))
    
f.close()

In [24]:
!ls -l ../submissions/

total 192
-rw-rw-r-- 1 george george 3826 Nov 14 18:55 submission_0.4411_LogisticRegression.csv
-rw-rw-r-- 1 george george 3859 Nov 13 18:39 submission_0.4452_GradientBoostingClassifier_exponential.csv
-rw-rw-r-- 1 george george 3821 Nov 14 19:12 submission_0.4457_LogisticRegressionCV.csv
-rw-rw-r-- 1 george george 3814 Nov 13 16:09 submission_0.4566_nolearn.csv
-rw-rw-r-- 1 george george 3849 Nov 11 15:46 submission_0.4608_GradientBoostingClassifier.csv
-rw-rw-r-- 1 george george 3790 Nov 13 10:49 submission_0.4648_GradientBoostingClassifier_engineering.csv
-rw-rw-r-- 1 george george 3798 Nov 11 19:28 submission_0.4729_ExtraTreesClassifier.csv
-rw-rw-r-- 1 george george 3835 Nov 11 15:14 submission_0.4851_XGBClassifier_vanilla.csv
-rw-rw-r-- 1 george george 3701 Nov 14 10:03 submission_0.4885_BaggingClassifier.csv
-rw-rw-r-- 1 george george 3829 Nov 11 18:35 submission_0.4896_EnsembleOfAverages.csv
-rw-rw-r-- 1 george george 3787 Nov 14 15:22 submission_0.5336_SVC.csv
-rw-

# ---------- Ensemble the submissions with a score of 0.4... or 0.3... ----------

In [25]:
file_list

['../submissions/submission_0.4457_LogisticRegressionCV.csv',
 '../submissions/submission_0.5670_SGDClassifier.csv',
 '../submissions/submission_1.7907_RandomForestClassifier.csv',
 '../submissions/submission_0.4885_BaggingClassifier.csv',
 '../submissions/submission_boosted_svc.csv',
 '../submissions/submission_0.4566_nolearn.csv',
 '../submissions/submission_0.4851_XGBClassifier_vanilla.csv',
 '../submissions/submission_0.6642_AdaBoostClassifier.csv',
 '../submissions/submission_0.4411_LogisticRegression.csv',
 '../submissions/submission_0.5336_SVC.csv',
 '../submissions/submission_0.5732_cosine_similarity.csv',
 '../submissions/submission_0.4648_GradientBoostingClassifier_engineering.csv',
 '../submissions/submission_0.4608_GradientBoostingClassifier.csv',
 '../submissions/submission_bagged_gbc.csv',
 '../submissions/submission_0.4452_GradientBoostingClassifier_exponential.csv',
 '../submissions/submission_0.6289_KMeans.csv',
 '../submissions/submission_0.4729_ExtraTreesClassifier.c

In [26]:
# why do it more than once? For some reason it doesn't work if only run once. Who knows?
# ======================================================================================
for i in range(3):
    for file_name in file_list:
        if 'Format'   in file_name: file_list.remove(file_name)
        if 'Ensemble' in file_name: file_list.remove(file_name)

        # scores of 0.4... or 0.3... are good
        # files with SEED... are good-scoring models that were re-run with different random seeds
        if ('0.4' not  in file_name) and \
           ('0.3' not  in file_name) and \
           ('SEED' not in file_name):       
                file_list.remove(file_name)
    
file_list

['../submissions/submission_0.4457_LogisticRegressionCV.csv',
 '../submissions/submission_0.4885_BaggingClassifier.csv',
 '../submissions/submission_0.4566_nolearn.csv',
 '../submissions/submission_0.4851_XGBClassifier_vanilla.csv',
 '../submissions/submission_0.4411_LogisticRegression.csv',
 '../submissions/submission_0.4648_GradientBoostingClassifier_engineering.csv',
 '../submissions/submission_0.4608_GradientBoostingClassifier.csv',
 '../submissions/submission_0.4452_GradientBoostingClassifier_exponential.csv',
 '../submissions/submission_0.4729_ExtraTreesClassifier.csv']

In [27]:
from collections import defaultdict

aggregates = defaultdict(list)
averages   = defaultdict(list)


# 1. collect the probabilities for each ID from all the submission files
# ======================================================================
for file_name in file_list:
    with open(file_name, 'r') as f:
        lines = f.read().splitlines()
        for line in lines:
            ID,prob = line.split(',')
            if ID == '': continue
            aggregates[ID].append(prob)
            
        
            
# 2. find the average of all the probabilities for each ID
# ========================================================
averages.update((ID, np.mean(map(float, probs))) for ID, probs in aggregates.items())

aggregates['1'],averages['1']

(['0.355096586523',
  '0.252661172161',
  '0.458084821701',
  '0.173246413469',
  '0.308123096572',
  '0.360423053959',
  '0.40501948695',
  '0.296876234606',
  '0.184227189057'],
 0.31041756166644446)

In [28]:
len(aggregates),len(averages)

(172, 172)

In [29]:
f = open("../submissions/submission_EnsembleOfAveragesBEST.csv", "w")

f.write(",Made Donation in March 2007\n")
for ID in id_list:
    f.write("{},{}\n".format(ID, averages[ID]))
    
f.close()

In [30]:
!ls -l ../submissions/

total 192
-rw-rw-r-- 1 george george 3826 Nov 14 18:55 submission_0.4411_LogisticRegression.csv
-rw-rw-r-- 1 george george 3859 Nov 13 18:39 submission_0.4452_GradientBoostingClassifier_exponential.csv
-rw-rw-r-- 1 george george 3821 Nov 14 19:12 submission_0.4457_LogisticRegressionCV.csv
-rw-rw-r-- 1 george george 3814 Nov 13 16:09 submission_0.4566_nolearn.csv
-rw-rw-r-- 1 george george 3849 Nov 11 15:46 submission_0.4608_GradientBoostingClassifier.csv
-rw-rw-r-- 1 george george 3790 Nov 13 10:49 submission_0.4648_GradientBoostingClassifier_engineering.csv
-rw-rw-r-- 1 george george 3798 Nov 11 19:28 submission_0.4729_ExtraTreesClassifier.csv
-rw-rw-r-- 1 george george 3835 Nov 11 15:14 submission_0.4851_XGBClassifier_vanilla.csv
-rw-rw-r-- 1 george george 3701 Nov 14 10:03 submission_0.4885_BaggingClassifier.csv
-rw-rw-r-- 1 george george 3829 Nov 11 18:35 submission_0.4896_EnsembleOfAverages.csv
-rw-rw-r-- 1 george george 3787 Nov 14 15:22 submission_0.5336_SVC.csv
-rw-