# Ensemble

In [1]:
from __future__      import division
from IPython.display import display
from matplotlib      import pyplot as plt
%matplotlib inline

import numpy  as np
import pandas as pd
import random, sys, os, re

# The test set has duplicates so we get the list of IDs in the sample file in order

In [2]:
id_list = []
with open('../submissions/Submission_Format.csv', 'r') as f:
    lines = f.read().splitlines()
    for line in lines:
        ID,prob = line.split(',')
        if ID == '': continue
        id_list.append(ID)

In [3]:
def get_filepaths(directory):
    """
    This function will generate the file names in a directory 
    tree by walking the tree either top-down or bottom-up. For each 
    directory in the tree rooted at directory top (including top itself), 
    it yields a 3-tuple list (dirpath, dirnames, filenames).
    """
    import os
    
    file_paths = []  # List which will store all of the full filepaths.

    # Walk the tree.
    for root, directories, files in os.walk(directory):
        for filename in files:
            # Join the two strings in order to form the full filepath.
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)  # Add it to the list.

    return file_paths 

# Get the list of submission files 

* ## remove the example file 
* ## and all ensembles

In [4]:
file_list = get_filepaths('../submissions')
file_list

['../submissions/submission_0.5670_SGDClassifier.csv',
 '../submissions/submission_1.7907_RandomForestClassifier.csv',
 '../submissions/submission_KNeighborsClassifier.csv',
 '../submissions/submission_KMeans.csv',
 '../submissions/Submission_Format.csv',
 '../submissions/submission_0.4566_nolearn.csv',
 '../submissions/submission_BaggingClassifier.csv',
 '../submissions/submission_0.4851_XGBClassifier_vanilla.csv',
 '../submissions/submission_0.6642_AdaBoostClassifier.csv',
 '../submissions/submission_0.5732_cosine_similarity.csv',
 '../submissions/submission_0.4648_GradientBoostingClassifier_engineering.csv',
 '../submissions/submission_0.4608_GradientBoostingClassifier.csv',
 '../submissions/submission_GradientBoostingClassifier_exponential.csv',
 '../submissions/submission_0.4896_EnsembleOfAverages.csv',
 '../submissions/submission_0.4729_ExtraTreesClassifier.csv',
 '../submissions/submission_1.1870_KNeighborsClassifier.csv']

In [5]:
# why do it more than once? For some reason it doesn't work if only run once. Who knows?
# ======================================================================================
for i in range(3):
    for file_name in file_list:
        if 'Format'   in file_name: file_list.remove(file_name)
        if 'Ensemble' in file_name: file_list.remove(file_name)
    
file_list

['../submissions/submission_0.5670_SGDClassifier.csv',
 '../submissions/submission_1.7907_RandomForestClassifier.csv',
 '../submissions/submission_KNeighborsClassifier.csv',
 '../submissions/submission_KMeans.csv',
 '../submissions/submission_0.4566_nolearn.csv',
 '../submissions/submission_BaggingClassifier.csv',
 '../submissions/submission_0.4851_XGBClassifier_vanilla.csv',
 '../submissions/submission_0.6642_AdaBoostClassifier.csv',
 '../submissions/submission_0.5732_cosine_similarity.csv',
 '../submissions/submission_0.4648_GradientBoostingClassifier_engineering.csv',
 '../submissions/submission_0.4608_GradientBoostingClassifier.csv',
 '../submissions/submission_GradientBoostingClassifier_exponential.csv',
 '../submissions/submission_0.4729_ExtraTreesClassifier.csv',
 '../submissions/submission_1.1870_KNeighborsClassifier.csv']

# -------------- Ensemble ALL the submissions --------------

# Find the average probability for all IDs

In [6]:
from collections import defaultdict

aggregates = defaultdict(list)
averages   = defaultdict(list)


# 1. collect the probabilities for each ID from all the submission files
# ======================================================================
for file_name in file_list:
    with open(file_name, 'r') as f:
        lines = f.read().splitlines()
        for line in lines:
            ID,prob = line.split(',')
            if ID == '': continue
            aggregates[ID].append(prob)
            
        
            
# 2. find the average of all the probabilities for each ID
# ========================================================
averages.update((ID, np.mean(map(float, probs))) for ID, probs in aggregates.items())

aggregates['1'],averages['1']

(['0.35',
  '0.1',
  '0.222222222222',
  '0.35',
  '0.458084821701',
  '0.252661172161',
  '0.173246413469',
  '0.494101309394',
  '0.35',
  '0.360423053959',
  '0.40501948695',
  '0.296876234606',
  '0.184227189057',
  '0.222222222222'],
 0.30136315183864287)

In [7]:
len(aggregates),len(averages)

(172, 172)

# Create a submission file of the ensemble of averages

In [8]:
f = open("../submissions/submission_EnsembleOfAveragesALL.csv", "w")

f.write(",Made Donation in March 2007\n")
for ID in id_list:
    f.write("{},{}\n".format(ID, averages[ID]))
    
f.close()

In [9]:
!ls -l ../submissions/

total 136
-rw-rw-r-- 1 george george 3814 Nov 13 16:09 submission_0.4566_nolearn.csv
-rw-rw-r-- 1 george george 3849 Nov 11 15:46 submission_0.4608_GradientBoostingClassifier.csv
-rw-rw-r-- 1 george george 3790 Nov 13 10:49 submission_0.4648_GradientBoostingClassifier_engineering.csv
-rw-rw-r-- 1 george george 3798 Nov 11 19:28 submission_0.4729_ExtraTreesClassifier.csv
-rw-rw-r-- 1 george george 3835 Nov 11 15:14 submission_0.4851_XGBClassifier_vanilla.csv
-rw-rw-r-- 1 george george 3829 Nov 11 18:35 submission_0.4896_EnsembleOfAverages.csv
-rw-rw-r-- 1 george george 1800 Nov 13 08:52 submission_0.5670_SGDClassifier.csv
-rw-rw-r-- 1 george george 1800 Nov 13 18:26 submission_0.5732_cosine_similarity.csv
-rw-rw-r-- 1 george george 3784 Nov 13 12:12 submission_0.6642_AdaBoostClassifier.csv
-rw-rw-r-- 1 george george 3437 Nov 13 08:32 submission_1.1870_KNeighborsClassifier.csv
-rw-rw-r-- 1 george george 2283 Nov 11 16:51 submission_1.7907_RandomForestClassifier.csv
-rw-rw-r--

# ---------- Ensemble the submissions with a score of 0.4... or 0.3... ----------

In [10]:
file_list

['../submissions/submission_0.5670_SGDClassifier.csv',
 '../submissions/submission_1.7907_RandomForestClassifier.csv',
 '../submissions/submission_KNeighborsClassifier.csv',
 '../submissions/submission_KMeans.csv',
 '../submissions/submission_0.4566_nolearn.csv',
 '../submissions/submission_BaggingClassifier.csv',
 '../submissions/submission_0.4851_XGBClassifier_vanilla.csv',
 '../submissions/submission_0.6642_AdaBoostClassifier.csv',
 '../submissions/submission_0.5732_cosine_similarity.csv',
 '../submissions/submission_0.4648_GradientBoostingClassifier_engineering.csv',
 '../submissions/submission_0.4608_GradientBoostingClassifier.csv',
 '../submissions/submission_GradientBoostingClassifier_exponential.csv',
 '../submissions/submission_0.4729_ExtraTreesClassifier.csv',
 '../submissions/submission_1.1870_KNeighborsClassifier.csv']

In [16]:
# why do it more than once? For some reason it doesn't work if only run once. Who knows?
# ======================================================================================
for i in range(3):
    for file_name in file_list:
        if 'Format'   in file_name: file_list.remove(file_name)
        if 'Ensemble' in file_name: file_list.remove(file_name)

        if ('0.4' not  in file_name) and ('0.3' not  in file_name): file_list.remove(file_name)
    
file_list

['../submissions/submission_0.4566_nolearn.csv',
 '../submissions/submission_0.4851_XGBClassifier_vanilla.csv',
 '../submissions/submission_0.4648_GradientBoostingClassifier_engineering.csv',
 '../submissions/submission_0.4608_GradientBoostingClassifier.csv',
 '../submissions/submission_0.4729_ExtraTreesClassifier.csv']

In [12]:
from collections import defaultdict

aggregates = defaultdict(list)
averages   = defaultdict(list)


# 1. collect the probabilities for each ID from all the submission files
# ======================================================================
for file_name in file_list:
    with open(file_name, 'r') as f:
        lines = f.read().splitlines()
        for line in lines:
            ID,prob = line.split(',')
            if ID == '': continue
            aggregates[ID].append(prob)
            
        
            
# 2. find the average of all the probabilities for each ID
# ========================================================
averages.update((ID, np.mean(map(float, probs))) for ID, probs in aggregates.items())

aggregates['1'],averages['1']

(['0.35',
  '0.458084821701',
  '0.173246413469',
  '0.360423053959',
  '0.40501948695',
  '0.184227189057'],
 0.32183349418933332)

In [13]:
len(aggregates),len(averages)

(172, 172)

In [14]:
f = open("../submissions/submission_EnsembleOfAveragesBEST.csv", "w")

f.write(",Made Donation in March 2007\n")
for ID in id_list:
    f.write("{},{}\n".format(ID, averages[ID]))
    
f.close()

In [15]:
!ls -l ../submissions/

total 140
-rw-rw-r-- 1 george george 3814 Nov 13 16:09 submission_0.4566_nolearn.csv
-rw-rw-r-- 1 george george 3849 Nov 11 15:46 submission_0.4608_GradientBoostingClassifier.csv
-rw-rw-r-- 1 george george 3790 Nov 13 10:49 submission_0.4648_GradientBoostingClassifier_engineering.csv
-rw-rw-r-- 1 george george 3798 Nov 11 19:28 submission_0.4729_ExtraTreesClassifier.csv
-rw-rw-r-- 1 george george 3835 Nov 11 15:14 submission_0.4851_XGBClassifier_vanilla.csv
-rw-rw-r-- 1 george george 3829 Nov 11 18:35 submission_0.4896_EnsembleOfAverages.csv
-rw-rw-r-- 1 george george 1800 Nov 13 08:52 submission_0.5670_SGDClassifier.csv
-rw-rw-r-- 1 george george 1800 Nov 13 18:26 submission_0.5732_cosine_similarity.csv
-rw-rw-r-- 1 george george 3784 Nov 13 12:12 submission_0.6642_AdaBoostClassifier.csv
-rw-rw-r-- 1 george george 3437 Nov 13 08:32 submission_1.1870_KNeighborsClassifier.csv
-rw-rw-r-- 1 george george 2283 Nov 11 16:51 submission_1.7907_RandomForestClassifier.csv
-rw-rw-r--