# How Many Events Are Good Enough?
The number of events directly affects the length of training. This notebook explores how performance varies with the number of events.

## Initialization

In [80]:
from bdt_training_scikit_tools import plot_training_performance, load_default_samples, test_train_samples, prep_samples, default_training, calc_performance, calc_performance_for_run
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## Load Data Samples
Load all the data we need.

In [76]:
def fraction (fractionGoal, maxCount=300):
    '''Build a tuple of divisors that together select roughly `fractionGoal` of all event numbers.

    An event number counts as selected when it is evenly divisible by at least one of the
    returned divisors. Meant to be used against an EventNumber to evenly pare down the
    number of events.

    Args
        fractionGoal - the fraction of events we'd like to see, between 0.0 and 1.0 inclusive
        maxCount - divisors 1 .. maxCount-1 are tried, and the achieved fraction is measured
                   on the integers 0 .. maxCount-1 (default 300)

    Returns
        seq - tuple of integer divisors in increasing order. Empty when no divisor fits,
              e.g. fractionGoal == 0 or fractionGoal < 1/(maxCount-1).

    Raises
        ValueError - if fractionGoal is NaN or lies outside [0.0, 1.0]
    '''
    # A chained comparison (instead of two tests joined with `|`) also rejects NaN,
    # which would otherwise slip through and silently yield an empty tuple.
    if not (0.0 <= fractionGoal <= 1.0):
        raise ValueError("Fraction must be between 0.0 and 1.0 (not {0})".format(fractionGoal))

    seq = ()
    remaining = fractionGoal
    for divisor in range(1, maxCount):
        # Greedy step: only take a divisor whose nominal share (1/divisor) still fits
        # in the part of the goal that has not been covered yet.
        if 1.0/divisor > remaining:
            continue
        seq = seq + (divisor,)

        # Re-measure the covered share directly so overlaps between divisors
        # (numbers divisible by more than one of them) are only counted once.
        selected = sum(1 for n in range(maxCount) if any(n % d == 0 for d in seq))
        remaining = fractionGoal - selected/maxCount

    return seq

In [63]:
# Load the default samples into all_events; each element is later filtered on its
# EventNumber column.
# NOTE(review): the meaning of the "20" argument is not visible in this notebook --
# confirm against bdt_training_scikit_tools.load_default_samples.
all_events = load_default_samples("20")

BIB: 100000 events
Multijet: 100000 events
Signal: 100000 events


In [99]:
def calcDFFilter (df, seq):
    '''Build a boolean mask marking the rows of df that should be kept.

    A row is kept when its EventNumber is evenly divisible by at least one divisor in
    seq. This matches how fraction() sizes its divisor tuple: the divisible events are
    the ones that make up the requested fraction.

    Args
        df - DataFrame with an EventNumber column
        seq - iterable of integer divisors (may be empty)

    Returns
        mask - boolean Series indexed like df; True means keep the row. All False when
               seq is empty, so no events are selected.
    '''
    # Start from "keep nothing" so an empty seq is well defined instead of failing.
    keep = pd.Series(False, index=df.index)
    for divisor in seq:
        # OR the per-divisor tests together: divisible by ANY divisor means keep.
        keep = keep | (df.EventNumber % divisor == 0)
    return keep

def get_fraction_of_events(fractionToUse):
    '''Thin every loaded sample and split the result into training and testing sets.

    The divisor sequence comes from fraction(fractionToUse); each DataFrame in the
    module-level all_events is masked with calcDFFilter using that sequence, and the
    filtered samples are handed to test_train_samples.

    Args
        fractionToUse - fraction of the full data sample we should be using

    Returns
        Whatever test_train_samples returns for the filtered samples. Per the original
        notes this is a training triple and a testing triple of events -- confirm
        against bdt_training_scikit_tools.test_train_samples.
    '''
    divisors = fraction(fractionToUse)

    # Mask each sample on its own EventNumber column and keep only the selected rows.
    thinned_samples = [
        sample[calcDFFilter(sample, divisors)]
        for sample in all_events
    ]
    return test_train_samples(thinned_samples)

In [102]:
# Try the event thinning with a 50% goal.
# NOTE(review): the traceback recorded for this cell looks like it predates the current
# fraction(), which returns a tuple rather than a function -- re-run on a fresh kernel
# to confirm.
r = get_fraction_of_events(0.5)
#all_events[0]

TypeError: 'function' object is not iterable

In [84]:
# Scratch check: what any() gives on an empty Series.
pd.Series([]).any()

False