# Part 1 Null RIGs using synthetic data (the features are given)
## Estimating the number of overfit features for a given pipeline
* Can include the p-value confidence intervals
* Treat empirically derived features as if they were pre-specificed 

In [1]:
import sparkbeyond._api2.classes as sb
import pandas as pd
import numpy as np
import time

# For my API token
import os

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

np.random.seed(seed=42)

In [2]:
# Import the needed class from the sampling_toolbox
from sampling_toolbox import PermutationObject, operational_log_number_of_features

## Titanic
We create synthetic data for titanic based on the number of rows of the training data and the support of the minority class in the target.

In [3]:
%%time
titanic = PermutationObject()

titanic.set_data_parameters(nrows = 720, minority_class = 0.37)

titanic.create_synthetic_data()

CPU times: user 15.3 ms, sys: 5.55 ms, total: 20.9 ms
Wall time: 29 ms


### Run for 1000 permutations
We will run for now and look into why 1000 was chosen later

In [4]:
%%time
titanic.calculate_null_rigs(permutations = 1000)

Max null RIG 		= 0.012108454756697327
Median null RIG 	= 0.0004783449261842156
1 in 1000 null RIG 	= 0.00876674555343226
Gain threshold 		= 0.0005
CPU times: user 12.3 s, sys: 63.7 ms, total: 12.3 s
Wall time: 12.4 s


In [5]:
titanic.null_rigs_comparison()

441 in 1000 null RIGs greater than 0.0005 threshold


In [6]:
titanic.gain_threshold = 0.01
titanic.null_rigs_comparison()

1 in 1000 null RIGs greater than 0.01 threshold


### Run for a larger number of permutations

In [7]:
%%time
titanic.calculate_null_rigs(permutations = 10_000)

Max null RIG 		= 0.018037440508354128
Median null RIG 	= 0.0004783449261842156
1 in 10000 null RIG 	= 0.016612823855212518
Gain threshold 		= 0.01
CPU times: user 1min 58s, sys: 706 ms, total: 1min 58s
Wall time: 1min 59s


In [8]:
titanic.null_rigs_comparison()

31 in 10000 null RIGs greater than 0.01 threshold


In [9]:
titanic.gain_threshold = 0.02
titanic.null_rigs_comparison()

0 in 10000


## Comparison to Titanic Features

In [10]:
#api_key = os.environ['SB_Demo_API_key']
api_key = 'eyJhbGciOiJIUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICI3YzE2YmVjYy1lZjQ0LTQ2NjYtOGI4Ni0xMDVmZWQ3ZGVlOTkifQ.eyJpYXQiOjE2NDc4OTIxMTAsImp0aSI6ImI4YzEyMTA0LThhMDUtNDJlYi04NDNlLWZjMWRhMzkwODU3MCIsImlzcyI6Imh0dHBzOi8vZGVtby5zcGFya2JleW9uZC5jb20vYXV0aC9yZWFsbXMvc3BhcmtiZXlvbmQiLCJhdWQiOiJodHRwczovL2RlbW8uc3BhcmtiZXlvbmQuY29tL2F1dGgvcmVhbG1zL3NwYXJrYmV5b25kIiwic3ViIjoiMjYyMzc0NzEtYjIyZS00YzUxLWFiZjItZDBkOGI2NGI3YzFiIiwidHlwIjoiT2ZmbGluZSIsImF6cCI6ImRpc2NvdmVyeS1zZGsiLCJzZXNzaW9uX3N0YXRlIjoiZDA1ODk0M2UtNmRmZS00ODFhLWEwZDItMmQ1ODc3ZTcyYTM0Iiwic2NvcGUiOiJwcm9maWxlIGVtYWlsIG9mZmxpbmVfYWNjZXNzIn0.b-GO6G-fR69jBcj9PGK74gcDqeM5nueAv0fviQ7RVFg'
server_url = 'https://demo.sparkbeyond.com/'
client = sb.SparkBeyondClient(base_url=server_url, api_key=api_key, verify_ssl_certificate=False)

Connecting to https://demo.sparkbeyond.com/...
Connected. Server version 1.36.1


In a similar way to calculating null RIGs, I ran a pipeline with a shuffled target to observe features generated by Learn for a single null RIG permutation. By definition the features generated should be "null features" that include the process of feature search and feature ranking. 

Then using a gain threshold set to the 1/1000 null RIG from before, we can see how many features have RIG values above this threshold.

In [11]:
model = client.revision(project_name = 'Titanic_-_Survival_Prediction_peter', revision_id=30)
model.learning_settings()

LearningSettings(
    problem_definition=ProblemDefinition(target_column=ColumnParam(value='survived')),
    feature_generator_settings=FeatureGenerationSettings(gain_threshold=0.0084),
    feature_count=[1000]
)

In [12]:
df_features = model.features()
df_features

Unnamed: 0,idx,feature,Input names,Dominant survived,RIG,Score,lin. score,Support 0,Support 1,% support 0,...,Median,75th Percentile,Max,Mean,SD,Estimated Total Values,Summary is Sampled,numericFeatureName,booleanFeatureName,revision_id
0,0,numericPairs(ticket) for (c.a.) >= 33111.5,ticket,1,0.011639,0.011509,0.623454,0.0,6.0,0.0%,...,18723.0,31921.0,37671.0,17292.0,14648.0,,False,numericPairs(ticket) for (c.a.),numericPairs(ticket) for (c.a.) >= 33111.5,30
1,1,alphanumericRatio(name) >= 0.851,name,1,0.009688,0.009616,0.538697,0.0,5.0,0.0%,...,9.223372e-11,9.223372e-11,9.223372e-11,9.223372e-11,9.223372e-11,,False,alphanumericRatio(name),alphanumericRatio(name) >= 0.851,30
2,2,"slidingSubsets(name) contains (""mr"", ""william"")",name,0,0.010055,0.009613,0.535619,28.0,5.0,6.42%,...,,,,,,7567.0,False,"slidingSubsets(name) contains (""mr"", ""william"")","slidingSubsets(name) contains (""mr"", ""william"")",30


### Estimating the number of features evaluated
I estimated the number of features generated using the operational log generated from learn for a pipeline run on the original data

In [13]:
log = """19:29:27 - Learning to classify survived (revision #25). Evaluating using AUC
19:29:32 - Best feature (RIG: 0.0031) of 2 from parch is: parch == 4
19:29:32 - Best feature (RIG: 0.0036) of 3 from sibsp is: sibsp == 5
19:29:32 - Best feature (RIG: 0.005) of 2 from fare is: ceil(fare) notInRange (7.5 to 255.5)
19:29:32 - Best feature (RIG: 0.0079) of 3 from fare is: inverse(fare) inRange (0.13 to 0.14)
19:29:32 - Best feature (RIG: 0.0079) of 3 from fare is: log1p(fare) inRange (2.11 to 2.17)
19:29:32 - Best feature (RIG: 0.005) of 2 from fare is: floor(fare) notInRange (6.5 to 254.5)
19:29:32 - Best feature (RIG: 0.0058) of 15 from cabin is: extractKeys(elementCount(cabin)) == ('8', 'C', '6')
19:29:32 - Best feature (RIG: 0.0058) of 18 from cabin is: splitDigitsAndLetters(cabin) contains "68"
19:29:32 - Best feature (RIG: 0.0116) of 34 from ticket is: numericPairs(ticket) for (c.a.) >= 33111.5
19:29:32 - Best feature (RIG: 0.0087) of 46 from ticket is: min(numbers(ticket)) inRange (34,239 to 36,716)
19:29:32 - Best feature (RIG: 0.0084) of 34 from cabin is: prefixes(cabin) contains "C9"
19:29:32 - Best feature (RIG: 0.0116) of 33 from ticket is: numericPairsUSFormat(ticket) for (c.a.) >= 33111.5
19:29:32 - Best feature (RIG: 0.0087) of 239 from ticket is: sum(numbers(ticket)) inRange (34,239 to 36,716)
19:29:32 - Best feature (RIG: 0.0097) of 194 from name is: alphanumericRatio(name) >= 0.851
19:29:32 - Best feature (RIG: 0.0101) of 60 from name is: slidingSubsets(name) contains ("mr", "william")
19:29:32 - Best feature (RIG: 0.0073) of 69 from cabin is: The percent of 6 in (cabin) inRange (17.14 to 26.79)
19:29:32 - Best feature (RIG: 0.0077) of 77 from name is: stemmedWords(name) contains "katherin"
19:29:32 - Best feature (RIG: 0.0091) of 131 from name is: name contains "mr. william"""


In [14]:
operational_log_number_of_features(log)

965

In [16]:
titanic.data


Unnamed: 0,target,feature_0.1
0,0,0
1,1,0
2,0,0
3,0,0
4,0,0
...,...,...
715,0,0
716,1,1
717,0,0
718,1,0


In [17]:
def entropy(x: float):
    """Function for returning entropy"""
    if (x == 0) | (x == 1):
        return 0
    else:
        h = (-x*np.log2(x)) - ((1-x)*np.log2(1-x))
        return h

In [26]:
titanic.data['target'].mean() , entropy(titanic.data['target'].mean())


(0.36944444444444446, 0.9502445670610749)

In [22]:
values = titanic.data['target'].value_counts(normalize=True)

In [52]:
sum(h)

0.9502445670610749

In [53]:
df = titanic.data.copy()
df.head()
target = 'target'
feature = 'feature_0.1'

In [72]:
counts = pd.crosstab(df[feature], df[target], normalize='index')
counts

target,0,1
feature_0.1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.628086,0.371914
1,0.652778,0.347222


In [70]:
%%time
s_entropy = counts.loc[:, 1].apply(entropy)
s_entropy

CPU times: user 643 µs, sys: 27 µs, total: 670 µs
Wall time: 677 µs


feature_0.1
0    0.952130
1    0.931563
Name: 1, dtype: float64

In [71]:
%%time
s_entropy = counts.apply(entropy_new, axis=1)
s_entropy

CPU times: user 1.27 ms, sys: 48 µs, total: 1.32 ms
Wall time: 1.33 ms


feature_0.1
0    0.952130
1    0.931563
dtype: float64

In [61]:
%%time
s_entropy = pd.crosstab(df[feature], df[target], normalize='index').loc[:, 1].apply(entropy)
s_entropy

CPU times: user 14.6 ms, sys: 1.7 ms, total: 16.3 ms
Wall time: 15 ms


feature_0.1
0    0.952130
1    0.931563
Name: 1, dtype: float64

In [None]:
h_prior = entropy(df[target].mean())

# Probability feature is true
probability_feature_true = df[feature].mean()

# Entropy of target given feature
s_entropy = pd.crosstab(df[feature], df[target], normalize='index').loc[:, 1].apply(entropy)

# If the feature only has one outcome
if len(s_entropy) < 2:
    if s_entropy.index[0] == 0:
        h_feature_f = s_entropy[0]*(1-probability_feature_true)
        h_feature_t = 0
    else:
        h_feature_f = 0
        h_feature_t = s_entropy[1]*probability_feature_true

else:
    h_feature_t = s_entropy[1]*probability_feature_true
    h_feature_f = s_entropy[0]*(1-probability_feature_true)

return (h_prior - (h_feature_t + h_feature_f))/h_prior

In [75]:
# Target column
target = 'price'

# Previously run pipeline
proj_name = 'House_Sales_in_King_County_-_Regression_with_contexts_peter'
revision = 16

# File on which feature search has been run with a explicit partition column (`splitColumn`)
filepath = '../../../Data/KingCounty/KingCountyHouseSales_with_random_parition_column.csv.gz'

# Number of bootstrap resamples
resamples = 10

bins = []

In [76]:
proj = client.revision(proj_name, revision_id=revision)
df = pd.read_csv(filepath)

In [77]:
def get_pipeline_files(df,proj,target):
    
    # Get the features
    df_features = proj.features()
    number_of_features = df_features.shape[0]
    
    ######## NEED TO CHANGE THIS FOR EACH PROJECT IF NON-DEFAULT VALUES ARE USED #########
    
    # Change the default settings to match those from the UI
    train = client.upload_dataframe(df[df.splitColumn == 'Train'], 
                                    project_name = proj.project_name,
                                    target_path='train.tsv.gz',
                                    overwrite = True)
    train.source.settings.format.use_escaping = True

    test = client.upload_dataframe(df[df.splitColumn == 'Test'], 
                                   project_name = proj.project_name,
                                   target_path = 'test.tsv.gz', 
                                   overwrite = True)
    test.source.settings.format.use_escaping = True

    ######################################################################################
    # Enriched the Train data
    
    Enrich_params = sb.EnrichParams(inputs = [train], 
                                    include_originals = True,
                                    enforce_boolean_numeric = True)
    
    df_enriched_train = proj.enrich(Enrich_params).results_dataframe()

    # Enrich the Test data
    Enrich_params = sb.EnrichParams(inputs = [test], 
                                    include_originals = True,
                                    enforce_boolean_numeric = True)
    
    df_enriched_test = proj.enrich(Enrich_params).results_dataframe()

    # Clean the enriched data to get just the features
    df_train_clean = df_enriched_train[df_enriched_train.columns[-number_of_features:].tolist() + [target]]
    df_test_clean = df_enriched_test[df_enriched_test.columns[-number_of_features:].tolist() + [target]]
    
    return {'df_features' : df_features, 'df_train_clean': df_train_clean, 'df_test_clean' : df_test_clean}


In [78]:
# Enrich the data and get the required dataframes
pipeline_dict = get_pipeline_files(df,proj,target)

HBox(children=(IntProgress(value=0, description='Hashing dataframe contents', max=17252, style=ProgressStyle(d…


Target filename with dataframe hash is train-c4248339c0d5dce86bd69ff5f392ec26.tsv.gz
Writing dataframe to temp file /var/folders/z4/ymtx_b595ngcqjv1bwwv9qjh0000gn/T/trainkknfnmn2.tsv.gz


HBox(children=(IntProgress(value=0, description='Uploading', max=492481, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Hashing dataframe contents', max=4358, style=ProgressStyle(de…


Target filename with dataframe hash is test-6bbd5d8cbf2678a1397a639f2bd89ac8.tsv.gz
Writing dataframe to temp file /var/folders/z4/ymtx_b595ngcqjv1bwwv9qjh0000gn/T/testlp1tkz8c.tsv.gz


HBox(children=(IntProgress(value=0, description='Uploading', max=127387, style=ProgressStyle(description_width…


Job running, started at 2022-03-23 18:39:43.461000
Finished building new contexts


HBox(children=(IntProgress(value=1, bar_style='info', description='Rows processed', max=1, style=ProgressStyle…


Job completed: EnrichPredictJobResult(report_filenames=[])


HBox(children=(IntProgress(value=1, bar_style='info', description='Downloading to /var/folders/z4/ymtx_b595ngc…


Job running, started at 2022-03-23 18:40:19.721000
Finished building new contexts


HBox(children=(IntProgress(value=1, bar_style='info', description='Rows processed', max=1, style=ProgressStyle…


Job completed: EnrichPredictJobResult(report_filenames=[])


HBox(children=(IntProgress(value=1, bar_style='info', description='Downloading to /var/folders/z4/ymtx_b595ngc…




In [79]:
# Make copies of the dataframes
df_train_clean = pipeline_dict['df_train_clean'].copy()
df_test_clean = pipeline_dict['df_test_clean'].copy()
df_features = pipeline_dict['df_features'].copy()

In [81]:
import re

def get_regression_bins(df: pd.core.frame.DataFrame):
    '''Function to get the regression bins from a DP feature dataframe
    
    Inputs
    ----------
    df    - SparkBeyond features dataframe (client.revision().features())
    
    Returns
    ---------
    bins  - A list of the bin values '''
    
    # Extract the lift columns
    columns = df.columns[df.columns.str.contains('Lift')]
    
    # Take the first number from each column
    bins = [int(re.search(pattern="\d+", string=column).group()) for column in columns
                                 if re.search(pattern="\d+", string=column)]

    # Append the last number from the last column
    bins.append(int(re.search(string = columns[-1], pattern = "\d+$").group()))
    
    return bins

In [82]:
feature =  df_train_clean.columns[0]
print(feature)

binary_target = df[target].nunique() == 2

if binary_target:
    # check RIG calculation for first feature (slight discrepancy - will need to check)
    Y_train = df_train_clean[target]
    Y_test = df_test_clean[target]

else:
    bins = get_regression_bins(df_features)
    print(bins)
    Y_train = pd.cut(df_train_clean[target], bins = bins)
    Y_test = pd.cut(df_test_clean[target], bins = bins)


latitude(latLong) < 47.531
[75000, 278000, 360000, 450000, 565500, 760000, 5570000]


In [83]:
mask = df_features.feature == feature
print('SB Train RIG \t\t= {}'.format(df_features.loc[mask, 'RIG'].values[0]))

SB Train RIG 		= 0.11051776976058203


In [85]:
df_train_clean[feature].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: latitude(latLong) < 47.531, dtype: float64

In [86]:
Y_train.head()

0    (760000, 5570000]
1    (760000, 5570000]
2    (760000, 5570000]
3    (760000, 5570000]
4    (760000, 5570000]
Name: price, dtype: category
Categories (6, interval[int64]): [(75000, 278000] < (278000, 360000] < (360000, 450000] < (450000, 565500] < (565500, 760000] < (760000, 5570000]]

In [124]:

def entropy_new(values):
    h = []
    for value in values:
        if (value == 0) | (value == 1):
            h.append(0)
        else:
            h.append(-value*np.log2(value))
    return sum(h)

def rig(x, y):
    # Prior Entropy
    prior_counts = pd.value_counts(y, normalize=True).to_frame()
    pe = prior_counts.apply(entropy_new).values[0]

    counts = pd.crosstab(x, y, normalize='index')
    f_weights = pd.value_counts(x, normalize=True)

    f_entropy = counts.apply(entropy_new, axis=1)

    # Conditional entropy
    ce = sum(f_weights*f_entropy)

    RIG = (pe-ce)/pe
    return RIG

In [126]:
%%time
rig(x = df_train_clean[feature], y = Y_train)

CPU times: user 37.9 ms, sys: 3.98 ms, total: 41.8 ms
Wall time: 39 ms


0.1104997505700234

In [127]:
df = titanic.data.copy()
df.head()
target = 'target'
feature = 'feature_0.1'

In [129]:
%%time
rig(df[feature], df[target])

CPU times: user 20 ms, sys: 2.72 ms, total: 22.7 ms
Wall time: 20.4 ms


0.00018019587939709127

In [130]:
print('Calculated Train RIG \t= {}'.format(str(rig(df_train_clean[feature],Y_train))))
print('Calculated Test RIG \t= {}'.format(str(rig(df_test_clean[feature], Y_test))))

KeyError: 'feature_0.1'