# Part 1 Null RIGs using synthetic data (the features are given)
## Estimating the number of overfit features for a given pipeline
* Can include the p-value confidence intervals
* Treat empirically derived features as if they were pre-specified

In [1]:
import sparkbeyond._api2.classes as sb
import pandas as pd
import numpy as np
import time

# For my API token
import os

# Import from the public `IPython.display` module; importing `display` from
# `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML

# Widen the notebook container to 90% of the browser window.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Seed NumPy's legacy global RNG once, up front.
# NOTE(review): presumably the sampling toolbox draws from this global RNG, which is
# what would make the permutation results below reproducible -- confirm.
np.random.seed(seed=42)

  from cryptography.utils import int_from_bytes, int_to_bytes


In [2]:
# Build a WorldKnowledge object with the OpenStreetMap option switched on.
# NOTE(review): `test` is not referenced by any later cell visible in this notebook --
# confirm whether this cell is still needed, and give it a descriptive name if so.
test = sb.WorldKnowledge(open_street_map=True)

In [2]:
# Import the needed class from the sampling_toolbox
from sampling_toolbox import PermutationObject, operational_log_number_of_features

## Titanic
We create synthetic data for Titanic based on the number of rows in the training data and the support of the minority class in the target.

In [3]:
%%time
# Permutation helper configured to mimic the Titanic training set:
# 720 rows, with the minority target class at 37% support.
titanic = PermutationObject()
titanic.set_data_parameters(nrows=720, minority_class=0.37)

# Generate the synthetic data from the parameters set above.
titanic.create_synthetic_data()

Wall time: 13.8 ms


### Run for 1000 permutations
We run 1000 permutations for now and look into why 1000 was chosen later.

In [4]:
%%time
# Compute null RIGs over 1000 permutations; the call prints the max, median and
# 1-in-1000 null RIG together with the current gain threshold.
titanic.calculate_null_rigs(permutations=1000)

Max null RIG 		= 0.012108454756697327
Median null RIG 	= 0.0004783449261842156
1 in 1000 null RIG 	= 0.00876674555343226
Gain threshold 		= 0.0005
Wall time: 24.3 s


In [5]:
# Report how many of the computed null RIGs exceed the current gain threshold
# (still the initial 0.0005 at this point, per the output of the previous cell).
titanic.null_rigs_comparison()

441 in 1000 null RIGs greater than 0.0005 threshold


In [6]:
# Raise the gain threshold to 0.01 and repeat the comparison.
# This mutates state on `titanic`: the new threshold persists into later cells
# (the 10,000-permutation run below reports "Gain threshold = 0.01").
titanic.gain_threshold = 0.01
titanic.null_rigs_comparison()

1 in 1000 null RIGs greater than 0.01 threshold


### Run for a larger number of permutations

In [7]:
%%time
# Repeat the null RIG calculation with ten times as many permutations.
# This is slow: the recorded wall time for this cell is about 3 minutes.
titanic.calculate_null_rigs(permutations=10_000)

Max null RIG 		= 0.018037440508354128
Median null RIG 	= 0.0004783449261842156
1 in 10000 null RIG 	= 0.016612823855212518
Gain threshold 		= 0.01
Wall time: 3min 9s


In [8]:
# Count the null RIGs from the 10,000-permutation run that exceed the current
# gain threshold (0.01, set in an earlier cell).
titanic.null_rigs_comparison()

31 in 10000 null RIGs greater than 0.01 threshold


In [9]:
# Raise the gain threshold to 0.02 and repeat the comparison; at this level none of
# the 10,000 null RIGs exceed the threshold (see the cell output).
titanic.gain_threshold = 0.02
titanic.null_rigs_comparison()

0 in 10000


## Comparison to Titanic Features

In [10]:
# Read the API token from the environment so that it is never stored in the notebook
# or its version history.
# SECURITY: a token was previously hardcoded in this cell. Treat it as leaked and
# revoke/rotate it on the server.
api_key = os.environ.get('SB_Demo_API_key')
if not api_key:
    raise RuntimeError(
        "Environment variable 'SB_Demo_API_key' is not set; "
        "export your SparkBeyond API token before running this cell."
    )

server_url = 'https://demo.sparkbeyond.com/'

# NOTE(review): verify_ssl_certificate=False turns off TLS certificate verification,
# so the token is sent over a connection whose peer is not authenticated. It is kept
# here to preserve the existing behaviour -- enable verification if the server
# certificate is valid.
client = sb.SparkBeyondClient(base_url=server_url, api_key=api_key, verify_ssl_certificate=False)

Connecting to https://demo.sparkbeyond.com/...
The clock of the server differs from this machine by 18058 seconds. The server thinks the current time is Sun Jan 30 10:34:09 2022, while this client machine thinks it's Sun Jan 30 15:35:07 2022. This difference can lead to authentication errors. Please correct the clock of the server or client machines, whichever is wrong.
Connected. Server version 1.36.1


In a similar way to calculating null RIGs, I ran a pipeline with a shuffled target to observe the features generated by Learn for a single null RIG permutation. By definition, the generated features should be "null features" whose RIGs also reflect the process of feature search and feature ranking.

Then using a gain threshold set to the 1/1000 null RIG from before, we can see how many features have RIG values above this threshold.

In [11]:
# Fetch revision 30 of the Titanic project and show the learning settings it was
# run with.
# NOTE(review): presumably this is the shuffled-target run described above -- confirm.
model = client.revision(
    project_name='Titanic_-_Survival_Prediction_peter',
    revision_id=30,
)
model.learning_settings()

LearningSettings(
    problem_definition=ProblemDefinition(target_column=ColumnParam(value='survived')),
    feature_generator_settings=FeatureGenerationSettings(gain_threshold=0.0084),
    feature_count=[1000]
)

In [12]:
# Retrieve the feature table for this revision and display it; the RIG column is
# what gets compared against the null RIG threshold.
df_features = model.features()
df_features

Unnamed: 0,idx,feature,Input names,Dominant survived,RIG,Score,lin. score,Support 0,Support 1,% support 0,...,Median,75th Percentile,Max,Mean,SD,Estimated Total Values,Summary is Sampled,numericFeatureName,booleanFeatureName,revision_id
0,0,numericPairs(ticket) for (c.a.) >= 33111.5,ticket,1,0.011639,0.011509,0.623454,0.0,6.0,0.0%,...,18723.0,31921.0,37671.0,17292.0,14648.0,,False,numericPairs(ticket) for (c.a.),numericPairs(ticket) for (c.a.) >= 33111.5,30
1,1,alphanumericRatio(name) >= 0.851,name,1,0.009688,0.009616,0.538697,0.0,5.0,0.0%,...,9.223372e-11,9.223372e-11,9.223372e-11,9.223372e-11,9.223372e-11,,False,alphanumericRatio(name),alphanumericRatio(name) >= 0.851,30
2,2,"slidingSubsets(name) contains (""mr"", ""william"")",name,0,0.010055,0.009613,0.535619,28.0,5.0,6.42%,...,,,,,,7567.0,False,"slidingSubsets(name) contains (""mr"", ""william"")","slidingSubsets(name) contains (""mr"", ""william"")",30


### Estimating the number of features evaluated
I estimated the number of features generated using the operational log produced by Learn for a pipeline run on the original data.

In [13]:
# Operational log pasted from a Learn run (its first line reports revision #25).
# Each "Best feature (RIG: ...) of N from <column>" line carries a count N; the value
# 965 printed by the following cell equals the sum of these N values.
# NOTE(review): the last entry ends with `"mr. william` directly followed by the closing
# triple quote, so that entry's own closing double quote is not part of the string.
log = """19:29:27 - Learning to classify survived (revision #25). Evaluating using AUC
19:29:32 - Best feature (RIG: 0.0031) of 2 from parch is: parch == 4
19:29:32 - Best feature (RIG: 0.0036) of 3 from sibsp is: sibsp == 5
19:29:32 - Best feature (RIG: 0.005) of 2 from fare is: ceil(fare) notInRange (7.5 to 255.5)
19:29:32 - Best feature (RIG: 0.0079) of 3 from fare is: inverse(fare) inRange (0.13 to 0.14)
19:29:32 - Best feature (RIG: 0.0079) of 3 from fare is: log1p(fare) inRange (2.11 to 2.17)
19:29:32 - Best feature (RIG: 0.005) of 2 from fare is: floor(fare) notInRange (6.5 to 254.5)
19:29:32 - Best feature (RIG: 0.0058) of 15 from cabin is: extractKeys(elementCount(cabin)) == ('8', 'C', '6')
19:29:32 - Best feature (RIG: 0.0058) of 18 from cabin is: splitDigitsAndLetters(cabin) contains "68"
19:29:32 - Best feature (RIG: 0.0116) of 34 from ticket is: numericPairs(ticket) for (c.a.) >= 33111.5
19:29:32 - Best feature (RIG: 0.0087) of 46 from ticket is: min(numbers(ticket)) inRange (34,239 to 36,716)
19:29:32 - Best feature (RIG: 0.0084) of 34 from cabin is: prefixes(cabin) contains "C9"
19:29:32 - Best feature (RIG: 0.0116) of 33 from ticket is: numericPairsUSFormat(ticket) for (c.a.) >= 33111.5
19:29:32 - Best feature (RIG: 0.0087) of 239 from ticket is: sum(numbers(ticket)) inRange (34,239 to 36,716)
19:29:32 - Best feature (RIG: 0.0097) of 194 from name is: alphanumericRatio(name) >= 0.851
19:29:32 - Best feature (RIG: 0.0101) of 60 from name is: slidingSubsets(name) contains ("mr", "william")
19:29:32 - Best feature (RIG: 0.0073) of 69 from cabin is: The percent of 6 in (cabin) inRange (17.14 to 26.79)
19:29:32 - Best feature (RIG: 0.0077) of 77 from name is: stemmedWords(name) contains "katherin"
19:29:32 - Best feature (RIG: 0.0091) of 131 from name is: name contains "mr. william"""


In [14]:
# Estimate the number of features evaluated from the operational log text.
# NOTE(review): the helper's implementation is not shown here; its result (965) matches
# the sum of the "of N" counts in the log lines -- confirm that this is what it computes.
operational_log_number_of_features(log)

965