# Part 1 Null RIGs using synthetic data (the features are given)
## Specifying the user tolerance for feature significance using relative information gain (RIG)


In [1]:
import sparkbeyond._api2.classes as sb
import pandas as pd
import numpy as np
import time

# For my API token
import os

# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook container to 90% of the page width.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Seed NumPy's global random state so repeated runs start from the same state
# (assumes the sampling helpers draw from NumPy's global RNG — TODO confirm).
np.random.seed(seed=42)

In [2]:
# Import the needed class from the sampling_toolbox
from sampling_toolbox import PermutationObject
from sparkbeyond_utils import operational_log_number_of_features

## Titanic
We create synthetic data for titanic based on the number of rows of the training data and the support of the minority class in the target.

In [3]:
%%time
# Synthetic stand-in for the Titanic training set: only the number of rows and
# the support of the minority class in the target are specified.
titanic = PermutationObject()
titanic.set_data_parameters(
    nrows=720,              # number of rows in the training data
    class_weights=[0.37],   # support of the minority class in the target
)
titanic.create_synthetic_data()

CPU times: user 10.8 ms, sys: 1.67 ms, total: 12.5 ms
Wall time: 11.3 ms


### Run for 1000 permutations
We run with 1000 permutations for now and look into why 1000 was chosen later.

In [4]:
%%time
# Compute the null RIGs from 1000 permutations. The printed summary reports the
# max, median and 1-in-1000 null RIG together with the current gain threshold.
titanic.calculate_null_rigs(permutations = 1000)

Max null RIG 		= 0.01392701700478339
Median null RIG 	= 0.0004783449261842156
1 in 1000 null RIG 	= 0.010355675249646657
Gain threshold 		= 0.0005
CPU times: user 14.4 s, sys: 52.2 ms, total: 14.5 s
Wall time: 14.5 s


Let's see what a RIG value equal to 0.0005 corresponds to in the null distribution.

In [5]:
# Report how many of the computed null RIGs are greater than the current gain
# threshold (0.0005 in the recorded run).
titanic.null_rigs_comparison()

439 in 1000 null RIGs greater than 0.0005 threshold


We can pick a higher threshold and see how many of the RIGs in the null distribution exceed it.

In [6]:
# Raise the gain threshold on the object, then repeat the comparison against
# the null RIGs that were already computed.
titanic.gain_threshold = 0.0087
titanic.null_rigs_comparison()

6 in 1000 null RIGs greater than 0.0087 threshold


### Run for a larger number of permutations

In [7]:
%%time
# Repeat the null RIG calculation with 10,000 permutations. This is the slow
# cell of the notebook: about 2.5 minutes of wall time in the recorded run.
titanic.calculate_null_rigs(permutations = 10_000)

Max null RIG 		= 0.014254678194993762
Median null RIG 	= 0.0004783449261842156
1 in 10000 null RIG 	= 0.01392704977090218
Gain threshold 		= 0.0087
CPU times: user 2min 24s, sys: 508 ms, total: 2min 24s
Wall time: 2min 25s


In [8]:
# Compare the 10,000 null RIGs against the gain threshold, which is still the
# 0.0087 set earlier (see the "Gain threshold" line printed by the previous cell).
titanic.null_rigs_comparison()

41 in 10000 null RIGs greater than 0.0087 threshold


Out of 10,000 RIGs from the null distribution, only 41 (about 4 in 1000) have a RIG greater than 0.0087.

## Comparison to Titanic Features

In [9]:
# Read the API key from the environment so it is never stored in the notebook;
# os.environ[...] raises KeyError if SB_Demo_API_key is not set.
api_key = os.environ['SB_Demo_API_key']
server_url = 'https://demo.sparkbeyond.com/'
# NOTE(review): verify_ssl_certificate=False presumably disables TLS certificate
# checks while the API key is sent — confirm this is really needed for the demo
# server; otherwise enable verification.
client = sb.SparkBeyondClient(base_url=server_url, api_key=api_key, verify_ssl_certificate=False)

Connecting to https://demo.sparkbeyond.com/...
Connected. Server version 1.36.1


An estimate of the number of features evaluated in DP can be obtained using the log from the operational metrics tab. I ran a pipeline with the titanic data and a shuffled target to estimate the number of features evaluated for this specific dataset.

In [10]:
# Operational-metrics log pasted from the shuffled-target Titanic run (revision #35).
# Each "Best feature (RIG: x) of N from <column>" line carries a count N; presumably
# N is the number of features evaluated for that line — TODO confirm.
log = """13:25:19 - Learning to classify survived (revision #35). Evaluating using AUC
13:25:26 - Best feature (RIG: 0.0036) of 3 from sibsp is: sibsp == 5
13:25:26 - Best feature (RIG: 0.0031) of 2 from parch is: parch == 4
13:25:26 - Best feature (RIG: 0.0079) of 3 from fare is: log1p(fare) inRange (2.11 to 2.17)
13:25:26 - Best feature (RIG: 0.005) of 2 from fare is: floor(fare) notInRange (6.5 to 254.5)
13:25:26 - Best feature (RIG: 0.005) of 2 from fare is: ceil(fare) notInRange (7.5 to 255.5)
13:25:26 - Best feature (RIG: 0.0079) of 3 from fare is: inverse(fare) inRange (0.13 to 0.14)
13:25:26 - Best feature (RIG: 0.0058) of 15 from cabin is: extractKeys(elementCount(cabin)) == ('8', 'C', '6')
13:25:26 - Best feature (RIG: 0.0058) of 18 from cabin is: splitDigitsAndLetters(cabin) contains "68"
13:25:26 - Best feature (RIG: 0.0084) of 34 from cabin is: prefixes(cabin) contains "C9"
13:25:26 - Best feature (RIG: 0.0087) of 46 from ticket is: min(numbers(ticket)) inRange (34,239 to 36,716)
13:25:26 - Best feature (RIG: 0.0116) of 34 from ticket is: numericPairs(ticket) for (c.a.) >= 33111.5
13:25:26 - Best feature (RIG: 0.0116) of 33 from ticket is: numericPairsUSFormat(ticket) for (c.a.) >= 33111.5
13:25:27 - Best feature (RIG: 0.0087) of 239 from ticket is: sum(numbers(ticket)) inRange (34,239 to 36,716)
13:25:27 - Best feature (RIG: 0.0097) of 194 from name is: alphanumericRatio(name) >= 0.851
13:25:27 - Best feature (RIG: 0.0091) of 131 from name is: name contains "mr. william"
13:25:27 - Best feature (RIG: 0.0101) of 60 from name is: slidingSubsets(name) contains ("mr", "william")
13:25:27 - Best feature (RIG: 0.0073) of 69 from cabin is: The percent of 6 in (cabin) inRange (17.14 to 26.79)
13:25:27 - Best feature (RIG: 0.0077) of 77 from name is: stemmedWords(name) contains "katherin"
13:25:28 - stage end, feature generation took 1.5 seconds to this point
13:25:29 - feature generation on 8 column subsets took 1.5 seconds
13:25:30 - Enriching training set with discovered features...
13:25:31 - Enriched training set contains 269 features
13:25:34 - Building models...
13:25:38 - Model building completed. Test-set AUC score of best algorithm: 0.47"""


In [11]:
# Estimate the number of features evaluated from the pasted log. The recorded
# result, 965, equals the sum of the "of N" counts on the "Best feature" lines,
# so the helper presumably adds those counts up — verify against its source.
operational_log_number_of_features(log)

965

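Because the target was shuffled, all of these features are null features by definition. From the permutation estimate above (41 in 10,000, i.e. about 4 in 1000), we would expect around 4 of the 965 evaluated features (965 × 0.0041 ≈ 4) to have a RIG greater than 0.0087.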

In [12]:
# Revision 35 is the shuffled-target run, so every feature it reports is a null feature.
model = client.revision(
    project_name='Titanic_-_Survival_Prediction_peter',
    revision_id=35,
)
df_features = model.features()

# Show only the features whose RIG reaches the 0.0087 threshold examined above.
meets_threshold = df_features['RIG'] >= 0.0087
df_features.loc[meets_threshold]

Unnamed: 0,idx,feature,Input names,Dominant survived,RIG,Score,lin. score,Support 0,Support 1,% support 0,...,Median,75th Percentile,Max,Mean,SD,Estimated Total Values,Summary is Sampled,numericFeatureName,booleanFeatureName,revision_id
0,0,numericPairs(ticket) for (c.a.) >= 33111.5,ticket,1,0.011639,0.011509,0.623454,0.0,6.0,0.0%,...,18723.0,31921.0,37671.0,17292.0,14648.0,,False,numericPairs(ticket) for (c.a.),numericPairs(ticket) for (c.a.) >= 33111.5,35
1,1,alphanumericRatio(name) >= 0.851,name,1,0.009688,0.009616,0.538697,0.0,5.0,0.0%,...,9.223372e-11,9.223372e-11,9.223372e-11,9.223372e-11,9.223372e-11,,False,alphanumericRatio(name),alphanumericRatio(name) >= 0.851,35
2,2,"slidingSubsets(name) contains (""mr"", ""william"")",name,0,0.010055,0.009613,0.535619,28.0,5.0,6.42%,...,,,,,,7567.0,False,"slidingSubsets(name) contains (""mr"", ""william"")","slidingSubsets(name) contains (""mr"", ""william"")",35
18,18,"sum(numbers(ticket)) inRange (34,239 to 36,716)",ticket,1,0.008748,0.005807,0.625098,2.0,9.0,0.46%,...,112379.0,347082.0,3101317.0,295130.0,651826.0,,False,"|sum(numbers(ticket)) - 35,477.5|","sum(numbers(ticket)) inRange (34,239 to 36,716)",35


As a second check, we can run a pipeline with the same data but with Simple by RIG feature selection and a gain threshold of 0.0087.

In [13]:
# Revision 37 is the second-check run; displaying its learning settings confirms
# the 0.0087 gain threshold and the Simple by RIG feature selection.
# Note: this rebinds `model`, which previously referred to revision 35.
model = client.revision(project_name = 'Titanic_-_Survival_Prediction_peter', revision_id=37)
model.learning_settings()

LearningSettings(
    problem_definition=ProblemDefinition(target_column=ColumnParam(value='survived')),
    feature_generator_settings=FeatureGenerationSettings(
        gain_threshold=0.0087,
        feature_selection_method=SimpleByRigSelection()
    )
)

Setting the gain threshold to 0.0087, using Simple by RIG feature selection and running learn returns the following.

In [14]:
# Features returned by the current `model` (revision 37, if the cells are run in
# order). No extra RIG filter is applied here; the run itself used the 0.0087
# gain threshold.
df_features = model.features()
df_features

Unnamed: 0,idx,feature,Input names,Dominant survived,RIG,Score,lin. score,Support 0,Support 1,% support 0,...,Median,75th Percentile,Max,Mean,SD,Estimated Total Values,Summary is Sampled,numericFeatureName,booleanFeatureName,revision_id
0,0,numericPairs(ticket) for (c.a.) >= 33111.5,ticket,1,0.011639,0.011639,0.623454,0.0,6.0,0.0%,...,18723.0,31921.0,37671.0,17292.0,14648.0,,False,numericPairs(ticket) for (c.a.),numericPairs(ticket) for (c.a.) >= 33111.5,37
1,1,"slidingSubsets(name) contains (""mr"", ""william"")",name,0,0.010055,0.010055,0.535619,28.0,5.0,6.42%,...,,,,,,7567.0,False,"slidingSubsets(name) contains (""mr"", ""william"")","slidingSubsets(name) contains (""mr"", ""william"")",37
2,2,alphanumericRatio(name) >= 0.851,name,1,0.009688,0.009688,0.538697,0.0,5.0,0.0%,...,9.223372e-11,9.223372e-11,9.223372e-11,9.223372e-11,9.223372e-11,,False,alphanumericRatio(name),alphanumericRatio(name) >= 0.851,37
3,3,"name contains ""mr. william""",name,0,0.009127,0.009127,0.512968,21.0,3.0,4.82%,...,,,,,,4392.0,False,"name contains ""mr. william""","name contains ""mr. william""",37
4,4,"sum(numbers(ticket)) inRange (34,239 to 36,716)",ticket,1,0.008748,0.008748,0.625098,2.0,9.0,0.46%,...,112379.0,347082.0,3101317.0,295130.0,651826.0,,False,"|sum(numbers(ticket)) - 35,477.5|","sum(numbers(ticket)) inRange (34,239 to 36,716)",37


We see 5 features with a RIG higher than 0.0087, in agreement with our estimation (4-5 features) from the permutation test.