In [1]:
%load_ext autoreload
%autoreload 2 

In [2]:
import numpy as np
import pandas as pd
import re 
import pytest
import sys
import re
np.set_printoptions(suppress=True)
pd.options.display.float_format = "{:,.2f}".format
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
import best_subset as bs_pkgs


In [331]:

def process_exaustive_and_return_top(df, features, return_top=100):
    """Keep models that contain every feature, then the best ``return_top`` per model size.

    Args:
        df: DataFrame with a 'Models' column (space-separated feature names),
            a 'Var Number' column and a 'Scores' column.
        features: Feature names that must all appear, as whole words, in 'Models'.
        return_top: Maximum rank (highest 'Scores' = rank 1) kept within each
            'Var Number' group.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``df`` itself is not modified.
    """
    mask = df['Models'].notna()  # rows without a model string can never match
    for feature in features:
        # Escape the name so characters such as '.' or '+' are matched literally.
        mask &= df['Models'].str.contains(rf'\b{re.escape(feature)}\b', regex=True, na=False)
    selected = df[mask]

    # Rank inside each model size; kept as a separate Series so no temporary
    # column has to be added to (and dropped from) the filtered frame.
    rank_in_size = selected.groupby("Var Number")['Scores'].rank(ascending=False)
    return selected[rank_in_size <= return_top].reset_index(drop=True)


def check_if_features_in(df, features):
    """Return the rows of ``df`` whose 'Models' string contains every feature as a whole word.

    NOTE: a later definition of ``check_if_features_in`` in this same cell
    (the one with '*' wildcard support) replaces this one at runtime.

    Args:
        df: DataFrame with a 'Models' column of space-separated feature names.
        features: Feature names that must all be present.

    Returns:
        The matching rows of ``df`` (original index kept).
    """
    mask = df['Models'].notna()  # rows without a model string can never match
    for feature in features:
        # Escape the name so regex metacharacters in it are matched literally.
        mask &= df['Models'].str.contains(rf'\b{re.escape(feature)}\b', regex=True, na=False)
    return df[mask]

def compare_dataframes(df1: pd.DataFrame, df2: pd.DataFrame):
    """Assert that two DataFrames are equal once float columns are rounded to 2 decimals.

    The columns to round are the float columns of ``df1``; the same column
    labels are rounded in a copy of ``df2``. Dtypes are not compared.
    Prints "Dataset Match" on success; raises ``AssertionError`` otherwise.
    Neither input is modified.
    """
    float_columns = df1.select_dtypes(include=['float']).columns

    rounded_frames = []
    for frame in (df1, df2):
        snapshot = frame.copy()  # work on a copy so the caller's data stays intact
        snapshot[float_columns] = snapshot[float_columns].round(2)
        rounded_frames.append(snapshot)

    left, right = rounded_frames
    pd.testing.assert_frame_equal(left, right, check_dtype=False)
    print("Dataset Match")

def order_models_field(df):
    """Sort the feature names inside each 'Models' string by their embedded number.

    Names are split on single spaces. Names containing digits come first,
    ordered by the first run of digits (so 'x2' sorts before 'x10'); names
    without any digit (e.g. 'fico') come after them in alphabetical order
    instead of raising.

    Args:
        df: DataFrame with a 'Models' column of space-separated feature names.
            Its 'Models' column is overwritten in place.

    Returns:
        ``df`` with the reordered 'Models' column and a fresh 0..n-1 index.
    """
    def _token_key(token):
        digits = re.search(r'\d+', token)
        if digits is None:
            # No number to sort by: place after all numbered names, alphabetically.
            return (1, 0, token)
        # Constant last slot keeps ties on the number in their original order.
        return (0, int(digits.group()), "")

    df['Models'] = df['Models'].apply(
        lambda model: " ".join(sorted(model.split(" "), key=_token_key))
    )
    df = df.reset_index(drop=True)
    return df

def order_models_filed_all(df):
    """Alphabetically sort the space-separated names inside each 'Models' string.

    The 'Models' column of ``df`` is overwritten in place and the same
    DataFrame object is returned.
    """
    def _alphabetize(model):
        names = model.split(" ")
        names.sort()  # plain string ordering, so 'x10' sorts before 'x2'
        return " ".join(names)

    df['Models'] = df['Models'].apply(_alphabetize)
    return df


def create_synthetic_data(seed=42, n=50000, p=15):
    """
    Creates synthetic data for an (unweighted) logistic regression.

    Args:
        seed: Random seed for reproducibility.
        n: Number of samples.
        p: Number of features (excluding the intercept). Values below 5 are
           supported; only the first ``p`` non-zero coefficients are then used.

    Returns:
        Tuple: A DataFrame of shape (n, p+1) with columns 'const' (all ones)
               and 'x1' ... 'x{p}', and a Series `y` of binary (0/1) outcomes
               drawn from a logistic model in which at most the first five
               features have non-zero coefficients.
    """
    np.random.seed(seed)

    # 1) Random features ~ N(0, 1)
    X_base = np.random.randn(n, p)

    # 2) "True" coefficients: x1..x5 carry signal, every remaining feature is 0.
    #    Built by slicing so that p < 5 still yields a length-p vector.
    signal = np.array([1.5, -2.0, 0.75, 1.25, -0.5])
    n_signal = min(p, signal.size)
    betas_true = np.zeros(p)
    betas_true[:n_signal] = signal[:n_signal]

    # 3) Linear predictor, shape (n,)
    lin_pred = X_base.dot(betas_true)

    # 4) Logistic link: p_i = 1 / (1 + exp(-lin_pred_i))
    prob = 1.0 / (1.0 + np.exp(-lin_pred))

    # 5) Binary outcomes ~ Bernoulli(prob)
    y_vals = np.random.binomial(1, prob)

    # 6) Feature frame with the intercept as first column
    df = pd.DataFrame(X_base, columns=[f"x{i+1}" for i in range(p)])
    df.insert(0, "const", 1.0)

    # 7) Target
    y = pd.Series(y_vals, name="y")

    return df, y


def create_synthetic_data_logistic(seed=42, n=50000, p=15):
    """
    Creates synthetic data for a logistic regression, including a weight column.

    Args:
        seed: Random seed for reproducibility.
        n: Number of samples.
        p: Number of features (excluding the intercept). Values below 5 are
           supported; only the first ``p`` non-zero coefficients are then used.

    Returns:
        Tuple: A DataFrame of shape (n, p+2) with columns 'const' (all ones),
               'x1' ... 'x{p}' and 'weight' (uniform on [0, 100)), and a
               Series `y` of binary (0/1) outcomes drawn from a logistic model
               in which at most the first five features have non-zero
               coefficients. The weights do not influence `y`.
    """
    np.random.seed(seed)

    # 1) Random features ~ N(0, 1)
    X_base = np.random.randn(n, p)

    # 2) "True" coefficients: x1..x5 carry signal, every remaining feature is 0.
    #    Built by slicing so that p < 5 still yields a length-p vector.
    signal = np.array([1.5, -2.0, 0.75, 1.25, -0.5])
    n_signal = min(p, signal.size)
    betas_true = np.zeros(p)
    betas_true[:n_signal] = signal[:n_signal]

    # 3) Linear predictor, shape (n,)
    lin_pred = X_base.dot(betas_true)

    # 4) Logistic link: p_i = 1 / (1 + exp(-lin_pred_i))
    prob = 1.0 / (1.0 + np.exp(-lin_pred))

    # 5) Binary outcomes ~ Bernoulli(prob)
    y_vals = np.random.binomial(1, prob)

    # 6) Feature frame with the intercept as first column
    df = pd.DataFrame(X_base, columns=[f"x{i+1}" for i in range(p)])
    df.insert(0, "const", 1.0)

    # 7) Weights are drawn after the outcomes, so the random stream used for
    #    X and y is the same as in the unweighted generator.
    df['weight'] = np.random.uniform(0, 100, size=n)

    # 8) Target
    y = pd.Series(y_vals, name="y")

    return df, y


def create_synthetic_data_linear_regression(seed=42, n=50000, p=15, sigma=1.0):
    """
    Creates synthetic data for linear regression.

    Args:
        seed: Random seed for reproducibility.
        n: Number of samples.
        p: Number of features (excluding the intercept). Values below 5 are
           supported; only the first ``p`` non-zero slopes are then used.
        sigma: Standard deviation of the error term.

    Returns:
        Tuple: A DataFrame `df` containing the features (including 'const' and 'weight')
               and a Series `y` representing the target variable.
    """
    np.random.seed(seed)

    # 1) Random features ~ N(0, 1)
    X_base = np.random.randn(n, p)

    # 2) "True" parameters: intercept 2.0, non-zero slopes on x1..x5, 0 elsewhere.
    #    Built by slicing so that p < 5 still yields a length-p slope vector.
    intercept = 2.0
    signal = np.array([1.5, -2.0, 0.75, 1.25, -0.5])
    n_signal = min(p, signal.size)
    slopes = np.zeros(p)
    slopes[:n_signal] = signal[:n_signal]

    # 3) Weights in [0, 100); drawn before the noise to keep the random stream order.
    weights = np.random.uniform(0, 100, size=n)

    # 4) Linear predictor computed straight from the feature matrix, shape (n,)
    lin_pred = X_base.dot(slopes) + intercept

    # 5) Target: y = linear predictor + error, error ~ N(0, sigma^2)
    y_vals = lin_pred + np.random.normal(0, sigma, size=n)

    # 6) Feature frame: intercept first, weight last
    df = pd.DataFrame(X_base, columns=[f"x{i + 1}" for i in range(p)])
    df.insert(0, "const", 1.0)
    df['weight'] = weights

    # 7) Target
    y = pd.Series(y_vals, name="y")

    return df, y


def create_synthetic_data_ordinal_logit(seed=42, n_samples=50000, n_features=15, n_classes=3, 
                                      beta_scale=1.0, class_separation=1.0):
    """
    Creates synthetic data for ordinal logistic regression.

    Args:
        seed: Random seed for reproducibility
        n_samples: Number of observations
        n_features: Number of features (excluding intercept). Values below 5
            are supported; only the first ``n_features`` non-zero effects are used.
        n_classes: Number of ordinal classes (3-5 recommended)
        beta_scale: Scale factor for coefficient magnitudes
        class_separation: Controls spread between cutpoints

    Returns:
        Tuple: (df, y) where df contains 'const', the features and 'weight',
               and y contains ordinal labels 0 .. n_classes-1
    """
    np.random.seed(seed)

    # 1. Feature matrix with an intercept column in front
    X_base = np.random.randn(n_samples, n_features)
    df = pd.DataFrame(X_base, columns=[f"x{i+1}" for i in range(n_features)])
    df.insert(0, "const", 1.0)

    # 2. True parameters
    n_cutpoints = n_classes - 1

    # Coefficients: intercept 1.0, then up to five non-zero effects, zeros after.
    # Sliced so that n_features < 5 does not raise a shape mismatch.
    effects = np.array([1.5, -2.0, 0.75, 1.25, -0.5]) * beta_scale
    n_active = min(n_features, effects.size)
    beta_true = np.zeros(n_features + 1)  # +1 for intercept
    beta_true[0] = 1.0
    beta_true[1:1 + n_active] = effects[:n_active]

    # Cutpoints (sorted for identifiability)
    theta_true = np.sort(np.random.randn(n_cutpoints) * class_separation)

    # 3. Linear predictor; df holds only 'const' + features at this point
    XB = df.values @ beta_true  # Shape (n_samples,)

    # 4. Proportional-odds model: P(y <= k) = sigmoid(theta_k - XB)
    z = theta_true[:, None] - XB  # Shape (n_cutpoints, n_samples)
    cutpoint_cdf = 1 / (1 + np.exp(-z))

    # Pad with 0 (below the first class) and 1 (after the last class), then
    # difference to obtain per-class probabilities.
    padded_cdf = np.vstack([np.zeros((1, n_samples)),
                            cutpoint_cdf,
                            np.ones((1, n_samples))])
    class_probs = np.diff(padded_cdf, axis=0).T  # Shape (n_samples, n_classes)

    # 5. Inverse-CDF sampling of the ordinal label
    u = np.random.rand(n_samples)
    cumulative = np.cumsum(class_probs, axis=1)
    # Round-off can leave the final cumulative value a hair below 1; pin it so a
    # draw above it selects the last class instead of argmax falling back to 0.
    cumulative[:, -1] = 1.0
    y = (u[:, None] < cumulative).argmax(axis=1)

    # 6. Add weights and return
    df["weight"] = np.random.uniform(0, 100, size=n_samples)
    return df, pd.Series(y, name="y")

def check_if_features_in(df, features):
    """Return the rows of ``df`` whose 'Models' string satisfies every feature spec.

    Each spec is matched case-insensitively against the whole 'Models' string.
    A '*' at either end of a spec selects the pattern:

      - ``name``   : ``name`` as a whole word
      - ``*name``  : ``name`` at the end of the string, as a whole word, or
                     directly after an underscore
      - ``name*``  : ``name`` at the start of the string, as a whole word, or
                     directly before an underscore
      - ``*name*`` : ``name`` anywhere in the string

    Args:
        df: DataFrame with a 'Models' column of space-separated feature names.
        features: Feature specs; surrounding whitespace is ignored.

    Returns:
        The matching rows of ``df`` (original index kept). Rows whose 'Models'
        value is missing never match.
    """
    mask = df['Models'].notna()  # Ensure we exclude NaN values

    for feature in features:
        feature = feature.strip()  # Remove any accidental whitespace
        # strip("*") is a no-op when there is no leading/trailing star; escaping
        # makes regex metacharacters in the name match literally.
        name = re.escape(feature.strip("*"))

        if feature.startswith("*") and feature.endswith("*"):
            pattern = rf"{name}"  # Match anywhere in the string
        elif feature.startswith("*"):
            pattern = rf"{name}$|\b{name}\b|_{name}"  # Ends with, exact match, or after underscore
        elif feature.endswith("*"):
            pattern = rf"^{name}|\b{name}\b|{name}_"  # Starts with, exact match, or before underscore
        else:
            pattern = rf"\b{name}\b"  # Exact word match

        mask &= df['Models'].str.contains(pattern, case=False, regex=True, na=False)

    return df[mask]
# Quick smoke check of the logistic generator: show the columns and shapes it returns.
if __name__ == "__main__":
    df, y = create_synthetic_data_logistic(p=15)
    print(df.columns)
    print(f"df shape: {df.shape}")
    print(f"y shape: {y.shape}")

Index(['const', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
       'x11', 'x12', 'x13', 'x14', 'x15', 'weight'],
      dtype='object')
df shape: (50000, 17)
y shape: (50000,)


# Test 1

* Compare exhaustive-search results against `best_subset` without `weights`, using forced features

## Run Exhaustive Model. Use it as Baseline

In [295]:

# Baseline for Test 1: run the exhaustive search (unweighted logistic) over all
# feature columns, with four of them renamed so that several names contain "fico".
df, y = create_synthetic_data_logistic(p=15)
df = df.rename(columns={"x3": "fico", "x4": "fico_lt", "x5": "fico_gt", "x15": "gt_fico"})

# Everything except the intercept and the weight column is a candidate.
candidates = [col for col in df.columns if col not in ("const", "weight")]

results = bs_pkgs.best_subset_exhaustive(df, y, candidates, method='logistic')

Finished Var Family: 1
Finished Var Family: 2
Finished Var Family: 3
Finished Var Family: 4
Finished Var Family: 5
Finished Var Family: 6
Finished Var Family: 7
Finished Var Family: 8
Finished Var Family: 9
Finished Var Family: 10
Finished Var Family: 11
Finished Var Family: 12
Finished Var Family: 13
Finished Var Family: 14
Finished Var Family: 15
Total Models: 32767


# Filter Exhaustive

* Keep only the models that contain every forced feature. For example, `['x1', 'x2', '*fico*']` requires x1, x2 and at least one feature whose name contains `fico`.
* Among those, return the top 100 models (highest score) within each model size (`Var Number`).

In [298]:
# Feature specs every retained model must satisfy. The leading "*" is the
# wildcard marker interpreted by check_if_features_in; the same list is passed
# to bs_pkgs.best_subset as forced_vars.
forced_features = ["x14", "x12", "*fico"]

# Rank cutoff applied within each 'Var Number' group.
top = 100

In [299]:

# Reference answer for Test 1: restrict the exhaustive results to models that
# satisfy the forced features, then keep the `top` highest-scoring models
# within each 'Var Number' group.
exhaustive_filter_ = order_models_filed_all(check_if_features_in(results, forced_features))

# Rank is held in its own Series, so no temporary column is added and dropped.
rank_in_size = exhaustive_filter_.groupby("Var Number")['Scores'].rank(ascending=False)
exhaustive_filter_ = exhaustive_filter_[rank_in_size <= top]

exhaustive_filter = exhaustive_filter_.reset_index(drop=True)
exhaustive_filter

Unnamed: 0,Var Number,Models,Scores
0,3,fico x12 x14,1633.51
1,3,gt_fico x12 x14,5.35
2,4,fico x12 x14 x2,12283.00
3,4,gt_fico x12 x14 x2,10715.95
4,4,fico x1 x12 x14,7685.86
...,...,...,...
911,14,fico_gt fico_lt gt_fico x1 x10 x11 x12 x13 x14...,21838.16
912,14,fico fico_gt gt_fico x1 x10 x11 x12 x13 x14 x2...,19051.80
913,14,fico fico_gt fico_lt gt_fico x10 x11 x12 x13 x...,17306.23
914,14,fico fico_gt fico_lt gt_fico x1 x10 x11 x12 x1...,12676.63


In [301]:
%%time
# Same data as the exhaustive baseline, searched with best_subset under the forced features.
df, y = create_synthetic_data_logistic(p=15)
df = df.rename(columns={"x3": "fico", "x4": "fico_lt", "x5": "fico_gt", "x15": "gt_fico"})

# Every column after the leading 'const', minus the weight column.
cands = [col for col in df.columns[1:] if col != "weight"]

print(forced_features)
res_weights, c, _ = bs_pkgs.best_subset(
    df, y, top,
    start=2, stop=15,
    candidates=cands,
    forced_vars=forced_features,
    method='logistic',
)

# Normalise name order inside 'Models' and the index so the frame is comparable
# with the filtered exhaustive results.
res_weights = order_models_filed_all(res_weights).reset_index(drop=True)
print(c)
res_weights

['x14', 'x12', '*fico']
Finished Var Family: 2  Skipped
Finished Var Family: 3
Finished Var Family: 4
Finished Var Family: 5
Finished Var Family: 6
Finished Var Family: 7
Finished Var Family: 8
Finished Var Family: 9
Finished Var Family: 10
Finished Var Family: 11
Finished Var Family: 12
Finished Var Family: 13
Finished Var Family: 14
Finished Var Family: 15
1713
CPU times: total: 828 ms
Wall time: 288 ms


Unnamed: 0,Var Number,Models,Scores
0,3,fico x12 x14,1633.51
1,3,gt_fico x12 x14,5.35
2,4,fico x12 x14 x2,12283.00
3,4,gt_fico x12 x14 x2,10715.95
4,4,fico x1 x12 x14,7685.86
...,...,...,...
911,14,fico_gt fico_lt gt_fico x1 x10 x11 x12 x13 x14...,21838.16
912,14,fico fico_gt gt_fico x1 x10 x11 x12 x13 x14 x2...,19051.80
913,14,fico fico_gt fico_lt gt_fico x10 x11 x12 x13 x...,17306.23
914,14,fico fico_gt fico_lt gt_fico x1 x10 x11 x12 x1...,12676.63


# Compare exhaustive and new algo

In [302]:
compare_dataframes(exhaustive_filter,res_weights)

Dataset Match


# Test 2

* Repeat step above with weights

In [304]:
forced_features

['x14', 'x12', '*fico']

In [335]:
# Weighted baseline for Test 2.
# NOTE: unlike Test 1, x5 (not x15) becomes 'gt_fico' here and there is no
# 'fico_gt' column. A dict literal cannot map "x5" to two names (the last entry
# wins), so only the effective mapping is listed.
# TODO(review): confirm the difference from Test 1 is intended.
df, y = create_synthetic_data_logistic(p=15)
df = df.rename(columns={"x3": "fico", "x4": "fico_lt", "x5": "gt_fico"})

# Everything except the intercept and the weight column is a candidate.
candidates = [col for col in df.columns if col not in ("const", "weight")]
print(candidates)

results = bs_pkgs.best_subset_exhaustive(
    df, y, candidates, weights=np.array(df['weight']), method='logistic'
)
 

['x1', 'x2', 'fico', 'fico_lt', 'gt_fico', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15']
Finished Var Family: 1
Finished Var Family: 2
Finished Var Family: 3
Finished Var Family: 4
Finished Var Family: 5
Finished Var Family: 6
Finished Var Family: 7
Finished Var Family: 8
Finished Var Family: 9
Finished Var Family: 10
Finished Var Family: 11
Finished Var Family: 12
Finished Var Family: 13
Finished Var Family: 14
Finished Var Family: 15
Total Models: 32767


In [336]:
# Reference answer for the weighted run: restrict the exhaustive results to the
# forced features, then keep the `top` highest-scoring models within each
# 'Var Number' group.
exhaustive_filter_ = order_models_filed_all(check_if_features_in(results, forced_features))

rank_in_size = exhaustive_filter_.groupby("Var Number")['Scores'].rank(ascending=False)
exhaustive_filter_ = exhaustive_filter_[rank_in_size <= top]

# `exhaustive_filter` (fresh index) is what gets compared later; the frame shown
# below still carries the row labels of the exhaustive results.
exhaustive_filter = exhaustive_filter_.reset_index(drop=True)
exhaustive_filter_

Unnamed: 0,Var Number,Models,Scores
350,3,fico x12 x14,83606.20
450,3,gt_fico x12 x14,31183.82
1000,4,fico x12 x14 x2,618702.95
1100,4,gt_fico x12 x14 x2,569409.97
714,4,fico x1 x12 x14,387969.05
...,...,...,...
32763,14,fico_lt gt_fico x1 x10 x11 x12 x13 x14 x15 x2 ...,1093988.77
32762,14,fico gt_fico x1 x10 x11 x12 x13 x14 x15 x2 x6 ...,959474.49
32765,14,fico fico_lt gt_fico x10 x11 x12 x13 x14 x15 x...,868804.34
32764,14,fico fico_lt gt_fico x1 x10 x11 x12 x13 x14 x1...,635646.24


In [337]:
%%time
# Weighted best_subset run for Test 2.
# NOTE: unlike Test 1, x5 (not x15) becomes 'gt_fico' here and there is no
# 'fico_gt' column. A dict literal cannot map "x5" to two names (the last entry
# wins), so only the effective mapping is listed.
# TODO(review): confirm the difference from Test 1 is intended.
df, y = create_synthetic_data_logistic(p=15)
df = df.rename(columns={"x3": "fico", "x4": "fico_lt", "x5": "gt_fico"})

# Every column after the leading 'const', minus the weight column.
cands = [col for col in df.columns[1:] if col != "weight"]
print(cands)

# `top` (rather than a literal 100) keeps this cutoff tied to the one used when
# filtering the exhaustive results.
res_weights, _, _ = bs_pkgs.best_subset(
    df, y, top,
    start=2, stop=15,
    candidates=cands,
    forced_vars=forced_features,
    weights=df['weight'],
    method='logistic',
)
res_weights = order_models_filed_all(res_weights).reset_index(drop=True)

res_weights

['x1', 'x2', 'fico', 'fico_lt', 'gt_fico', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15']
Finished Var Family: 2  Skipped
Finished Var Family: 3
Finished Var Family: 4
Finished Var Family: 5
Finished Var Family: 6
Finished Var Family: 7
Finished Var Family: 8
Finished Var Family: 9
Finished Var Family: 10
Finished Var Family: 11
Finished Var Family: 12
Finished Var Family: 13
Finished Var Family: 14
Finished Var Family: 15
CPU times: total: 875 ms
Wall time: 401 ms


Unnamed: 0,Var Number,Models,Scores
0,3,fico x12 x14,83606.20
1,3,gt_fico x12 x14,31183.82
2,4,fico x12 x14 x2,618702.95
3,4,gt_fico x12 x14 x2,569409.97
4,4,fico x1 x12 x14,387969.05
...,...,...,...
911,14,fico_lt gt_fico x1 x10 x11 x12 x13 x14 x15 x2 ...,1093988.77
912,14,fico gt_fico x1 x10 x11 x12 x13 x14 x15 x2 x6 ...,959474.49
913,14,fico fico_lt gt_fico x10 x11 x12 x13 x14 x15 x...,868804.34
914,14,fico fico_lt gt_fico x1 x10 x11 x12 x13 x14 x1...,635646.24


In [338]:
compare_dataframes(exhaustive_filter,res_weights)

Dataset Match


# Test 3: Ordinal - No Weights

In [339]:
# Ordinal baseline: exhaustive search over all feature columns, then the
# reference answer for forced features x1 and x2.
df, y = create_synthetic_data_ordinal_logit(n_features=15, n_classes=3)
cands = [col for col in df.columns if col not in ("const", "weight")]
print(cands)
results = bs_pkgs.best_subset_exhaustive(df, y, cands, method='ordinal')

# Keep models containing both forced features, then the 100 highest-scoring
# models within each 'Var Number' group.
exhaustive_filter_ = order_models_filed_all(check_if_features_in(results, ['x1', 'x2']))
rank_in_size = exhaustive_filter_.groupby("Var Number")['Scores'].rank(ascending=False)
exhaustive_filter_ = exhaustive_filter_[rank_in_size <= 100]
exhaustive_filter = exhaustive_filter_.reset_index(drop=True)

['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15']
Finished Var Family: 1
Finished Var Family: 2
Finished Var Family: 3
Finished Var Family: 4
Finished Var Family: 5
Finished Var Family: 6
Finished Var Family: 7
Finished Var Family: 8
Finished Var Family: 9
Finished Var Family: 10
Finished Var Family: 11
Finished Var Family: 12
Finished Var Family: 13
Finished Var Family: 14
Finished Var Family: 15
Total Models: 32767


In [340]:
# Ordinal best_subset run with x1 and x2 forced into every model.
df, y = create_synthetic_data_ordinal_logit(n_features=15, n_classes=3)
cands = [col for col in df.columns if col not in ("const", "weight")]

res_weights, _, _ = bs_pkgs.best_subset(
    df, y, 100,
    start=2, stop=15,
    candidates=cands,
    forced_vars=['x1', 'x2'],
    method='ordinal',
)
# Normalise name order and index for the comparison with the exhaustive results.
res_weights = order_models_filed_all(res_weights).reset_index(drop=True)

Finished Var Family: 2
Finished Var Family: 3
Finished Var Family: 4
Finished Var Family: 5
Finished Var Family: 6
Finished Var Family: 7
Finished Var Family: 8
Finished Var Family: 9
Finished Var Family: 10
Finished Var Family: 11
Finished Var Family: 12
Finished Var Family: 13
Finished Var Family: 14
Finished Var Family: 15


In [341]:
compare_dataframes(exhaustive_filter,res_weights)

Dataset Match


# Test 4: OLS - No Weights

In [342]:
# OLS baseline: exhaustive search over all feature columns, then the reference
# answer for forced features x1 and x2.
df, y = create_synthetic_data_linear_regression(p=15)
candidates = [col for col in df.columns if col not in ("const", "weight")]
results = bs_pkgs.best_subset_exhaustive(df, y, candidates, method='ols')

# Keep models containing both forced features, then the 100 highest-scoring
# models within each 'Var Number' group.
exhaustive_filter_ = order_models_filed_all(check_if_features_in(results, ['x1', 'x2']))
rank_in_size = exhaustive_filter_.groupby("Var Number")['Scores'].rank(ascending=False)
exhaustive_filter_ = exhaustive_filter_[rank_in_size <= 100]
exhaustive_filter = exhaustive_filter_.reset_index(drop=True)
exhaustive_filter.shape

Finished Var Family: 1
Finished Var Family: 2
Finished Var Family: 3
Finished Var Family: 4
Finished Var Family: 5
Finished Var Family: 6
Finished Var Family: 7
Finished Var Family: 8
Finished Var Family: 9
Finished Var Family: 10
Finished Var Family: 11
Finished Var Family: 12
Finished Var Family: 13
Finished Var Family: 14
Finished Var Family: 15
Total Models: 32767


(984, 3)

In [343]:
# OLS best_subset run with x1 and x2 forced into every model.
df, y = create_synthetic_data_linear_regression(p=15)
cands = [col for col in df.columns if col not in ("const", "weight")]

res_weights, _, _ = bs_pkgs.best_subset(
    df, y, 100,
    start=2, stop=15,
    candidates=cands,
    forced_vars=['x1', 'x2'],
    method='ols',
)
# Normalise name order and index for the comparison with the exhaustive results.
res_weights = order_models_filed_all(res_weights).reset_index(drop=True)

Finished Var Family: 2
Finished Var Family: 3
Finished Var Family: 4
Finished Var Family: 5
Finished Var Family: 6
Finished Var Family: 7
Finished Var Family: 8
Finished Var Family: 9
Finished Var Family: 10
Finished Var Family: 11
Finished Var Family: 12
Finished Var Family: 13
Finished Var Family: 14
Finished Var Family: 15




In [344]:
compare_dataframes(exhaustive_filter,res_weights)

Dataset Match


# Test 5: No forced features

In [354]:
# Baseline without forced features: exhaustive search (unweighted logistic), then
# the `top` highest-scoring models within each 'Var Number' group.
top = 100
df, y = create_synthetic_data_logistic(p=15)
df = df.rename(columns={"x3": "fico", "x4": "fico_lt", "x5": "fico_gt", "x15": "gt_fico"})

# Everything except the intercept and the weight column is a candidate.
candidates = [col for col in df.columns if col not in ("const", "weight")]
results = bs_pkgs.best_subset_exhaustive(df, y, candidates, method='logistic')

# order_models_filed_all returns the very object it was given, so
# `exhaustive_filter_` aliases `results` here. The rank is therefore kept in a
# separate Series rather than written into the frame as a temporary column,
# which would otherwise remain in `results`.
exhaustive_filter_ = order_models_filed_all(results)
rank_in_size = exhaustive_filter_.groupby("Var Number")['Scores'].rank(ascending=False)
exhaustive_filter_ = exhaustive_filter_[rank_in_size <= top]

exhaustive_filter = exhaustive_filter_.reset_index(drop=True)
exhaustive_filter

Finished Var Family: 1
Finished Var Family: 2
Finished Var Family: 3
Finished Var Family: 4
Finished Var Family: 5
Finished Var Family: 6
Finished Var Family: 7
Finished Var Family: 8
Finished Var Family: 9
Finished Var Family: 10
Finished Var Family: 11
Finished Var Family: 12
Finished Var Family: 13
Finished Var Family: 14
Finished Var Family: 15
Total Models: 32767


Unnamed: 0,Var Number,Models,Scores
1,1,x2,10709.42
0,1,x1,6055.01
3,1,fico_lt,4357.12
2,1,fico,1628.91
4,1,fico_gt,637.60
...,...,...,...
32763,14,fico_gt fico_lt gt_fico x1 x10 x11 x12 x13 x14...,21838.16
32762,14,fico fico_gt gt_fico x1 x10 x11 x12 x13 x14 x2...,19051.80
32765,14,fico fico_gt fico_lt gt_fico x10 x11 x12 x13 x...,17306.23
32764,14,fico fico_gt fico_lt gt_fico x1 x10 x11 x12 x1...,12676.63


In [355]:
%%time
# best_subset run without forced features, starting from single-variable models.
df, y = create_synthetic_data_logistic(p=15)
df = df.rename(columns={"x3": "fico", "x4": "fico_lt", "x5": "fico_gt", "x15": "gt_fico"})

# Every column after the leading 'const', minus the weight column.
cands = [col for col in df.columns[1:] if col != "weight"]
print(cands)

res_noweights, _, _ = bs_pkgs.best_subset(
    df, y, 100,
    start=1, stop=15,
    candidates=cands,
    forced_vars=[],
    method='logistic',
)
# Normalise name order and index for the comparison with the exhaustive results.
res_noweights = order_models_filed_all(res_noweights).reset_index(drop=True)

res_noweights

['x1', 'x2', 'fico', 'fico_lt', 'fico_gt', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'gt_fico']
Finished Var Family: 1
Finished Var Family: 2
Finished Var Family: 3
Finished Var Family: 4
Finished Var Family: 5
Finished Var Family: 6
Finished Var Family: 7
Finished Var Family: 8
Finished Var Family: 9
Finished Var Family: 10
Finished Var Family: 11
Finished Var Family: 12
Finished Var Family: 13
Finished Var Family: 14
Finished Var Family: 15
CPU times: total: 656 ms
Wall time: 203 ms


Unnamed: 0,Var Number,Models,Scores
0,1,x2,10709.42
1,1,x1,6055.01
2,1,fico_lt,4357.12
3,1,fico,1628.91
4,1,fico_gt,637.60
...,...,...,...
1226,14,fico_gt fico_lt gt_fico x1 x10 x11 x12 x13 x14...,21838.16
1227,14,fico fico_gt gt_fico x1 x10 x11 x12 x13 x14 x2...,19051.80
1228,14,fico fico_gt fico_lt gt_fico x10 x11 x12 x13 x...,17306.23
1229,14,fico fico_gt fico_lt gt_fico x1 x10 x11 x12 x1...,12676.63


In [358]:
compare_dataframes(exhaustive_filter,res_noweights)


Dataset Match
