## Imports

In [1]:
import sys
import pandas as pd
import numpy as np
import pickle

# Import my libs
sys.path.append('../')

from utils.logger import get_logger
from src.features.balance import BalanceMixin
from src.features.correlation import HighCorrelation_filter
from src.features.variance import NearZeroVar_filter, LowVar_Filter

### Load the modelling tuple (`modelisationTuple`)

In [2]:
# Load the pickled modelling tuple; later cells read M.train_X and M.train_y from it.
# NOTE: pickle.load can execute arbitrary code, so only use it on files produced by
# this project's own pipeline, never on an untrusted file.
with open('../data/processed/modelling_tuple', 'rb') as f:
    M = pickle.load(f)

## SmokeTest

#### Logger

In [3]:
# Build a project logger named "test" for the smoke test below.
logger = get_logger("test")

In [4]:
# Emit one message per severity level to see which levels the logger lets through.
# NOTE(review): the recorded output of this cell shows DEBUG, INFO, ERROR and CRITICAL
# but no WARNING line — check the handler/filter configuration in utils.logger.
logger.debug("Ceci est un message de débogage.")
logger.info("Ceci est un message d'information.")
logger.warning("Ceci est un message d'avertissement.")
logger.error("Ceci est un message d'erreur.")
logger.critical("Ceci est un message critique.")

2023-10-20 15:50:26,437 [test] DEBUG    Ceci est un message de débogage.
2023-10-20 15:50:26,439 [test] INFO     Ceci est un message d'information.
2023-10-20 15:50:26,443 [test] ERROR    Ceci est un message d'erreur.
2023-10-20 15:50:26,444 [test] CRITICAL Ceci est un message critique.


#### BalanceMixin

In [5]:
# Number of rows in the training feature matrix.
n_train_rows = M.train_X.shape[0]
print("Taille du jeu de données d'entrainement : ", n_train_rows)

Taille du jeu de données d'entrainement :  217877


In [6]:
# Class prevalence of the training target: share (in %) and raw count per class.
y = M.train_y
class_share = y.value_counts(normalize=True, ascending=False).rename('normalized')
class_count = y.value_counts(normalize=False, ascending=False).rename('number')
out = pd.concat([np.round(class_share, 4) * 100, class_count], axis=1)
out

Unnamed: 0,normalized,number
0.0,99.82,217485
1.0,0.18,392


In [7]:
# Instantiate the mixin on its own so its `_balance` helper can be tried in isolation.
balance = BalanceMixin()

In [8]:
# List the attributes available on the instance (quick look at what the mixin provides).
dir(balance)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_balance']

In [9]:
# Show the signature and docstring of the undersampling helper.
help(balance._balance)

Help on method _balance in module src.features.balance:

_balance(X: pandas.core.frame.DataFrame, y: numpy.ndarray) -> Tuple[pandas.core.frame.DataFrame, numpy.ndarray] method of src.features.balance.BalanceMixin instance
    Rebalance an X dataframe by undersampling the majority class
    
    Args:
        X (pd.DataFrame): The dataframe to be downsampled with respect to y.
        y (np.array): The one-hot target vector to use to downsample X.
    
    Returns:
        pd.DataFrame, np.ndarray: Downsampled X and y.



In [10]:
# Undersample the majority class, then recompute the class prevalence on the result.
x, y = balance._balance(M.train_X, M.train_y)
y = pd.Series(y)

balanced_share = y.value_counts(normalize=True, ascending=False).rename('normalized')
balanced_count = y.value_counts(normalize=False, ascending=False).rename('number')
out = pd.concat([np.round(balanced_share, 4) * 100, balanced_count], axis=1)
out

Unnamed: 0,normalized,number
0.0,50.0,392
1.0,50.0,392


#### Corrélation

In [11]:
# To exercise the correlation filter, add a column that is an exact multiple of V1.
# `assign` returns a new frame, so the synthetic column never leaks into M.train_X;
# a plain `x = M.train_X` only aliases the frame, and writing `x['test2']` would
# permanently mutate the loaded training data (making this cell non-idempotent).
x = M.train_X.assign(test2=2 * M.train_X['V1'])

In [12]:
# Correlation filter under test, configured with a 0.95 threshold and equisampling on.
corr_filter = HighCorrelation_filter(
    threshold=0.95,
    equisample=True,
)

In [13]:
# List the attributes available on the correlation filter instance.
dir(corr_filter)

['GROUP_NAME',
 '_HEADER',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_balance',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_cols',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_removed',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sklearn_auto_wrap_output_keys',
 '_validate_data',
 '_validate_params',
 'equisample',
 'fit',
 'fit_transform',
 'get_feature_names_out',
 'get_metadata_routing',
 'get_params',
 'get_report_group',
 'group_name',
 'set_output',

In [14]:
# Show the signature and docstring of the filter's fit method.
help(corr_filter.fit)

Help on method fit in module src.features.correlation:

fit(X: pandas.core.frame.DataFrame, y: numpy.ndarray = None) method of src.features.correlation.HighCorrelation_filter instance
    Extract the features that we will keep.
    
    Args:
        X (pd.DataFrame): the dataframe to remove the too correlated features form
        y (np.ndarray): The target vector.



In [15]:
# Fit on the frame holding the synthetic `test2` column and display which columns
# the filter decided to remove (expected: the duplicate-information column).
# NOTE(review): no `y` is passed although the filter was built with equisample=True —
# confirm that fit() does not need the target in that mode.
corr_filter = corr_filter.fit(X = x)
corr_filter._removed

['test2']

#### Variance

In [16]:
# To exercise the variance filters, add a constant (zero-variance) column.
# `assign` builds a new frame instead of writing the column into M.train_X itself;
# a plain `x = M.train_X` only aliases the frame, so `x['test'] = 1.` would
# permanently mutate the loaded training data.
x = M.train_X.assign(test=1.)

##### NearZeroVar_filter

In [17]:
# Near-zero-variance filter under test: frequency ratio 95/5, unique cut 0.05,
# equisampling on.
var_filter = NearZeroVar_filter(
    frequency_ratio=95 / 5,
    unique_cut=0.05,
    equisample=True,
)

In [18]:
# List the attributes available on the near-zero-variance filter instance.
dir(var_filter)

['GROUP_NAME',
 '_HEADER',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_balance',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_cols',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_removed',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sklearn_auto_wrap_output_keys',
 '_validate_data',
 '_validate_params',
 'equisample',
 'fit',
 'fit_transform',
 'frequency_ratio',
 'get_feature_names_out',
 'get_metadata_routing',
 'get_params',
 'get_report_group',
 'group_n

In [19]:
# Show the signature and docstring of the filter's fit method.
help(var_filter.fit)

Help on method fit in module src.features.variance:

fit(X: pandas.core.frame.DataFrame, y: numpy.ndarray = None) method of src.features.variance.NearZeroVar_filter instance
    Extract the features that we will keep
    
    Args:
        X (pd.DataFrame): the dataframe
        y (np.ndarray): The target vector



In [20]:
# Fit on the frame holding the constant `test` column and display the removed columns.
# NOTE(review): no `y` is passed although the filter was built with equisample=True —
# confirm that fit() does not need the target in that mode.
var_filter = var_filter.fit(X = x)
var_filter._removed

['test']

##### LowVar_Filter

In [21]:
# Low-variance filter under test, configured with a 0.01 threshold and equisampling on.
low_var_filter = LowVar_Filter(
    threshold=0.01,
    equisample=True,
)

In [22]:
# List the attributes available on the low-variance filter instance.
dir(low_var_filter)

['GROUP_NAME',
 '_HEADER',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_balance',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_cols',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_removed',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sklearn_auto_wrap_output_keys',
 '_validate_data',
 '_validate_params',
 'equisample',
 'fit',
 'fit_transform',
 'get_feature_names_out',
 'get_metadata_routing',
 'get_params',
 'get_report_group',
 'group_name',
 'set_output',

In [23]:
# Show the signature and docstring of the filter's fit method.
help(low_var_filter.fit)

Help on method fit in module src.features.variance:

fit(X: pandas.core.frame.DataFrame, y=None) method of src.features.variance.LowVar_Filter instance
    Extract the features that we will keep
    
    Args:
        X (pd.DataFrame): The DataFrame to remove columns from.



In [24]:
# Fit on the frame holding the constant `test` column and display the removed columns.
# NOTE(review): no `y` is passed although the filter was built with equisample=True —
# confirm that fit() does not need the target in that mode.
low_var_filter = low_var_filter.fit(X = x)
low_var_filter._removed

['test']