# Final Modeling

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Dataset craetion

In [16]:
df = pd.read_feather('dataset_no_duplicates_no_univalue_no_correlated_columns.feather')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 709518 entries, 0 to 709517
Columns: 368 entries, HKTLMYY to TLJYWBE
dtypes: float64(347), int64(21)
memory usage: 1.9 GB


In [17]:
target = 'TLJYWBE'
df[target] = df[target].apply(lambda v: v >= 1e-5)
df[target].value_counts()

False    709454
True         64
Name: TLJYWBE, dtype: int64

In [18]:
features = [c for c in df.columns if c != target]
training_features, test_features, train_target, test_target = train_test_split(df[features], df[target], test_size=0.3, stratify=df[target])

In [19]:
print(f'Training set shape: {training_features.shape}')
print(f'Training set target shape: {train_target.shape}')
print(f'Test set shape: {test_features.shape}')
print(f'Training set target shape: {test_target.shape}')

Training set shape: (496662, 367)
Training set target shape: (496662,)
Test set shape: (212856, 367)
Training set target shape: (212856,)


In [20]:
test_target.value_counts()

False    212837
True         19
Name: TLJYWBE, dtype: int64

## Modeling

In [21]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RepeatedStratifiedKFold

In [22]:
dtypes = training_features.dtypes
dtypes.value_counts()

float64    346
int64       21
dtype: int64

In [23]:
int_features = dtypes[dtypes == 'int64'].index
float_features = dtypes[dtypes == 'float64'].index
cat_features = dtypes[dtypes == 'object'].index
len(int_features), len(float_features), len(cat_features)

(21, 346, 0)

In [24]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('int', numeric_transformer, int_features),
        ('float', numeric_transformer, float_features),
        ('cat', categorical_transformer, cat_features)
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', 'passthrough')  
                          ])

## Balanced Random Forest

In [43]:
param_grid = [
    {
        'classifier': [BalancedRandomForestClassifier(replacement=True)],
        'classifier__max_depth': [10],
        'classifier__n_estimators': [150, 200],
        'classifier__min_samples_leaf': [4],
        'classifier__sampling_strategy': [0.05, 0.01, 0.2]
    }
]

In [44]:
from src.custom_metrics import precision_at_full_recall, precision_at_full_recall_scorer

In [45]:
%%time
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=precision_at_full_recall_scorer, n_jobs=4)
search.fit(training_features, train_target)

CPU times: user 57.8 s, sys: 5.82 s, total: 1min 3s
Wall time: 12min 46s


In [50]:
print("Best parameter (CV score=%0.5f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.00171):
{'classifier': BalancedRandomForestClassifier(max_depth=10, min_samples_leaf=4,
                               n_estimators=200, replacement=True,
                               sampling_strategy=0.01), 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 4, 'classifier__n_estimators': 200, 'classifier__sampling_strategy': 0.01}


In [51]:
pd.set_option('display.max_colwidth', 200)
cols = ['param_classifier__sampling_strategy','param_classifier__n_estimators','param_classifier__max_depth', 'mean_test_score','std_test_score']
results = pd.DataFrame(search.cv_results_).sort_values(by='mean_test_score',ascending=False)
results.head(10)[cols]

Unnamed: 0,param_classifier__sampling_strategy,param_classifier__n_estimators,param_classifier__max_depth,mean_test_score,std_test_score
4,0.01,200,10,0.001715,0.001307
3,0.05,200,10,0.001665,0.001087
1,0.01,150,10,0.001539,0.00111
0,0.05,150,10,0.001461,0.000681
5,0.2,200,10,0.001434,0.000946
2,0.2,150,10,0.001269,0.000651


In [52]:
test_predicted_probs = search.best_estimator_.predict_proba(test_features)[:,1]
precision, thd = precision_at_full_recall(test_target, test_predicted_probs, return_thd=True)
precision, thd

(0.00016037138636843215, 0.002785758626516402)

In [53]:
inv_prec = 1 / precision - 1
print(f'True positive to False positive ratio: 1:{1 / precision - 1:.0f}')
print(f'Percentage of tests that will be skipped {100 * (1 - 19 * inv_prec / 217686): .0f}%')

True positive to False positive ratio: 1:6235
Percentage of tests that will be skipped  46%


## Using EasyEnsembleClassifier

In [58]:
%%time
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, sampling_strategy=0.01)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', eec)])
pipeline.fit(training_features, train_target)

CPU times: user 1min 3s, sys: 5.8 s, total: 1min 9s
Wall time: 1min 9s


In [59]:
%%time
y_pred = pipeline.predict_proba(test_features)[:,1]
precision, thd = precision_at_full_recall(test_target, y_pred, return_thd=True)
precision, thd

CPU times: user 57.4 s, sys: 28.7 s, total: 1min 26s
Wall time: 1min 26s


(0.00013130342840162262, 0.31475428639746383)

In [60]:
inv_prec = 1 / precision - 1
print(f'True positive to False positive ratio: 1:{1 / precision - 1:.0f}')
print(f'Percentage of tests that will be skipped {100 * (1 - 19 * inv_prec / 217686): .0f}%')

True positive to False positive ratio: 1:7615
Percentage of tests that will be skipped  34%


Conclusion:

* Using a thd of 0.422 The classifier is able to recall all the defective tests.
* At that thd, the precision is 0.0005907776499486957, or 1:1692
* The valid to defective ratio of the test set is 1:11465
* Thus by using the classifier, only 32148 samples will need to run the **TLJYWBE** test, instead of 217686
* This is a 85% reduction in the cost of the test.

### RUSBoostClassifier

In [None]:
from imblearn.ensemble import RUSBoostClassifier
model = RUSBoostClassifier(n_estimators=500, algorithm='SAMME.R', random_state=1)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

In [None]:
%%time
pipeline.fit(training_features, train_target)

In [None]:
%%time
y_pred = pipeline.predict_proba(test_features)[:,1]
precision, thd = precision_at_full_recall(test_target, y_pred, return_thd=True)
precision, thd

In [None]:
inv_prec = 1 / precision - 1
print(f'True positive to False positive ratio: 1:{1 / precision - 1:.0f}')
print(f'Percentage of tests that will be skipped {100 * (1 - 19 * inv_prec / 217686): .0f}%')

Conclusion:

* Using a thd of 0.422 The classifier is able to recall all the defective tests.
* At that thd, the precision is 0.0005907776499486957, or 1:1692
* The valid to defective ratio of the test set is 1:11465
* Thus by using the classifier, only 32148 samples will need to run the **TLJYWBE** test, instead of 217686
* This is a 85% reduction in the cost of the test.

## Fitting with resampled data

In [69]:
import numpy as np
X_resampled = np.load('X_resampled.npy')
y_resampled = np.load('y_resampled.npy')

In [69]:
# Load the transformed test data
X_test_transformed = np.load('X_test_transformed.npy')
test_target = np.load('test_target.npy')

In [69]:
# Verify shapes:
print(f"Loaded resampled training set shape: {X_resampled.shape}")
print(f"Loaded resampled target shape: {y_resampled.shape}")
print(f"Loaded transformed test set shape: {X_test_transformed.shape}")
print(f"Loaded test target shape: {test_target.shape}")

Loaded resampled training set shape: (595940, 367)
Loaded resampled target shape: (595940,)
Loaded transformed test set shape: (212856, 367)
Loaded test target shape: (212856,)


### BalancedRandomForestClassifier

In [16]:
param_grid = [
    {
        'max_depth': [None, 10, 20],
        'n_estimators': [150, 200],
        'min_samples_leaf': [2, 4, 8],
        'classifier__sampling_strategy': [0.05, 0.01, 0.2]
    }
]

In [17]:
from src.custom_metrics import precision_at_full_recall_scorer, precision_at_full_recall

In [18]:
%%time
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
search = GridSearchCV(estimator=BalancedRandomForestClassifier(sampling_strategy='all', replacement=True), 
                      param_grid=param_grid, cv=cv, scoring=precision_at_full_recall_scorer, n_jobs=-1)
search.fit(X_resampled, y_resampled)



CPU times: user 21min 36s, sys: 25.9 s, total: 22min 2s
Wall time: 3h 42min 41s


In [19]:
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.999):
{'max_depth': None, 'min_samples_leaf': 8, 'n_estimators': 200}


In [29]:
pd.set_option('display.max_colwidth', 200)
cols = ['param_min_samples_leaf','param_n_estimators','param_max_depth', 'mean_test_score','std_test_score']
results = pd.DataFrame(search.cv_results_).sort_values(by='mean_test_score',ascending=False)
results.head(10)[cols]

Unnamed: 0,param_min_samples_leaf,param_n_estimators,param_max_depth,mean_test_score,std_test_score
8,8,200,,0.998944,0.000557
1,2,150,,0.998844,0.00042
4,4,150,,0.998794,0.000565
5,4,200,,0.998779,0.000594
7,8,150,,0.998774,0.000696
3,4,100,,0.998723,0.000657
6,8,100,,0.998708,0.000517
2,2,200,,0.998608,0.000517
0,2,100,,0.998573,0.000549
24,8,100,20.0,0.998122,0.001102


### Test model

In [21]:
test_predicted_probs = search.best_estimator_.predict_proba(X_test_transformed)[:,1]

In [44]:
precision, thd = precision_at_full_recall(test_target, test_predicted_probs, return_thd=True)
precision, thd

(0.0002911431198283788, 0.0013636363636363635)

In [50]:
print(f'True positive to False positive ratio: 1:{1 / precision - 1:.0f}')
print(f'Percentage of tests that will be skipped {100 * (1 - 19 * 3434 / 217686): .0f}%')

True positive to False positive ratio: 1:3434
Percentage of tests that will be skipped  70%


Conclusion:

* Using a thd of 0.0013 The classifier is able to recall all the defective tests.
* At that thd, the precision is 0.000239, or 1:3434
* The valid to defective ratio of the test set is 1:11465
* Thus by using the classifier, only 79458 samples will need to run the **TLJYWBE** test, instead of 217686
* This is a 70% reduction in the cost of the test.