# Final Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Dataset craetion

In [2]:
df = pd.read_feather('removed_duplicated_columns.feather')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 726288 entries, 0 to 726287
Columns: 767 entries, HKTLMYY to TLJYWBE
dtypes: float64(742), int64(21), object(4)
memory usage: 4.2+ GB


In [3]:
target = 'TLJYWBE'
df[target] = df[target].apply(lambda v: v >= 1e-5)
df[target].value_counts()

TLJYWBE
False    726224
True         64
Name: count, dtype: int64

In [4]:
features = [c for c in df.columns if c != target]
training_features, test_features, train_target, test_target = train_test_split(df[features], df[target], test_size=0.3, stratify=df[target])

In [5]:
print(f'Training set shape: {training_features.shape}')
print(f'Training set target shape: {train_target.shape}')
print(f'Test set shape: {test_features.shape}')
print(f'Training set target shape: {test_target.shape}')

Training set shape: (508401, 766)
Training set target shape: (508401,)
Test set shape: (217887, 766)
Training set target shape: (217887,)


In [6]:
test_target.value_counts()

TLJYWBE
False    217868
True         19
Name: count, dtype: int64

## Modeling

In [7]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RepeatedStratifiedKFold

In [8]:
dtypes = training_features.dtypes
dtypes.value_counts()

float64    741
int64       21
object       4
Name: count, dtype: int64

In [9]:
int_features = dtypes[dtypes == 'int64'].index
float_features = dtypes[dtypes == 'float64'].index
cat_features = dtypes[dtypes == 'object'].index
len(int_features), len(float_features), len(cat_features)

(21, 741, 4)

In [10]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('int', numeric_transformer, int_features),
        ('float', numeric_transformer, float_features),
        ('cat', categorical_transformer, cat_features)
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', 'passthrough')  
                          ])

## Balanced Random Forest

In [11]:
param_grid = [
    {
        'classifier': [BalancedRandomForestClassifier(sampling_strategy='all', replacement=True)],
        'classifier__max_depth': [None, 10, 20],
        'classifier__n_estimators': [100, 150, 200],
        'classifier__min_samples_leaf': [2, 4, 8],
    }
]

In [12]:
from src.custom_metrics import precision_at_full_recall_scorer

In [13]:
%%time
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=precision_at_full_recall_scorer, n_jobs=2)
search.fit(training_features, train_target)

CPU times: user 1min 46s, sys: 14.7 s, total: 2min
Wall time: 2h 47min 14s


In [39]:
print("Best parameter (CV score=%0.5f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.00070):
{'classifier': BalancedRandomForestClassifier(max_depth=10, min_samples_leaf=4,
                               replacement=True, sampling_strategy='all'), 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 4, 'classifier__n_estimators': 100}


In [15]:
search.best_estimator_

In [50]:
pd.set_option('display.max_colwidth', 200)
cols = ['param_classifier__min_samples_leaf','param_classifier__n_estimators','param_classifier__max_depth', 'mean_test_score','std_test_score']
results = pd.DataFrame(search.cv_results_).sort_values(by='mean_test_score',ascending=False)
results.head(10)[cols]

Unnamed: 0,param_classifier__min_samples_leaf,param_classifier__n_estimators,param_classifier__max_depth,mean_test_score,std_test_score
12,4,100,10.0,0.0007,0.000726
14,4,200,10.0,0.000698,0.000391
23,4,200,20.0,0.000678,0.000479
10,2,150,10.0,0.000636,0.000592
18,2,100,20.0,0.000614,0.000562
1,2,150,,0.000612,0.000439
19,2,150,20.0,0.000608,0.000503
17,8,200,10.0,0.000599,0.00072
22,4,150,20.0,0.000593,0.000547
26,8,200,20.0,0.000583,0.00053


## Evaluating on the test set

In [19]:
from src.custom_metrics import precision_at_full_recall

In [58]:
test_predicted_probs = search.best_estimator_.predict_proba(test_features)[:,1]
precision, thd = precision_at_full_recall(test_target, test_predicted_probs, return_thd=True)
precision, thd

(0.00023905084233967868, 0.3735299422799423)

In [60]:
print(f'True positive to False positive ratio: 1:{1 / precision - 1:.0f}')
print(f'Percentage of tests that will be skipped {100 * (1 - 19 * 4182 / 217686): .0f}%')

True positive to False positive ratio: 1:4182
Percentage of tests that will be skipped  63%


Conclusion:

* Using a thd of 0.373 The classifier is able to recall all the defective tests.
* At that thd, the precision is 0.000239, or 1:4182
* The valid to defective ratio of the test set is 1:11465
* Thus by using the classifier, only 79458 samples will need to run the **TLJYWBE** test, instead of 217686
* This is a 63% reduction in the cost of the test.

# More

### RUSBoostClassifier

In [89]:
from imblearn.ensemble import RUSBoostClassifier
model = RUSBoostClassifier(n_estimators=200, algorithm='SAMME.R', random_state=0)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

In [90]:
pipeline.fit(training_features, train_target)

In [91]:
y_pred = pipeline.predict_proba(test_features)[:,1]
precision, thd = precision_at_full_recall(test_target, y_pred, return_thd=True)
precision, thd

(0.0004325948862730812, 0.14368601105519216)

In [92]:
inv_prec = 1 / precision - 1
print(f'True positive to False positive ratio: 1:{1 / precision - 1:.0f}')
print(f'Percentage of tests that will be skipped {100 * (1 - 19 * inv_prec / 217686): .0f}%')

True positive to False positive ratio: 1:2311
Percentage of tests that will be skipped  80%


Conclusion:

* Using a thd of 0.143 The classifier is able to recall all the defective tests.
* At that thd, the precision is 0.000432, or 1:2311
* The valid to defective ratio of the test set is 1:11465
* Thus by using the classifier, only 43909 samples will need to run the **TLJYWBE** test, instead of 217686
* This is a 80% reduction in the cost of the test.

### EasyEnsembleClassifier

In [86]:
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, sampling_strategy=0.2)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', eec)])
pipeline.fit(training_features, train_target)

In [87]:
y_pred = pipeline.predict_proba(test_features)[:,1]
precision, thd = precision_at_full_recall(test_target, y_pred, return_thd=True)
precision, thd

(0.0005907776499486957, 0.4220245747734167)

In [88]:
inv_prec = 1 / precision - 1
print(f'True positive to False positive ratio: 1:{1 / precision - 1:.0f}')
print(f'Percentage of tests that will be skipped {100 * (1 - 19 * inv_prec / 217686): .0f}%')

True positive to False positive ratio: 1:1692
Percentage of tests that will be skipped  85%


Conclusion:

* Using a thd of 0.422 The classifier is able to recall all the defective tests.
* At that thd, the precision is 0.000590, or 1:1692
* The valid to defective ratio of the test set is 1:11465
* Thus by using the classifier, only 32148 samples will need to run the **TLJYWBE** test, instead of 217686
* This is a 85% reduction in the cost of the test.