# Final Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Dataset creation

In [3]:
# Input file for the modelling stage; its name suggests duplicates, single-valued
# columns and correlated columns were already dropped upstream (not verified here).
DATASET_PATH = 'dataset_no_duplicates_no_univalue_no_correlated_columns.feather'

df = pd.read_feather(DATASET_PATH)
# Print the frame summary (row count, column count, dtypes, memory footprint).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 709518 entries, 0 to 709517
Columns: 368 entries, HKTLMYY to TLJYWBE
dtypes: float64(347), int64(21)
memory usage: 1.9 GB


In [4]:
target = 'TLJYWBE'
# Binarise the target: a row is positive when its value reaches the 1e-5 threshold.
# The vectorised comparison yields the same boolean column as a row-wise
# `.apply(lambda v: v >= 1e-5)` (NaN compares False either way) without calling a
# Python function once per row.
df[target] = df[target] >= 1e-5
# Show the class balance after binarisation.
df[target].value_counts()

False    709454
True         64
Name: TLJYWBE, dtype: int64

In [5]:
# Every column except the target is used as a model input.
features = [c for c in df.columns if c != target]

# Stratified 70/30 split so both parts keep the same share of the rare positive class.
# random_state pins the shuffle: without it every run produces a different split (and a
# different handful of positives in the test set), so results cannot be reproduced.
training_features, test_features, train_target, test_target = train_test_split(
    df[features],
    df[target],
    test_size=0.3,
    stratify=df[target],
    random_state=1,
)

In [6]:
# Sanity-check the split: feature matrices and target vectors must have matching row counts.
print(f'Training set shape: {training_features.shape}')
print(f'Training set target shape: {train_target.shape}')
print(f'Test set shape: {test_features.shape}')
# This line reports the *test* target; the label previously said "Training" by mistake.
print(f'Test set target shape: {test_target.shape}')

Training set shape: (496662, 367)
Training set target shape: (496662,)
Test set shape: (212856, 367)
Training set target shape: (212856,)


In [7]:
# Class balance of the held-out target: shows how many positive rows the stratified
# split placed in the test set.
test_target.value_counts()

False    212837
True         19
Name: TLJYWBE, dtype: int64

## Modeling

In [24]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RepeatedStratifiedKFold

In [15]:
# Per-column dtypes of the training features; the following cell uses them to group
# columns into integer, float and categorical (object) sets for preprocessing.
# NOTE(review): the saved output of this cell lists 766 columns (4 of them object),
# while the data-loading cell reports 367 feature columns and no object dtype. The
# outputs appear to come from different kernel states -- re-run top to bottom to confirm.
dtypes = training_features.dtypes
dtypes.value_counts()

float64    741
int64       21
object       4
dtype: int64

In [16]:
# Split the column names into three groups by dtype; each group is an Index of column
# labels that the ColumnTransformer uses to route columns to a preprocessing branch.
int_features, float_features, cat_features = (
    dtypes[dtypes == kind].index for kind in ('int64', 'float64', 'object')
)
# Display the size of each group as a quick consistency check.
len(int_features), len(float_features), len(cat_features)

(21, 741, 4)

In [17]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each dtype group to its branch. Integer and float columns share the same
# numeric recipe but are registered under separate names.
preprocessor = ColumnTransformer([
    ('int', numeric_transformer, int_features),
    ('float', numeric_transformer, float_features),
    ('cat', categorical_transformer, cat_features),
])

# 'passthrough' is only a placeholder for the final step: the grid search swaps in the
# actual estimator through the 'classifier' entry of its parameter grid.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', 'passthrough'),
])

## Balanced Random Forest

In [27]:
# Search space for the balanced random forest. The 'classifier' entry replaces the
# pipeline's placeholder final step; the 'classifier__*' entries tune that estimator.
# The forest is seeded so that repeated searches give the same scores -- an unseeded
# forest draws different bootstrap/under-samples on every run.
param_grid = [
    {
        'classifier': [BalancedRandomForestClassifier(random_state=1)],
        'classifier__max_depth': [None, 10, 20],
        'classifier__n_estimators': [100, 150, 200],
        'classifier__min_samples_leaf': [2, 4, 8],
    }
]

In [28]:
from src.custom_metrics import precision_at_full_recall, precision_at_full_recall_scorer

In [None]:
%%time
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=precision_at_full_recall_scorer, n_jobs=-1)
search.fit(training_features, train_target)

In [None]:
# Report the best cross-validated score followed by the parameters that achieved it.
print(
    "Best parameter (CV score=%0.3f):" % search.best_score_,
    search.best_params_,
    sep="\n",
)

## Fitting with resampled data

In [None]:
import numpy as np

# Resampled training matrix and labels, read from .npy files (presumably written by a
# separate resampling step -- not shown in this notebook).
X_resampled, y_resampled = (
    np.load(path) for path in ('X_resampled.npy', 'y_resampled.npy')
)

# Already-transformed test matrix and its labels.
# NOTE: this rebinds `test_target`, replacing the Series produced by the earlier
# train/test split with the array stored on disk.
X_test_transformed, test_target = (
    np.load(path) for path in ('X_test_transformed.npy', 'test_target.npy')
)

# Verify shapes:
print(f"Loaded resampled training set shape: {X_resampled.shape}")
print(f"Loaded resampled target shape: {y_resampled.shape}")
print(f"Loaded transformed test set shape: {X_test_transformed.shape}")
print(f"Loaded test target shape: {test_target.shape}")


In [27]:
# Search space for the balanced random forest on the resampled data (same grid as the
# first search, redefined so this section can be run on its own). The 'classifier'
# entry fills the pipeline's placeholder final step; 'classifier__*' entries tune it.
# The forest is seeded so that repeated searches give the same scores -- an unseeded
# forest draws different bootstrap/under-samples on every run.
param_grid = [
    {
        'classifier': [BalancedRandomForestClassifier(random_state=1)],
        'classifier__max_depth': [None, 10, 20],
        'classifier__n_estimators': [100, 150, 200],
        'classifier__min_samples_leaf': [2, 4, 8],
    }
]

In [None]:
%%time
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=precision_at_full_recall_scorer, n_jobs=-1)
search.fit(X_resampled, y_resampled)

In [None]:
# Report the best cross-validated score followed by the parameters that achieved it.
print(
    "Best parameter (CV score=%0.3f):" % search.best_score_,
    search.best_params_,
    sep="\n",
)

### Test model

In [None]:
# Positive-class probabilities for the test rows (column 1 of predict_proba).
# The refitted winner is exposed as `best_estimator_` (trailing underscore);
# `search.best_estimator` does not exist and raises AttributeError.
test_predicted_probs = search.best_estimator_.predict_proba(X_test_transformed)[:, 1]

In [None]:
# Evaluate the held-out predictions with the project's custom metric.
# NOTE(review): presumably returns the precision at the threshold that still recalls
# every positive, plus that threshold because return_thd=True -- confirm against
# src.custom_metrics.
precision_at_full_recall(test_target, test_predicted_probs, return_thd=True)