# Modeling

In [1]:
import pandas as pd
import numpy as np

In [2]:
import cufflinks as cf; cf.go_offline()

## Dataset

In [3]:
df = pd.read_feather('removed_duplicated_columns.feather')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 726288 entries, 0 to 726287
Columns: 767 entries, HKTLMYY to TLJYWBE
dtypes: float64(742), int64(21), object(4)
memory usage: 4.2+ GB


In [4]:
target = 'TLJYWBE'
df[target] = df[target].apply(lambda v: v >= 1e-5)
df[target].value_counts()

False    726224
True         64
Name: TLJYWBE, dtype: int64

In [43]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(sampling_strategy=0.001)  
X_resampled, _ = rus.fit_resample(df, df[target])

In [44]:
X_resampled[target].value_counts()

False    64000
True        64
Name: TLJYWBE, dtype: int64

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
features = [c for c in df.columns if c != target]
training_features, test_features, train_target, test_target = train_test_split(X_resampled[features], X_resampled[target], test_size=0.25, stratify=X_resampled[target])

In [47]:
print(f'Training set shape: {training_features.shape}')
print(f'Training set target shape: {train_target.shape}')
print(f'Test set shape: {test_features.shape}')
print(f'Training set target shape: {test_target.shape}')

Training set shape: (48048, 766)
Training set target shape: (48048,)
Test set shape: (16016, 766)
Training set target shape: (16016,)


In [48]:
test_target.value_counts()

False    16000
True        16
Name: TLJYWBE, dtype: int64

## Modeling

In [49]:
dtypes = df.dtypes
dtypes.head()

HKTLMYY      int64
IJEXXXL    float64
KVNLYTZ    float64
ADOIOYN    float64
GPCKISJ    float64
dtype: object

In [50]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [51]:
int_features = dtypes[dtypes == 'int64'].index
float_features = dtypes[dtypes == 'float64'].index
cat_features = dtypes[dtypes == 'object'].index
len(int_features), len(float_features), len(cat_features)

(21, 741, 4)

In [52]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('int', numeric_transformer, int_features),
        ('float', numeric_transformer, float_features),
        ('cat', categorical_transformer, cat_features)
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', 'passthrough')  
                          ])

In [54]:
param_grid = [
    {
        'classifier': [RidgeClassifier()],
        'classifier__alpha': [100, 150, 200],  
        'classifier__class_weight': [{0: 1, 1: w} for w in [75, 100, 150, 200]]
    },
    {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__class_weight': [{0: 1, 1: w} for w in [75, 100, 150, 200]]  # Updated this line
    }
]

In [55]:
%%time
search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=2)
search.fit(training_features, train_target)

CPU times: user 29.7 s, sys: 4.52 s, total: 34.2 s
Wall time: 1h 48min 11s


In [56]:
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.204):
{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,
                       min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 75}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}


In [64]:
import pandas as pd
pd.set_option('display.max_colwidth', 400)
results_df = pd.DataFrame(search.cv_results_)
top_results = results_df.sort_values(by='mean_test_score', ascending=False)
top_results[['params', 'mean_test_score', 'std_test_score']].head(10)

Unnamed: 0,params,mean_test_score,std_test_score
37,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 75}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}",0.203785,0.070294
90,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 150}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}",0.170452,0.108905
36,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 75}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}",0.168571,0.106972
8,"{'classifier': RidgeClassifier(), 'classifier__alpha': 200, 'classifier__class_weight': {0: 1, 1: 75}}",0.168087,0.06925
45,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 100}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}",0.159341,0.105847
33,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 75}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}",0.159341,0.105847
32,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 75}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}",0.159341,0.105847
27,"{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 75}, min_samples_leaf=4,  min_samples_split=10), 'classifier__class_weight': {0: 1, 1: 75}, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}",0.157436,0.106212
4,"{'classifier': RidgeClassifier(), 'classifier__alpha': 150, 'classifier__class_weight': {0: 1, 1: 75}}",0.153891,0.06752
0,"{'classifier': RidgeClassifier(), 'classifier__alpha': 100, 'classifier__class_weight': {0: 1, 1: 75}}",0.151519,0.063387


## Refiend search

In [None]:
import lightgbm as lgb

In [None]:
param_grid = [
    {
        'classifier': [RandomForestClassifier(), lgb.LGBMClassifier()],
        'classifier__n_estimators': [50, 75, 100, 125],
        'classifier__min_samples_split': [8, 10, 15, 20],
        'classifier__min_samples_leaf': [4, 6, 8],
        'classifier__class_weight': [{0: 1, 1: w} for w in [50, 75, 100, 150]] 
    }
]

In [62]:
search.best_estimator_

In [60]:
from sklearn.metrics import f1_score, precision_score
best_estimator = search.best_estimator_
best_estimator.fit(training_features, train_target)
test_predictions = best_estimator.predict(test_features)
f1 = f1_score(test_target, test_predictions)
precision = precision_score(test_target, test_predictions)
print(f"F1 Score on Test Set: {f1:.3f}")
print(f"Precision on Test Set: {precision:.3f}")

F1 Score on Test Set: 0.000
Precision on Test Set: 0.000



Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



## Feature importance

In [46]:
feature_names = training_features.columns.tolist()
coefficients = best_estimator.named_steps['classifier'].coef_[0]
feature_importance = dict(zip(feature_names, coefficients))
sorted_features = sorted(feature_importance.items(), key=lambda x: abs(x[1]), reverse=True)

for i, (feature, coef) in enumerate(sorted_features):
    print(f"Feature {i}: {feature}, Coefficient: {coef:.4f}")

Feature 0: OKIKPOJ, Coefficient: -0.2588
Feature 1: AEXRRBM, Coefficient: 0.2236
Feature 2: SHEGNUT, Coefficient: 0.1744
Feature 3: FPGUGZC, Coefficient: -0.1654
Feature 4: FFJOGRA, Coefficient: 0.1638
Feature 5: ALVJYOA, Coefficient: 0.1572
Feature 6: FBDGIUT, Coefficient: 0.1496
Feature 7: IJTGMGJ, Coefficient: -0.1423
Feature 8: MUEKGHC, Coefficient: -0.1341
Feature 9: TXEIVQY, Coefficient: -0.1325
Feature 10: OULTOYT, Coefficient: -0.1296
Feature 11: LWFSOBB, Coefficient: -0.1288
Feature 12: GYXMTQR, Coefficient: -0.1274
Feature 13: DVSZBLN, Coefficient: 0.1250
Feature 14: CWWUCQG, Coefficient: 0.1184
Feature 15: FTFSHCI, Coefficient: -0.1149
Feature 16: QNIBUZZ, Coefficient: 0.1137
Feature 17: SRZFXSA, Coefficient: 0.1058
Feature 18: KMGNWGY, Coefficient: -0.1029
Feature 19: AHBBVON, Coefficient: -0.0987
Feature 20: TNLTITB, Coefficient: 0.0985
Feature 21: ZPBDTHO, Coefficient: 0.0971
Feature 22: RRMMRAJ, Coefficient: 0.0920
Feature 23: WVRSHBW, Coefficient: -0.0919
Feature 24: FB

## RFE approach

In [19]:
from sklearn.feature_selection import RFE

In [None]:
# Lists to store results
num_features = []
performances = []

for i in range(1, X_train.shape[1] + 1):
    rfe = RFE(model, i)
    fit = rfe.fit(X_train, y_train)
    
    pred = fit.predict(X_test[:,fit.support_])
    f1 = f1_score(test_target, test_predictions)
    num_features.append(i)
    performances.append(f1)RidgeClassifier