# Up & Downsampling

In [36]:
import pandas as pd
import numpy as np

## Dataset

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
# Load the dataset from a Feather file and print a summary of its index,
# column count, dtypes and memory footprint.
# NOTE(review): judging by the file name, duplicates, single-value columns and
# correlated columns were already removed upstream — confirm.
df = pd.read_feather('dataset_no_duplicates_no_univalue_no_correlated_columns.feather')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 709518 entries, 0 to 709517
Columns: 368 entries, HKTLMYY to TLJYWBE
dtypes: float64(347), int64(21)
memory usage: 1.9 GB


In [39]:
# Binarise the target column: a row is a positive case when its raw value is
# at least 1e-5. The vectorised comparison yields the same boolean Series as a
# row-wise `apply`, without one Python-level call per row.
target = 'TLJYWBE'
df[target] = df[target] >= 1e-5

# Show how many rows fall into each class.
df[target].value_counts()

TLJYWBE
False    709454
True         64
Name: count, dtype: int64

In [40]:
# Every column except the target is a model feature.
features = [c for c in df.columns if c != target]

# Hold out 30% of the rows for testing. The split is stratified on the target
# so both partitions receive a share of the positive class, and the fixed
# random_state makes it reproducible across kernel restarts.
training_features, test_features, train_target, test_target = train_test_split(
    df[features],
    df[target],
    test_size=0.3,
    stratify=df[target],
    random_state=42,
)

In [41]:
# Sanity-check the dimensions of the four partitions produced by the split.
print(f'Training set shape: {training_features.shape}')
print(f'Training set target shape: {train_target.shape}')
print(f'Test set shape: {test_features.shape}')
# This line reports the *test* target, so it is labelled accordingly.
print(f'Test set target shape: {test_target.shape}')

Training set shape: (496662, 367)
Training set target shape: (496662,)
Test set shape: (212856, 367)
Training set target shape: (212856,)


In [42]:
# Class counts in the held-out target, to see how many positive cases the
# test split contains.
test_target.value_counts()

TLJYWBE
False    212837
True         19
Name: count, dtype: int64

## SMOTE + Tomek Links

In [43]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

In [44]:
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler

### Pipeline

In [45]:
# Per-column dtypes of the training features, tallied by dtype.
# `dtypes` is kept as a variable because the column grouping below reads it.
dtypes = training_features.dtypes
dtypes.value_counts()

float64    346
int64       21
Name: count, dtype: int64

In [46]:
# Group the column names by dtype so that each group can be routed to its own
# transformer.
int_features = dtypes[dtypes == 'int64'].index
float_features = dtypes[dtypes == 'float64'].index
cat_features = dtypes[dtypes == 'object'].index

# A column whose dtype is none of the three above would belong to no group,
# and a ColumnTransformer drops unlisted columns by default. Fail loudly here
# rather than lose features silently.
assert len(int_features) + len(float_features) + len(cat_features) == len(dtypes), (
    'some columns are neither int64, float64 nor object and would not be preprocessed'
)

len(int_features), len(float_features), len(cat_features)

(21, 346, 0)

In [47]:
# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each dtype group to its transformer. Integer and float columns share
# the same numeric recipe.
preprocessor = ColumnTransformer(
    transformers=[
        ('int', numeric_transformer, int_features),
        ('float', numeric_transformer, float_features),
        ('cat', categorical_transformer, cat_features)
    ])

# SMOTE and RandomUnderSampler are stochastic, so both get a fixed seed to
# make the resampled data reproducible. TomekLinks takes no seed.
smote = SMOTE(sampling_strategy=0.2, k_neighbors=5, random_state=42)
tomek = TomekLinks(sampling_strategy='majority')
# NOTE(review): `downsampler` is configured here but is not one of the
# pipeline steps below — confirm whether it was meant to be included.
downsampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42)


# Assemble the pipeline: preprocess, oversample with SMOTE, then remove
# Tomek links. It is only built here; fitting happens where fit_resample is
# called.
resampling_pipeline = ImbPipeline(steps=[('preprocessor', preprocessor),
                              ('smote', smote),
                              ('tomek', tomek)])

## Transforming and resampling the training set

In [48]:
%%time
# Fit the pipeline on the training split only and return the preprocessed,
# resampled features and target. The recorded run took about 13 minutes of
# wall time.
X_resampled, y_resampled = resampling_pipeline.fit_resample(training_features, train_target)

CPU times: user 1h 41min 42s, sys: 4.42 s, total: 1h 41min 46s
Wall time: 13min 21s


In [49]:
# Report the dimensions of the resampled training matrix and its labels,
# one per line.
print(
    f"Resampled training set shape: {X_resampled.shape}",
    f"Resampled target shape: {y_resampled.shape}",
    sep="\n",
)

Resampled training set shape: (595940, 367)
Resampled target shape: (595940,)


In [50]:
# Save the resampled data
np.save('X_resampled.npy', X_resampled)
np.save('y_resampled.npy', y_resampled)

## Transforming the test set

In [51]:
# Apply only the pipeline's preprocessing step to the held-out features. The
# SMOTE and Tomek-link steps are not invoked here, so the test rows are
# transformed but never resampled.
X_test_transformed = resampling_pipeline.named_steps['preprocessor'].transform(test_features)

In [52]:
# Report the dimensions of the transformed test matrix and its labels,
# one per line.
print(
    f"Transformed test set shape: {X_test_transformed.shape}",
    f"Test target shape: {test_target.shape}",
    sep="\n",
)

Transformed test set shape: (212856, 367)
Test target shape: (212856,)


In [53]:
# Save the transformed test data
np.save('X_test_transformed.npy', X_test_transformed)
np.save('test_target.npy', test_target)