### Handling Imbalanced Data
- Aka, data with a strong bias
- Has to be handled so AI or statistics aren't biased too

#### Resampling (Over/Under-sampling)
- Emphasizes the data that was biased against or downplays the data that was biased towards

In [1]:
from sklearn.datasets import make_classification

X, y = make_classification(
    n_classes=2,
    class_sep=2,
    weights=[0.1, 0.9],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=20,
    n_clusters_per_class=1,
    n_samples=1000,
    random_state=42,
)
print("X", X)
print("y", y)

X [[-2.59816534e-01 -3.12491348e-01 -1.31384594e+00 ...  2.03799964e+00
   1.24989650e+00  5.25719806e-01]
 [ 9.92252126e-01  5.61855469e-04 -1.29784421e-01 ...  1.76895576e+00
   6.10236006e-01  1.20070775e+00]
 [-2.61300111e-01  4.15475152e-01 -1.51853634e+00 ...  2.25542565e+00
  -5.93674512e-02 -2.46197971e+00]
 ...
 [-9.74790878e-01 -1.35564930e+00 -2.25150653e+00 ...  2.21028481e+00
  -1.07223595e+00 -1.26660882e-01]
 [ 1.74488199e-01 -7.29036428e-01 -9.48593005e-01 ...  1.46723474e+00
   4.37822879e-01 -9.63760598e-01]
 [ 6.22602338e-01 -1.03537589e+00 -3.23606081e+00 ...  1.73916950e+00
   1.15481512e+00  2.59338575e+00]]
y [1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0

In [2]:
from collections import Counter

print("Distribution", Counter(y))

Distribution Counter({np.int64(1): 900, np.int64(0): 100})


In [22]:
%conda install -c conda-forge imbalanced-learn

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: c:\Users\paul\AppData\anaconda3\envs\g4g-ds

  added / updated specs:
    - imbalanced-learn


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2025.4.26  |       h4c7d964_0         149 KB  conda-forge
    certifi-2025.4.26          |     pyhd8ed1ab_0         154 KB  conda-forge
    imbalanced-learn-0.13.0    |     pyhd8ed1ab_0         141 KB  conda-forge
    sklearn-compat-0.1.3       |     pyhd8ed1ab_0          22 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         466 KB

The following NEW packages will be INSTALLED:

  imbalanced-learn   conda-forge/noarch::imbalanced-learn-0.13.0-pyhd8ed1ab_0 
  sklearn-compat     conda-forge/no

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): conda.anaconda.org:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): conda.anaconda.org:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:https://repo.anaconda.com:443 "GET /pkgs/r/win-64/current_repodata.json HTTP/1.1" 304 0
DEBUG:urllib3.connectionpool:https://repo.anaconda.com:443 "GET /pkgs/msys2/noarch/current_repodata.json HTTP/1.1" 304 0
DEBUG:urllib3.connectionpool:https://repo.anaconda.com:443 "GET /pkgs/r/

In [23]:
from imblearn.over_sampling import RandomOverSampler

oversample = RandomOverSampler(sampling_strategy="minority")
X_over, y_over = oversample.fit_resample(X, y)
print("Oversampled", Counter(y_over))

Oversampled Counter({np.int64(1): 900, np.int64(0): 900})


In [24]:
from imblearn.under_sampling import RandomUnderSampler

undersample = RandomUnderSampler(sampling_strategy="majority")
X_under, y_under = undersample.fit_resample(X, y)
print("Undersampled", Counter(y_under))

Undersampled Counter({np.int64(0): 100, np.int64(1): 100})


#### BalancedBaggingClassifier
- Auto-resamples to avoid bias

In [25]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(
    n_classes=2,
    class_sep=2,
    weights=[0.1, 0.9],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=20,
    n_clusters_per_class=1,
    n_samples=1000,
    random_state=42,
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [26]:
from sklearn.ensemble import RandomForestClassifier

base_classifier = RandomForestClassifier(random_state=42)
base_classifier

In [27]:
from imblearn.ensemble import BalancedBaggingClassifier

balanced_bagging_classifier = BalancedBaggingClassifier(
    base_classifier,
    sampling_strategy="auto",  # You can adjust this parameter
    replacement=False,  # Whether to sample with or without replacement
    random_state=42,
)
balanced_bagging_classifier

In [28]:
balanced_bagging_classifier.fit(X_train, y_train)
y_pred = balanced_bagging_classifier.predict(X_test)
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 1])

In [29]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

1.0

In [30]:
from sklearn.metrics import classification_report

classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00        13\n           1       1.00      1.00      1.00       187\n\n    accuracy                           1.00       200\n   macro avg       1.00      1.00      1.00       200\nweighted avg       1.00      1.00      1.00       200\n'

#### Synthetic Minority Oversampling Technique (SMOTE)
- Synthetically generates artificial data for the minority attributes, instead of simply suplicating records

In [31]:
from sklearn.datasets import make_classification

X, y = make_classification(
    n_classes=2,
    class_sep=2,
    weights=[0.1, 0.9],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=20,
    n_clusters_per_class=1,
    n_samples=1000,
    random_state=42,
)

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [33]:
from collections import Counter

Counter(y_train)

Counter({np.int64(1): 713, np.int64(0): 87})

In [34]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy="auto", random_state=42)
smote

In [35]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
Counter(y_train_resampled)

Counter({np.int64(1): 713, np.int64(0): 713})

#### Threshold Moving
- Determines probabilities of calss membership
- Conventionally 0.5, but in cases of bias, it can be changed

In [36]:
from sklearn.datasets import make_classification

X, y = make_classification(
    n_classes=2,
    class_sep=2,
    weights=[0.1, 0.9],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=20,
    n_clusters_per_class=1,
    n_samples=1000,
    random_state=42,
)

In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [38]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model

In [39]:
model.fit(X_train, y_train)
y_proba = model.predict_proba(X_test)[:, 1]
y_proba

array([1.  , 0.97, 1.  , 0.99, 1.  , 0.99, 0.99, 0.99, 0.96, 0.99, 0.99,
       1.  , 1.  , 0.98, 1.  , 0.95, 1.  , 0.99, 0.98, 1.  , 1.  , 1.  ,
       0.87, 1.  , 0.99, 0.96, 1.  , 0.99, 0.99, 1.  , 1.  , 1.  , 1.  ,
       0.95, 0.94, 0.99, 0.99, 0.99, 0.98, 0.29, 1.  , 0.97, 0.95, 0.95,
       0.95, 0.97, 1.  , 1.  , 0.96, 0.96, 0.9 , 0.99, 1.  , 1.  , 0.96,
       0.99, 1.  , 0.88, 0.99, 0.98, 0.99, 0.99, 0.09, 1.  , 1.  , 1.  ,
       1.  , 1.  , 1.  , 0.99, 1.  , 0.99, 1.  , 1.  , 1.  , 0.98, 0.97,
       1.  , 0.99, 0.99, 1.  , 0.99, 0.99, 0.98, 0.11, 0.98, 0.98, 0.98,
       0.99, 0.99, 0.97, 0.99, 0.97, 1.  , 0.13, 0.99, 0.97, 0.98, 0.99,
       1.  , 0.98, 1.  , 0.91, 1.  , 0.98, 1.  , 1.  , 0.92, 1.  , 0.98,
       0.08, 0.97, 0.96, 0.99, 1.  , 0.99, 0.99, 0.94, 0.98, 0.99, 1.  ,
       1.  , 0.88, 1.  , 0.97, 1.  , 1.  , 1.  , 0.98, 0.98, 0.97, 0.98,
       1.  , 0.07, 0.11, 0.99, 0.99, 1.  , 0.99, 0.97, 1.  , 1.  , 1.  ,
       0.99, 0.99, 0.99, 0.98, 0.99, 0.99, 0.98, 1.

In [40]:
from sklearn.metrics import f1_score

threshold = 0.5

# Adjust threshold based on your criteria (e.g., maximizing F1-score)
while threshold >= 0:
    y_pred = (y_proba >= threshold).astype(int)
    f1 = f1_score(y_test, y_pred)

    print(f"Threshold: {threshold:.2f}, F1 Score: {f1:.4f}")

    # Move the threshold (you can customize the step size)
    threshold -= 0.02

Threshold: 0.50, F1 Score: 1.0000
Threshold: 0.48, F1 Score: 1.0000
Threshold: 0.46, F1 Score: 1.0000
Threshold: 0.44, F1 Score: 1.0000
Threshold: 0.42, F1 Score: 1.0000
Threshold: 0.40, F1 Score: 1.0000
Threshold: 0.38, F1 Score: 1.0000
Threshold: 0.36, F1 Score: 1.0000
Threshold: 0.34, F1 Score: 1.0000
Threshold: 0.32, F1 Score: 1.0000
Threshold: 0.30, F1 Score: 1.0000
Threshold: 0.28, F1 Score: 0.9973
Threshold: 0.26, F1 Score: 0.9973
Threshold: 0.24, F1 Score: 0.9973
Threshold: 0.22, F1 Score: 0.9947
Threshold: 0.20, F1 Score: 0.9947
Threshold: 0.18, F1 Score: 0.9947
Threshold: 0.16, F1 Score: 0.9920
Threshold: 0.14, F1 Score: 0.9920
Threshold: 0.12, F1 Score: 0.9894
Threshold: 0.10, F1 Score: 0.9842
Threshold: 0.08, F1 Score: 0.9740
Threshold: 0.06, F1 Score: 0.9664
Threshold: 0.04, F1 Score: 0.9664
Threshold: 0.02, F1 Score: 0.9664
