In [90]:
import pandas as pd
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

def check_and_undersample(df, target_col, imbalance_threshold=0.2, random_state=42):
    """
    Check a DataFrame's target column for class imbalance and randomly
    undersample the larger classes when the imbalance is too strong.

    Args:
        df (pd.DataFrame): Input data holding both the features and the target column.
        target_col (str): Name of the target column in ``df``.
        imbalance_threshold (float): The data is treated as imbalanced when
            minority_count / majority_count < imbalance_threshold.
            Defaults to 0.2 (20%).
        random_state (int): Seed handed to RandomUnderSampler so the sampled
            rows are reproducible. Defaults to 42.

    Returns:
        pd.DataFrame: A copy of the original frame, or the undersampled frame.
            In both cases the frame still CONTAINS the target column, so drop
            it before using the frame as a feature matrix.
        pd.Series: The original or undersampled target values.
        bool: True if undersampling was performed, False otherwise.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    # Calculate class distribution
    class_counts = Counter(y)
    print("Original class distribution:")
    print(class_counts)

    # With zero or one class there is no minority/majority pair to compare.
    # Checking this first also keeps max()/min() below from raising
    # ValueError on an empty frame.
    if len(class_counts) < 2:
        print("No data or only one class found. Cannot assess imbalance.")
        return df.copy(), y, False

    majority_class_count = max(class_counts.values())
    minority_class_count = min(class_counts.values())

    # Every counted class has at least one row, so the division is safe.
    imbalance_ratio = minority_class_count / majority_class_count
    print(f"\nImbalance Ratio (Minority/Majority): {imbalance_ratio:.4f}")

    is_imbalanced = imbalance_ratio < imbalance_threshold
    if not is_imbalanced:
        print(f"Data is relatively balanced (ratio >= {imbalance_threshold}). No undersampling needed.")
        return df.copy(), y, False

    print(f"Data is imbalanced (ratio < {imbalance_threshold}). Performing Random Undersampling.")
    rus = RandomUnderSampler(random_state=random_state)
    X_resampled, y_resampled = rus.fit_resample(X, y)

    print("\nClass distribution after Random Undersampling:")
    print(Counter(y_resampled))

    # Re-attach the target so the caller gets one frame shaped like the input.
    df_resampled = pd.concat([X_resampled, y_resampled], axis=1)

    return df_resampled, y_resampled, True
        

In [97]:
# --- Example Usage ---
# Toy, deliberately skewed dataset (swap in your real data):
# 100 rows, with target classes 0/1/2 appearing 80/12/8 times.
data = dict(
    feature1=range(100),
    feature2=range(100, 200),
    target=[label for label, size in ((0, 80), (1, 12), (2, 8)) for _ in range(size)],
)

In [98]:
df_sample = pd.DataFrame(data)
target_column_name = 'target'

# Check for imbalance and undersample if necessary.
# NOTE: despite its name, X_processed is the full frame returned by
# check_and_undersample, so it still contains the target column.
X_processed, y_processed, was_undersampled = check_and_undersample(df_sample, target_column_name)

# Target-free feature matrix. Train on this (not on X_processed) together
# with y_processed, otherwise the label leaks into the features.
features_processed = X_processed.drop(columns=[target_column_name])

if was_undersampled:
    print("\nUndersampling was performed.")
else:
    print("\nNo undersampling was performed.")

# The same summary applies whether or not undersampling ran.
print("The dataframe size is " + str(len(X_processed)))
print(X_processed.head())

Original class distribution:
Counter({0: 80, 1: 12, 2: 8})

Imbalance Ratio (Minority/Majority): 0.1000
Data is imbalanced (ratio < 0.2). Performing Random Undersampling.

Class distribution after Random Undersampling:
Counter({0: 8, 1: 8, 2: 8})

Undersampling was performed.
The dataframe size is 24
    feature1  feature2  target
30        30       130       0
0          0       100       0
22        22       122       0
31        31       131       0
18        18       118       0


In [99]:
# Inspect the processed frame; note that it still carries the `target` column.
X_processed

Unnamed: 0,feature1,feature2,target
30,30,130,0
0,0,100,0
22,22,122,0
31,31,131,0
18,18,118,0
28,28,128,0
10,10,110,0
70,70,170,0
81,81,181,1
85,85,185,1


In [100]:
# Largest per-class row count in the processed target. When undersampling ran,
# the classes end up equally sized, so this is the size of every class.
max(Counter(y_processed).values())

8