# 04 - High cardinality management: Grouping rare categories together

Grouping rare categories involves combining infrequent categories into a single "Other" category or a similar label. This helps reduce noise and improve model generalization, especially when dealing with high-cardinality categorical features.

*very interesting tool, 
I am excited to try it our

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [6]:
# Load dataset
df = pd.read_csv("bank_numeric.csv")

# Define features and target
target_column = "deposit"
X = df.drop(columns=[target_column])
y = df[target_column]

In [7]:
# I do not have categorical columns
# that is why I create them here in order to implement this tool

# Identify Categorical Columns Dynamically
# A column is considered categorical if it has fewer than or equal to 10 unique values
CATEGORICAL_THRESHOLD = 10
categorical_columns = [col for col in X.columns if X[col].nunique() <= CATEGORICAL_THRESHOLD]
print("\nCategorical Columns for Grouping Rare Categories:", categorical_columns)

# Threshold for rare categories (frequency below this is considered rare)
RARE_THRESHOLD = 0.05  # 5% of total rows



Categorical Columns for Grouping Rare Categories: ['marital', 'education', 'default', 'housing', 'loan', 'contact', 'previous', 'poutcome']


In [None]:
# Group Rare Categories
def group_rare_categories(df, column, threshold):
    # Calculate category frequencies
    category_frequencies = df[column].value_counts(normalize=True)
    
    # Identify rare categories
    rare_categories = category_frequencies[category_frequencies < threshold].index
    
    # Replace rare categories with "Other"
    df[column] = df[column].apply(lambda x: "Other" if x in rare_categories else x)
    return df

# Apply grouping to all identified categorical columns
for col in categorical_columns:
    X = group_rare_categories(X, col, RARE_THRESHOLD)

# Check the result of grouping
print("\nSample Categorical Column After Grouping Rare Categories:\n")
print(X[categorical_columns].head())

# I understood a small issue, that
# the code was supposed to replace
# string and other types of format
#  with numeric codes
# but I already had numeric data
# that is why I will load now uncleaned data
# and try again to process everything




Sample Categorical Column After Grouping Rare Categories:

   marital education default  housing  loan contact previous  poutcome
0        1         1       0        1     0       2        0         3
1        2         2       0        1     1       2        0         3
2        1         1       0        1     0       2        0         3
3        2         1       0        1     0       2        0         3
4        1         1       0        1     0       2        0         3


In [9]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
# One-Hot Encoding (for Logistic Regression)
X_train_encoded = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_columns, drop_first=True)

# Ensure the test set has the same columns as the training set
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)



In [11]:
# Train Logistic Regression Model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_encoded, y_train)

# Predict and Evaluate
y_pred = log_reg.predict(X_test_encoded)
print("\nMetrics After Grouping Rare Categories:")
print(classification_report(y_test, y_pred))
print("\nModel overall accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))



Metrics After Grouping Rare Categories:
              precision    recall  f1-score   support

           0       0.80      0.89      0.84       915
           1       0.82      0.69      0.75       663

    accuracy                           0.81      1578
   macro avg       0.81      0.79      0.80      1578
weighted avg       0.81      0.81      0.81      1578


Model overall accuracy: 80.80%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# AGAIN
# but with the uncleaned dataset

# Load dataset
df = pd.read_csv("bank.csv")

# Define features and target
target_column = "deposit"
X = df.drop(columns=[target_column])
y = df[target_column]

# Identify high-cardinality categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns
print("\nCategorical Columns for Grouping Rare Categories:", categorical_columns.tolist())

# Threshold for rare categories (frequency below this is considered rare)
RARE_THRESHOLD = 0.05  # 5% of total rows

# Step 1: Group Rare Categories
def group_rare_categories(df, column, threshold):
    # Calculate category frequencies
    category_frequencies = df[column].value_counts(normalize=True)
    
    # Identify rare categories
    rare_categories = category_frequencies[category_frequencies < threshold].index
    
    # Replace rare categories with "Other"
    df[column] = df[column].apply(lambda x: "Other" if x in rare_categories else x)
    return df

# Apply grouping to all categorical columns
for col in categorical_columns:
    X = group_rare_categories(X, col, RARE_THRESHOLD)

# Check the result of grouping
print("\nSample Categorical Column After Grouping Rare Categories:")
print(X[categorical_columns].head())



Categorical Columns for Grouping Rare Categories: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

Sample Categorical Column After Grouping Rare Categories:
          job  marital  education default housing loan  contact month poutcome
0      admin.  married  secondary      no     yes   no  unknown   may  unknown
1      admin.  married  secondary      no      no   no  unknown   may  unknown
2  technician  married  secondary      no     yes   no  unknown   may  unknown
3    services  married  secondary      no     yes   no  unknown   may  unknown
4      admin.  married   tertiary      no      no   no  unknown   may  unknown


In [None]:
# to be honest, I do not see any changes
# that is why I decided to verify the grouping worked

# let's check the unique values of each column 
# and look for the "Other" label
for col in categorical_columns:
    print(f"Unique values in {col}: {X[col].unique()}")

# "Other" should appear in columns where rare categories were grouped

Unique values in job: ['admin.' 'technician' 'services' 'management' 'retired' 'blue-collar'
 'Other']
Unique values in marital: ['married' 'single' 'divorced']
Unique values in education: ['secondary' 'tertiary' 'primary' 'Other']
Unique values in default: ['no' 'Other']
Unique values in housing: ['yes' 'no']
Unique values in loan: ['no' 'yes']
Unique values in contact: ['unknown' 'cellular' 'telephone']
Unique values in month: ['may' 'jun' 'jul' 'aug' 'Other' 'nov' 'feb' 'apr']
Unique values in poutcome: ['unknown' 'Other' 'failure' 'success']


In [17]:
# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: One-Hot Encoding (for Logistic Regression)
X_train_encoded = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_columns, drop_first=True)

# Ensure the test set has the same columns as the training set
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Step 4: Train Logistic Regression Model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_encoded, y_train)

# Predict and Evaluate
y_pred = log_reg.predict(X_test_encoded)
print("\nMetrics After Grouping Rare Categories:")
print(classification_report(y_test, y_pred))
print("\nModel overall accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))


Metrics After Grouping Rare Categories:
              precision    recall  f1-score   support

          no       0.80      0.84      0.82      1742
         yes       0.82      0.77      0.79      1607

    accuracy                           0.81      3349
   macro avg       0.81      0.80      0.80      3349
weighted avg       0.81      0.81      0.80      3349


Model overall accuracy: 80.50%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# outcome:
# I tried this tool and the metric did not changed at all