# 04 - High cardinality management: Feature hashing 

Feature Hashing, also known as the "hashing trick," is an efficient way to handle high-cardinality categorical features. It maps categories to a fixed number of bins using a hash function, reducing dimensionality without explicitly creating columns for each unique category.

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import FeatureHasher
from sklearn.metrics import accuracy_score, classification_report

In [26]:
# Load dataset
df = pd.read_csv("bank_numeric.csv")

# Define features and target
target_column = "deposit"
X = df.drop(columns=[target_column])
y = df[target_column]

In [27]:
# Identify high-cardinality categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns
print("\nCategorical Columns for Feature Hashing:", categorical_columns.tolist())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# the code is attempting to identify categorical columns
# but there are no columns in the dataset 
# have the data type object


Categorical Columns for Feature Hashing: []


In [28]:
# Identify columns with a small number of unique values
# threshold is 10
potential_categorical_columns = [col for col in X.columns if X[col].nunique() <= 10]
print("Potential Categorical Columns:", potential_categorical_columns)


Potential Categorical Columns: ['marital', 'education', 'default', 'housing', 'loan', 'contact', 'previous', 'poutcome']


In [29]:
# Convert identified columns to object type
categorical_columns = ['marital', 'education', 'default', 'housing', 'loan', 'contact', 'previous', 'poutcome']
X[categorical_columns] = X[categorical_columns].astype('object')

# Confirm the changes
print(X.dtypes[categorical_columns])


marital      object
education    object
default      object
housing      object
loan         object
contact      object
previous     object
poutcome     object
dtype: object


In [30]:
# Step 1: Logistic Regression BEFORE Feature Hashing
# For simplicity, drop the high-cardinality categorical columns
X_train_no_hash = X_train.drop(columns=categorical_columns)
X_test_no_hash = X_test.drop(columns=categorical_columns)

# Train logistic regression on the dataset without hashing
log_reg_no_hash = LogisticRegression(max_iter=1000, random_state=42)
log_reg_no_hash.fit(X_train_no_hash, y_train)

# Predict and evaluate metrics
y_pred_no_hash = log_reg_no_hash.predict(X_test_no_hash)
print("\nMetrics Before Feature Hashing:")
print(classification_report(y_test, y_pred_no_hash))
acc_no_hash = accuracy_score(y_test, y_pred_no_hash)
print("\nModel overall accuracy (Before Feature Hashing): {:.2f}%".format(acc_no_hash * 100))



Metrics Before Feature Hashing:
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       915
           1       0.83      0.65      0.73       663

    accuracy                           0.80      1578
   macro avg       0.81      0.78      0.78      1578
weighted avg       0.80      0.80      0.79      1578


Model overall accuracy (Before Feature Hashing): 79.78%


In [31]:
# Step 2: Apply Feature Hashing
def apply_feature_hashing(df, columns, n_features):
    hasher = FeatureHasher(n_features=n_features, input_type='string')
    
    # Combine categorical columns row-wise (iterable of iterables of strings)
    combined_categorical = df[columns].astype(str).values.tolist()
    
    # Apply FeatureHasher to the combined categorical columns
    hashed_features = hasher.transform(combined_categorical).toarray()
    
    # Return hashed features as a DataFrame
    hashed_df = pd.DataFrame(hashed_features, columns=[f"hash_{i}" for i in range(n_features)])
    
    # Concatenate hashed features with remaining numeric columns
    return pd.concat([df.drop(columns=columns), hashed_df], axis=1)

# Number of hashed features
n_hash_features = 10

# Apply hashing to training and test sets
X_train_hashed = apply_feature_hashing(X_train, categorical_columns, n_hash_features)
X_test_hashed = apply_feature_hashing(X_test, categorical_columns, n_hash_features)

print("\nDataset Shape After Feature Hashing:", X_train_hashed.shape)


Dataset Shape After Feature Hashing: (4781, 18)


In [33]:
# Drop rows with NaN values
X_train_hashed = X_train_hashed.dropna()
y_train = y_train[X_train_hashed.index]  # Ensure target matches after dropping rows

X_test_hashed = X_test_hashed.dropna()
y_test = y_test[X_test_hashed.index]  # Ensure target matches after dropping rows


In [34]:
# Step 3: Logistic Regression AFTER Feature Hashing
log_reg_hashed = LogisticRegression(max_iter=1000, random_state=42)
log_reg_hashed.fit(X_train_hashed, y_train)

# Predict and evaluate metrics
y_pred_hashed = log_reg_hashed.predict(X_test_hashed)
print("\nMetrics After Feature Hashing:")
print(classification_report(y_test, y_pred_hashed))
acc_hashed = accuracy_score(y_test, y_pred_hashed)
print("\nModel overall accuracy (After Feature Hashing): {:.2f}%".format(acc_hashed * 100))


Metrics After Feature Hashing:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.78      0.88       504

    accuracy                           0.78       504
   macro avg       0.50      0.39      0.44       504
weighted avg       1.00      0.78      0.88       504


Model overall accuracy (After Feature Hashing): 78.17%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# insights:

# first of all,
# I had to change the types of columns
# because I did not have objects for hashing

# secondly,
# after hashing I had some Nan values
# which I had to delete 

# thirdly,
# this technique did not improve metrics
# even made it worse in my case