# 04 - High cardinality management: Frequency encoding  

Technique for managing high cardinality categorical features by replacing each category with its frequency (the number of occurrences)

In [4]:
import pandas as pd

df = pd.read_csv("Numeric data (last year work).csv")

# Step 1: Calculate the frequency of each category
frequency = df["Duration"].value_counts()

# Step 2: Map the frequency back to the dataset
df["Duration_Freq"] = df["Duration"].map(frequency)

# Display the updated DataFrame
print("DataFrame with Frequency Encoding:")
print(df)


DataFrame with Frequency Encoding:
      Rating     Votes  Meta Score  Year  Duration  Action  Adventure  \
0        6.5   90000.0        67.0  2023       138       0          0   
1        7.4   24000.0        66.0  2023       116       0          1   
2        7.0   21000.0        85.0  2023       117       0          0   
3        7.1   56000.0        54.0  2023       157       1          1   
4        6.6   66000.0        64.0  2023       158       1          1   
...      ...       ...         ...   ...       ...     ...        ...   
1126     6.3   22000.0        67.0  2021       141       0          0   
1127     7.1  172000.0        59.0  2008       112       0          0   
1128     7.6  198000.0        79.0  1986        96       0          0   
1129     6.5   71000.0        46.0  1992       104       0          0   
1130     7.1  203000.0        65.0  2017       115       1          0   

      Animation  Biography  Comedy  ...  PG Rating_PG  PG Rating_PG-13  \
0             

In [6]:
# let's create a linear regression model
# and see if frequency encoding helped to improve metrics

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Step 1: Prepare the dataset
# Drop any unnecessary columns (if needed)
# Assume 'Rating' is the target variable
target = 'Rating'  # Replace with the name of your target column
features = df.drop(columns=[target], errors='ignore')  # Drop target column to use the rest as features
X = features
y = df[target]

# Handle missing values (if any) by filling with median values
X = X.fillna(X.median())

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train a Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 4: Make Predictions
y_pred = model.predict(X_test)

# Step 5: Evaluate the Model
print("\nModel Performance:")
print("MAE:", round(mean_absolute_error(y_test, y_pred), 2))
print("MSE:", round(mean_squared_error(y_test, y_pred), 2))
print("RMSE:", round(np.sqrt(mean_squared_error(y_test, y_pred)), 2))
print("R-squared:", round(r2_score(y_test, y_pred), 2))


# original metrics:
# MAE 0.3
# MSE 0.15 ^2
# RMSE: 0.38
# R-squared: 0.44

# the encoding tool did not improved metrics
# moreover, made it even worse
# probably, because the year has many variables
# and too many variabled were enconded


Model Performance:
MAE: 0.29
MSE: 0.13
RMSE: 0.37
R-squared: 0.5
