In [5]:
import pandas as pd

In [6]:
# Location of the heart-disease CSV, given relative to the notebook's working directory.
# NOTE(review): `df` is rebound to a small soccer toy frame further down, and the
# visible cells never use the heart-disease data afterwards — confirm this load is still needed.
heart_disease_csv = '../../../datasets/heart-disease.csv'
df = pd.read_csv(heart_disease_csv)

In [7]:
# Preview the first rows of the loaded frame (head() returns 5 rows by default).
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# Ordered Target Encoding

### Example Scenario
We have a dataset containing information about soccer matches. The dataset includes:
- `Match`: The number identifying the match.
- `Team`: The name of the team playing in the match.
- `Goals`: The number of goals scored by the team in that match.

Our goal is to transform the categorical feature `Team` into a numeric feature using **Ordered Target Encoding**, which captures the relationship between the team and the number of goals scored.

In [1]:
import pandas as pd

# Toy dataset: one record per team appearance, written as (Match, Team, Goals).
# NOTE(review): match number 5 appears twice (fifth and last records) — confirm
# whether the last record was meant to be match 8.
records = [
    (1, 'Manchester', 3),
    (2, 'Barca', 1),
    (3, 'Manchester', 4),
    (4, 'Real', 5),
    (5, 'Barca', 2),
    (6, 'Manchester', 2),
    (7, 'Real', 3),
    (5, 'Manchester', 1),
]

# Transpose the records into one list per column, keeping the original column order.
match_ids, team_names, goal_counts = (list(column) for column in zip(*records))
data = {'Match': match_ids, 'Team': team_names, 'Goals': goal_counts}

df = pd.DataFrame(data)
df

Unnamed: 0,Match,Team,Goals
0,1,Manchester,3
1,2,Barca,1
2,3,Manchester,4
3,4,Real,5
4,5,Barca,2
5,6,Manchester,2
6,7,Real,3
7,5,Manchester,1


In [4]:
# NOTE(review): this cell (In [4]) is placed above the cell that defines
# ordered_target_encoding (In [2]), so a fresh-kernel "Run All" raises NameError here.
# It also repeats the call made right after the definition — consider removing this copy.
df['Encoded_Team'] = ordered_target_encoding(df, cat_column='Team', target_column='Goals')
df

Unnamed: 0,Match,Team,Goals,Encoded_Team
0,1,Manchester,3,0.0
1,2,Barca,1,0.0
2,3,Manchester,4,3.0
3,4,Real,5,0.0
4,5,Barca,2,1.0
5,6,Manchester,2,3.5
6,7,Real,3,5.0
7,5,Manchester,1,3.0


### Step 1: Sequential Processing
We process the rows in order and compute the encoded value for each `Team` as the average number of goals that team scored in all **preceding rows**. The current match's goals are excluded to prevent data leakage. When a team has no preceding rows (its first appearance), the encoding falls back to a prior value (`prior_mean`, which defaults to 0).

In [2]:
def ordered_target_encoding(df, cat_column, target_column, prior_mean=0):
    """Encode a categorical column with the running mean of the target over earlier rows.

    Rows are visited in the DataFrame's existing order. Each row is encoded with
    the mean of ``target_column`` over the *preceding* rows that share its
    category. A row's own target is added to the running statistics only after
    its encoding has been recorded, so it never contributes to its own encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is only read, never modified.
    cat_column : str
        Name of the categorical column to encode (values must be hashable).
    target_column : str
        Name of the numeric target column.
    prior_mean : float, default 0
        Encoding used for a row whose category has not appeared in any earlier row.

    Returns
    -------
    list
        One encoded value per row, in row order.
    """
    encoded_values = []
    # category -> [sum of targets seen so far, number of rows seen so far]
    running_totals = {}

    # Read the two columns directly instead of using ``df.iterrows()``.
    # iterrows builds a Series per row and coerces every column to one common
    # dtype, so e.g. large integer category ids in a frame that also holds a
    # float column get rounded to float and distinct ids can collide.
    categories = df[cat_column].tolist()
    targets = df[target_column].tolist()

    for category, target_value in zip(categories, targets):
        totals = running_totals.get(category)
        if totals is None:
            # First appearance: no history yet, so fall back to the prior.
            encoded_values.append(prior_mean)
            running_totals[category] = [target_value, 1]
        else:
            # Encode from history first, then fold in the current row.
            encoded_values.append(totals[0] / totals[1])
            totals[0] += target_value
            totals[1] += 1

    return encoded_values

In [3]:
# Store the ordered target encoding of `Team` as a new column. prior_mean is left at
# its default of 0, which is the value given to each team's first appearance.
df['Encoded_Team'] = ordered_target_encoding(df, cat_column='Team', target_column='Goals')
df

Unnamed: 0,Match,Team,Goals,Encoded_Team
0,1,Manchester,3,0.0
1,2,Barca,1,0.0
2,3,Manchester,4,3.0
3,4,Real,5,0.0
4,5,Barca,2,1.0
5,6,Manchester,2,3.5
6,7,Real,3,5.0
7,5,Manchester,1,3.0


# Ordered Boosting

In XGBoost, every time we try to predict the residuals, we use the entire dataset.

CatBoost uses ordered boosting, which means that every time it tries to predict a data point, it only uses the data points that come before it in the ordering. This helps mitigate overfitting, especially on small datasets.