In [5]:
import polars as pl

In [6]:
def compute_numerical_feature_from_categorical(dataset, y, categorical_feature_name, numerical_feature_name):

    feature_mean_value = dataset \
        .group_by(categorical_feature_name) \
            .agg(pl.col(y).mean()) \
        .sort(y)

    feature_mean_value = feature_mean_value.rename({y:numerical_feature_name})

    dataset = dataset.join(feature_mean_value, on=categorical_feature_name, how='left')
    dataset = dataset.drop(categorical_feature_name, numerical_feature_name+'_right')
    return dataset

### Read data

In [7]:
player_df = pl.read_csv('silver\Player.csv', separator=';')
player_team_df = pl.read_csv('silver\PlayerTeam.csv', separator=';')
team_df = pl.read_csv('silver\Team.csv', separator=';')
team_league_df = pl.read_csv('silver\TeamLeague.csv', separator=';')
nationality_df = pl.read_csv('silver\Nationality.csv', separator=';')
foot_df = pl.read_csv('silver\Foot.csv', separator=';')
position_df = pl.read_csv('silver\Position.csv', separator=';')

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 6-7: malformed \N character escape (727476159.py, line 5)

### Join data

In [None]:
dataset = player_df.join(player_team_df, on='PlayerId', how='left')
dataset = dataset \
    .join(team_df, on='TeamId', how='left') \
    .join(team_league_df, on='TeamId', how='left') \
    .join(foot_df, on='FootId', how='left') \
    .join(position_df, on='PositionId', left='left') \
    .join(nationality_df, on='NationalityId', left='left')
    
dataset.head()

In [4]:
dataset = player_df.join(player_team_df, on='PlayerId', how='left')
dataset = dataset \
    .join(team_df, on='TeamId', how='left') \
    .join(team_league_df, on='TeamId', how='left') \
    .drop('Team')

dataset.head()

PlayerId,PlayerName,Nationality,Age,Height,Weigh,Foot,Position,Value,Wage,Crossing,Finishing,Heading accuracy,Short passing,Volleys,Dribbling,Curve,FK Accuracy,Long passing,Ball control,Acceleration,Sprint speed,Agility,Reactions,Balance,Shot power,Jumping,Stamina,Strength,Long shots,Aggression,Interceptions,Att. Position,Vision,Penalties,Composure,Defensive awareness,Standing tackle,Sliding tackle,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes,TeamId,League,LeagueId
i64,str,str,i64,i64,i64,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,i64
3061,"""Matteo Gabbia""","""Italy""",23,185,78,"""Right""","""SUB""","""€5.5M""","""€33K""",41,22,75,60,34,53,47,28,53,62,52,53,55,71,51,59,75,77,73,41,75,73,23,41,42,59,77,77,75,11,5,12,14,14,104,"""Serie A""",10
3412,"""Fabien Centonz…","""France""",27,182,75,"""Right""","""SUB""","""€1.8M""","""€14K""",69,42,61,69,49,65,65,47,66,68,75,77,71,58,75,64,77,81,75,54,71,68,66,66,50,69,66,71,69,14,12,9,12,13,121,"""Serie A""",10
4627,"""Tim Lemperle""","""Germany""",21,187,78,"""Right""","""LS""","""€2.7M""","""€9K""",64,73,61,63,55,73,53,45,55,72,77,70,67,58,63,66,76,55,69,51,49,27,69,53,55,58,30,33,30,9,10,8,7,7,163,"""2. Bundesliga""",6
5029,"""Niklas Tauer""","""Germany""",22,183,75,"""Right""","""SUB""","""€2.3M""","""€8K""",50,43,65,68,39,66,45,45,66,66,68,65,69,66,73,60,71,67,68,42,68,68,42,46,38,53,67,68,73,12,6,11,9,11,173,"""2. Bundesliga""",6
2373,"""Brian Ebenezer…","""Netherlands""",21,180,78,"""Right""","""ST""","""€32M""","""€15K""",35,79,70,73,67,67,49,44,53,77,82,85,65,78,86,79,90,73,92,64,70,26,83,52,66,75,28,23,24,10,11,15,8,13,87,"""Eredivisie""",3


### Feature engineering

#### NationalityId, PositionId, FootId, TeamId, LeagueId
Those columns contains categorical values, in order to not create an 'importance' scale based on their ID value, we need to encode them

- how can create numerical features without one-hot encoding
    - For : Nationality, Position, Foot, Team, League
        - y_level_mean = x.replace(y.groupby(x).mean()) 
        - JamesSteinEncoder
    - For Foot only
        -  WOEEncoder (for the foot)

In [16]:
dataset.head()

PlayerId,PlayerName,Nationality,Age,Height,Weigh,Foot,Position,Value,Wage,Crossing,Finishing,Heading accuracy,Short passing,Volleys,Dribbling,Curve,FK Accuracy,Long passing,Ball control,Acceleration,Sprint speed,Agility,Reactions,Balance,Shot power,Jumping,Stamina,Strength,Long shots,Aggression,Interceptions,Att. Position,Vision,Penalties,Composure,Defensive awareness,Standing tackle,Sliding tackle,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes,TeamId,League,LeagueId
i64,str,str,i64,i64,i64,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,i64
3061,"""Matteo Gabbia""","""Italy""",23,185,78,"""Right""","""SUB""","""€5.5M""","""€33K""",41,22,75,60,34,53,47,28,53,62,52,53,55,71,51,59,75,77,73,41,75,73,23,41,42,59,77,77,75,11,5,12,14,14,104,"""Serie A""",10
3412,"""Fabien Centonz…","""France""",27,182,75,"""Right""","""SUB""","""€1.8M""","""€14K""",69,42,61,69,49,65,65,47,66,68,75,77,71,58,75,64,77,81,75,54,71,68,66,66,50,69,66,71,69,14,12,9,12,13,121,"""Serie A""",10
4627,"""Tim Lemperle""","""Germany""",21,187,78,"""Right""","""LS""","""€2.7M""","""€9K""",64,73,61,63,55,73,53,45,55,72,77,70,67,58,63,66,76,55,69,51,49,27,69,53,55,58,30,33,30,9,10,8,7,7,163,"""2. Bundesliga""",6
5029,"""Niklas Tauer""","""Germany""",22,183,75,"""Right""","""SUB""","""€2.3M""","""€8K""",50,43,65,68,39,66,45,45,66,66,68,65,69,66,73,60,71,67,68,42,68,68,42,46,38,53,67,68,73,12,6,11,9,11,173,"""2. Bundesliga""",6
2373,"""Brian Ebenezer…","""Netherlands""",21,180,78,"""Right""","""ST""","""€32M""","""€15K""",35,79,70,73,67,67,49,44,53,77,82,85,65,78,86,79,90,73,92,64,70,26,83,52,66,75,28,23,24,10,11,15,8,13,87,"""Eredivisie""",3


In [14]:
dataset = compute_numerical_feature_from_categorical(dataset, 'Value', 'NationalityId', 'Nationality')
dataset = compute_numerical_feature_from_categorical(dataset, 'Value', 'PositionId', 'Position')
dataset = compute_numerical_feature_from_categorical(dataset, 'Value', 'FootId', 'Foot')
dataset = compute_numerical_feature_from_categorical(dataset, 'Value', 'TeamId', 'Team')
dataset = compute_numerical_feature_from_categorical(dataset, 'Value', 'LeagueId', 'League')

dataset.head()

ColumnNotFoundError: NationalityId

Error originated just after this operation:
DF ["PlayerId", "PlayerName", "Nationality", "Age"]; PROJECT */47 COLUMNS; SELECTION: "None"

#### Save dataset to gold directory

In [None]:
#dataset.write_csv('gold/dataset.csv', separator=";")