In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import lightgbm as ltb

from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score

import plotly.graph_objects as go
import plotly.express as px

#### NationalityId, PositionId, FootId, TeamId, LeagueId
These columns contain categorical values. To avoid implying an 'importance' scale based on their ID values, we need to encode them.

- How can we create numerical features without one-hot encoding?
    - For : Nationality, Position, Foot, Team, League
        - y_level_mean = x.replace(y.groupby(x).mean()) 
        - JamesSteinEncoder
    - For Foot only
        -  WOEEncoder (for the foot)

In [59]:
def compute_numerical_feature_from_categorical(dataset, y, categorical_feature_name, numerical_feature_name):
    """Replace a categorical column with the per-category mean of column ``y``.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame holding both ``categorical_feature_name`` and ``y``.
    y : str
        Name of the column that is averaged within each category.
    categorical_feature_name : str
        Name of the categorical column to encode; it is removed from the result.
    numerical_feature_name : str
        Name given to the new column that holds the per-category mean.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        The encoded frame, and the lookup table (one row per category, with
        columns ``categorical_feature_name`` and ``numerical_feature_name``)
        that can be passed to ``apply_encoder`` to encode another frame.
    """
    # One row per category: category value -> mean of `y`
    encoder = (
        dataset
        .groupby(categorical_feature_name)[y]
        .mean()
        .reset_index()
        .rename(columns={y: numerical_feature_name})
    )

    # Attach the mean to every row, then discard the raw categorical column
    encoded = (
        dataset
        .merge(encoder, on=categorical_feature_name, how='left')
        .drop(columns=[categorical_feature_name])
    )

    return encoded, encoder

In [60]:
def apply_encoder(dataset, encoder, categorical_feature_name, numerical_feature_name):
    """Encode a categorical column with a previously computed lookup table.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame containing ``categorical_feature_name``.
    encoder : pd.DataFrame
        Lookup table with columns ``categorical_feature_name`` and
        ``numerical_feature_name`` (as returned by
        ``compute_numerical_feature_from_categorical``).
    categorical_feature_name : str
        Column to encode; it is removed from the result.
    numerical_feature_name : str
        Name of the encoded column added to the result.

    Returns
    -------
    pd.DataFrame
        ``dataset`` with the categorical column replaced by its encoded value.

    Notes
    -----
    Categories missing from ``encoder`` come out of the left join as NaN.
    They are filled with the mean of the encoded column in ``dataset``; when
    no row matched at all (that mean is itself NaN) the mean of the encoder's
    values is used instead, so the result only keeps NaN if the encoder holds
    no usable value.
    """
    encoded = dataset.merge(encoder, on=categorical_feature_name, how='left')

    fill_value = encoded[numerical_feature_name].mean()
    if pd.isna(fill_value):
        # Every category was unseen: fall back to the encoder's own values
        fill_value = encoder[numerical_feature_name].mean()

    encoded[numerical_feature_name] = encoded[numerical_feature_name].fillna(fill_value)

    return encoded.drop(columns=[categorical_feature_name])

def get_player_name(player_ids, player_id_name):
    """Look up the name of each player id.

    Parameters
    ----------
    player_ids : pd.Series or similar
        Player ids; once wrapped in a DataFrame they must form a column
        named ``'PlayerId'``.
    player_id_name : pd.DataFrame
        Lookup table containing a ``'PlayerId'`` column.

    Returns
    -------
    pd.DataFrame
        The remaining lookup columns, left-joined onto ``player_ids``, with
        ``'PlayerId'`` removed.
    """
    ids_frame = pd.DataFrame(player_ids)
    with_names = ids_frame.merge(player_id_name, on='PlayerId', how='left')
    return with_names.drop(columns='PlayerId')

def plot(df):
    """Show three diagnostic figures for a predictions frame.

    ``df`` must provide the columns ``'PlayerName'``, ``'Actual Salary'``,
    ``'Predicted Salary'`` and ``'Residuals'``. The figures are displayed in
    this order: residuals vs predicted salary, predicted vs actual salary
    (with a dashed identity line), histogram of residuals.
    """
    # Residuals against the predicted salary
    fig_residuals = px.scatter(
        df,
        x='Predicted Salary',
        y='Residuals',
        hover_data=['PlayerName'],
    )
    fig_residuals.update_layout(
        title='Residuals vs Predicted Salary',
        xaxis_title='Predicted Salary',
        yaxis_title='Predicted - Actual',
    )

    # Predicted against actual salary, plus the y = x reference line
    salary_range = [df['Actual Salary'].min(), df['Actual Salary'].max()]
    fig_pred_vs_actual = px.scatter(
        df,
        x='Actual Salary',
        y='Predicted Salary',
        hover_data=['PlayerName'],
    )
    perfect_prediction = go.Scatter(
        x=salary_range,
        y=salary_range,
        mode='lines',
        line=dict(dash='dash'),
        name='Perfect Prediction',
    )
    fig_pred_vs_actual.add_trace(perfect_prediction)
    fig_pred_vs_actual.update_layout(
        title='Predicted vs Actual Salary',
        xaxis_title='Actual Salary',
        yaxis_title='Predicted Salary',
    )

    # Distribution of the residuals
    fig_hist_residuals = px.histogram(df, x='Residuals')
    fig_hist_residuals.update_layout(
        title='Histogram of Residuals',
        xaxis_title='Residuals',
        yaxis_title='Count',
    )

    for figure in (fig_residuals, fig_pred_vs_actual, fig_hist_residuals):
        figure.show()

def compute_metrics(y, y_pred):
    """Score predictions against the observed values.

    Parameters
    ----------
    y : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        (square root of the mean squared error) and R-squared.
    """
    mae = metrics.mean_absolute_error(y, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y, y_pred))
    r2 = metrics.r2_score(y, y_pred)

    return mae, rmse, r2

def get_metrics(name_model, predictions_df, y_train, y_train_pred):
    """Build a one-row summary of test and train metrics for a model.

    Parameters
    ----------
    name_model : str
        Label stored in the ``'Model'`` column.
    predictions_df : pd.DataFrame
        Test predictions; the ``'Actual Salary'`` and ``'Predicted Salary'``
        columns are scored.
    y_train : array-like
        Observed training targets.
    y_train_pred : array-like
        Predictions for the training set.

    Returns
    -------
    pd.DataFrame
        A single row with the model name followed by MAE, RMSE and R-squared
        for the test set and then for the training set.
    """
    mae_test, rmse_test, r2_test = compute_metrics(
        predictions_df['Actual Salary'],
        predictions_df['Predicted Salary'],
    )
    mae_train, rmse_train, r2_train = compute_metrics(y_train, y_train_pred)

    summary_row = {
        'Model': name_model,
        'Test - MAE': mae_test,
        'Test - RMSE': rmse_test,
        'Test - R-squared': r2_test,
        'Train - MAE': mae_train,
        'Train - RMSE': rmse_train,
        'Train - R-squared': r2_train,
    }

    return pd.DataFrame([summary_row])

def train_test_score_model(model_name, model, scaler, player_names, x_train, y_train, x_test, y_test):
    """Fit a model, predict the test set and collect predictions and metrics.

    Parameters
    ----------
    model_name : str
        Label written to the ``'Model'`` column of both outputs.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    scaler : transformer
        Already-fitted object exposing ``transform(X)``. It is applied to
        ``x_test`` only: ``x_train`` is passed to the model as given, so the
        caller is expected to supply it already scaled.
    player_names : pd.DataFrame
        Frame whose ``'PlayerName'`` column lists the test players in the
        same order as ``x_test`` / ``y_test``.
    x_train, y_train : training features and targets.
    x_test, y_test : test features (unscaled) and targets.

    Returns
    -------
    tuple
        ``(model, predictions_df, computed_metrics)`` where ``predictions_df``
        has one row per test sample and ``computed_metrics`` is the one-row
        frame returned by ``get_metrics``.

    Notes
    -----
    Negative predictions are clipped to 0. Test predictions are additionally
    truncated to integers before being stored and scored; train predictions
    are scored as clipped floats. ``'Percentage Error'`` is the residual
    divided by ``y_test``; when ``y_test`` is a pandas Series that column is
    a Series too, so ``predictions_df`` inherits the index of ``y_test``.
    """
    model.fit(x_train, y_train)

    # Scale the test features with the already-fitted scaler, then predict;
    # salaries cannot be negative, so clip at 0 and keep whole units
    test_pred = np.maximum(model.predict(scaler.transform(x_test)), 0).astype(int)
    residuals = test_pred - y_test

    predictions_df = pd.DataFrame({
        'PlayerName': player_names['PlayerName'].to_list(),
        'Model': [model_name] * len(test_pred),
        'Actual Salary': list(y_test),
        'Predicted Salary': list(test_pred),
        'Residuals': list(residuals),
        'Percentage Error': (list(residuals) / y_test) * 100,
    })

    # Training-set predictions are only used for the train metrics
    train_pred = np.maximum(model.predict(x_train), 0)
    computed_metrics = get_metrics(model_name, predictions_df, y_train, train_pred)

    return model, predictions_df, computed_metrics

## Load dataset

In [61]:
# Lookup table used to turn PlayerId values into PlayerName values
player_name_df = pd.read_csv('silver/PlayerName.csv')

# Modelling table: the features plus the 'Wage' target
dataset = pd.read_csv('gold/dataset.csv')

## Training, Validation, Testing split

In [62]:
# 'Wage' is the regression target; every other column stays in the feature matrix for now
col_to_drop = ['Wage']
y = dataset['Wage']
X = dataset.drop(columns=col_to_drop)

In [63]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Process training and testing set

In [64]:
# Remember which players ended up in the test set so predictions can be labelled by name
test_player_ids = x_test['PlayerId']
player_names = get_player_name(test_player_ids, player_name_df)

# PlayerId is an identifier, not a model input
x_train = x_train.drop(columns='PlayerId')
x_test = x_test.drop(columns='PlayerId')

In [65]:
# encode categorical columns
x_train, nationality_encoder = compute_numerical_feature_from_categorical(x_train, 'Value', 'NationalityId', 'Nationality')
x_train, position_encoder = compute_numerical_feature_from_categorical(x_train, 'Value', 'PositionId', 'Position')
x_train, foot_encoder = compute_numerical_feature_from_categorical(x_train, 'Value', 'FootId', 'Foot')
x_train, team_encoder = compute_numerical_feature_from_categorical(x_train, 'Value', 'TeamId', 'Team')
x_train, league_encoder = compute_numerical_feature_from_categorical(x_train, 'Value', 'LeagueId', 'League')
x_train = x_train.drop('Value', axis=1)

x_test = apply_encoder(x_test, nationality_encoder, 'NationalityId', 'Nationality')
x_test = apply_encoder(x_test, position_encoder, 'PositionId', 'Position')
x_test = apply_encoder(x_test, foot_encoder, 'FootId', 'Foot')
x_test = apply_encoder(x_test, team_encoder, 'TeamId', 'Team')
x_test = apply_encoder(x_test, league_encoder, 'LeagueId', 'League')
x_test = x_test.drop('Value', axis=1)


scaler_type = 'MinMaxScaler'
if scaler_type == 'MinMaxScaler': scaler = MinMaxScaler(feature_range=(0, 1))
else: scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(x_train)

# Transform the training data
x_train_scaled = scaler.transform(x_train)

## Train - Test - Score Models

In [66]:
# Estimators to compare, keyed by the label used in the result tables
models = {
    'Linear Regressor': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=300, max_depth=30, random_state=42),
    'LightGBM': ltb.LGBMRegressor(
        colsample_bytree=0.7,
        learning_rate=0.1,
        max_depth=10,
        min_child_samples=20,
        n_estimators=300,
        num_leaves=31,
        subsample=0.8,
        random_state=42,
        verbose=-1,
    ),
}

# Other candidates noted for later: Ridge CV, XGBoost, and estimators with a Huber loss
# (e.g. SGDRegressor with loss='huber'), which would also be robust to outliers.

# Per-model result frames, concatenated once after the loop
prediction_frames = []
metric_frames = []

for name, model in models.items():
    trained_model, pred_df, metric_df = train_test_score_model(
        name, model, scaler, player_names, x_train_scaled, y_train, x_test, y_test
    )
    models[name] = trained_model
    prediction_frames.append(pred_df)
    metric_frames.append(metric_df)

prediction_df = pd.concat(prediction_frames)
computed_metrics = pd.concat(metric_frames)

In [67]:
# Display the test and train metrics of every fitted model, one row per model
computed_metrics

Unnamed: 0,Model,Test - MAE,Test - RMSE,Test - R-squared,Train - MAE,Train - RMSE,Train - R-squared
0,Linear Regressor,8309.682631,15811.861751,0.648991,7913.127288,15315.278583,0.650173
0,Random Forest Regressor,4330.353768,8953.440944,0.887453,1586.140931,3270.319695,0.984049
0,LightGBM,3770.487867,7813.327188,0.914291,1506.505914,2409.858582,0.991339


In [68]:
# Diagnostic figures for the LightGBM predictions only
plot(prediction_df.loc[prediction_df['Model'] == 'LightGBM'])

## Feature selection

In [69]:
# Get feature importances
importances = models['Random Forest Regressor'].feature_importances_

# Match feature names (assuming you have a list of names in feature_names)
feature_names = list(x_train.columns)  # replace with your actual feature names

features = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)

features_importance = pd.DataFrame(features, columns=['Feature', 'Importance'])

In [70]:
# Sort features by importance
sorted_indices = np.argsort(importances)[::-1]
sorted_feature_names = [feature_names[idx] for idx in sorted_indices]
sorted_importances = importances[sorted_indices]

# Create Plotly bar chart
fig = go.Figure([go.Bar(x=sorted_feature_names, y=sorted_importances)])
fig.update_layout(
    title='Feature Importances from Random Forest Regressor',
    xaxis_title='Features',
    yaxis_title='Importance',
    xaxis_tickangle=-45
)
fig.show()

## Process prediction_df

In [71]:
# Tables used to enrich the predictions with readable labels.
# Paths use forward slashes: the backslash form only resolves on Windows and
# relies on invalid escape sequences (e.g. '\P', '\T', '\L') that Python warns about.

# Relationship tables (joined on PlayerId / TeamId further down)
player_df = pd.read_csv('silver/Player.csv')
player_team_df = pd.read_csv('silver/PlayerTeam.csv')
team_league_df = pd.read_csv('silver/TeamLeague.csv')

# Label tables (joined on TeamId / LeagueId / NationalityId / PositionId further down)
team_df = pd.read_csv('silver/Team.csv')
league_df = pd.read_csv('silver/League.csv')
nationality_df = pd.read_csv('silver/Nationality.csv')
position_df = pd.read_csv('silver/Position.csv')

In [72]:
# Preview the stacked predictions (all models) before enrichment
prediction_df.head()

Unnamed: 0,PlayerName,Model,Actual Salary,Predicted Salary,Residuals,Percentage Error
1142,Jacopo Fazzini,Linear Regressor,3000,17145,14145,471.5
4567,Nathan Tella,Linear Regressor,36000,48241,12241,34.002778
2592,Janik Haberer,Linear Regressor,32000,31984,-16,-0.05
1677,Cristiano Piccini,Linear Regressor,2000,15837,13837,691.85
4958,Koen Van Langendonck,Linear Regressor,2000,573,-1427,-71.35


In [73]:
# Keep the LightGBM rows and only the columns produced by the scoring step
prediction_columns = ['PlayerName', 'Model', 'Actual Salary',
                      'Predicted Salary', 'Residuals', 'Percentage Error']
prediction_df = prediction_df.loc[prediction_df.Model == 'LightGBM', prediction_columns]

# Attach the player ids directly instead of recovering them through a join on
# 'PlayerName': names are not a key, so namesakes would duplicate rows or pick
# up the wrong id. Each model's predictions were built in the order of the test
# set, which is also the order of `test_player_ids`, so the ids are attached
# by position.
assert len(prediction_df) == len(test_player_ids), \
    'LightGBM predictions and test_player_ids must describe the same test rows'
prediction_df = prediction_df.assign(PlayerId=test_player_ids.to_numpy())

# Resolve the ids of nationality, position, team and league, then their labels
prediction_df = (
    prediction_df
    .merge(player_df, on='PlayerId', how='left')
    .merge(player_team_df, on='PlayerId', how='left')
    .merge(team_league_df, on='TeamId', how='left')
    .merge(nationality_df, on='NationalityId', how='left')
    .merge(position_df, on='PositionId', how='left')
    .merge(league_df, on='LeagueId', how='left')
    .merge(team_df, on='TeamId', how='left')
)

# Percentage Error = (predicted - actual) / actual * 100, so a prediction more
# than 10% above the actual wage means the player is paid less than the model
# expects ('Underpaid'); more than 10% below means 'Overpaid'. Everything else,
# including a missing percentage, is 'Fair'.
percentage_error = prediction_df['Percentage Error']
salary_assessment = np.select(
    [percentage_error > 10, percentage_error < -10],
    ['Underpaid', 'Overpaid'],
    default='Fair',
)

# `.assign` returns a new frame, which avoids writing into a slice of the merged frame
prediction_df = prediction_df[['PlayerName', 'PlayerId', 'Model', 'Actual Salary',
                               'Predicted Salary', 'Residuals', 'Percentage Error',
                               'Nationality', 'Position', 'Team', 'League']] \
        .assign(**{'Salary Assessment': salary_assessment})


prediction_df.head()

Unnamed: 0,PlayerName,PlayerId,Model,Actual Salary,Predicted Salary,Residuals,Percentage Error,Nationality,Position,Team,League,Salary Assessment
0,Jacopo Fazzini,3201,LightGBM,3000,10644,7644,254.8,Italy,Attacking Midfield,Empoli,Serie A,Underpaid
1,Nathan Tella,3551,LightGBM,36000,32087,-3913,-10.869444,Nigeria,Center Back,Bayer 04 Leverkusen,Bundesliga,Overpaid
2,Janik Haberer,3687,LightGBM,32000,25542,-6458,-20.18125,Germany,Attacking Midfield,FC Union Berlin,Bundesliga,Overpaid
3,Cristiano Piccini,6295,LightGBM,2000,5932,3932,196.6,Italy,Center Back,Sampdoria,Serie B,Underpaid
4,Koen Van Langendonck,4471,LightGBM,2000,2674,674,33.7,Belgium,Goal Keeper,Westerlo,Pro League,Underpaid


## Save datasets

In [74]:
# Persist the enriched predictions and the feature ranking to the gold layer.
# Forward slashes keep the paths valid on every OS; the backslash form only
# resolves on Windows and relies on invalid escape sequences ('\P', '\F').
prediction_df.to_csv('gold/Prediction.csv')
features_importance.to_csv('gold/FeatureImportance.csv')