# Soccer master

## Model training

This notebook covers the model-training part of the project. It requires the data file that is generated by running `soccer_master_eda.ipynb`.

In [1]:
import os
from abc import ABC, abstractmethod

import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import shuffle

# Pre-processed data loading

In [2]:
# Location of the pre-processed CSV produced by soccer_master_eda.ipynb.
# The SOCCER_MASTER_DATA environment variable overrides it so the notebook can
# run on other machines; the fallback is the path used on the original setup.
TRANSFER_DATA_PATH = os.environ.get(
    "SOCCER_MASTER_DATA",
    "C:\\Users\\devdp\\OneDrive\\Documents\\Github\\UMBC\\soccer_master\\data\\transfers_data.csv",
)

# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index. It is kept here because dropping it changes the feature count
# used downstream — confirm whether it should be a model input.
transfer_data = pd.read_csv(TRANSFER_DATA_PATH)
transfer_data.head()

Unnamed: 0,player_id,from_club_id,to_club_id,transfer_fee,market_value_in_eur,season,home_games,playing_formation,appearances,goals_per_game,...,win_percentage,draw_percentage,home_win_percentage,away_win_percentage,position,sub_position,foot,avg_market_value_of_last_seasons,age_at_transfer,transfer_diff
0,3333,10161,399,0.0,0.0,2002,35.0,4-2-3-1,68.0,0.0,...,0.208333,0.263889,0.316993,0.088889,Goalkeeper,Goalkeeper,left,1500000.0,16.0,0.0
1,3333,399,352,0.0,0.0,2003,32.0,5-3-2,62.0,0.086207,...,0.371473,0.356322,0.464706,0.272321,Defender,Left-Back,left,800000.0,17.0,0.0
2,3333,352,399,0.0,0.0,2003,15.0,4-1-4-1,29.0,0.137931,...,0.62069,0.241379,0.866667,0.357143,Attack,Centre-Forward,right,55000000.0,17.0,0.0
3,3333,399,762,7400000.0,7400000.0,2004,255.0,4-3-3 Attacking,501.0,0.060938,...,0.679278,0.161885,0.767983,0.580309,Midfield,Central Midfield,right,29666670.0,18.0,0.0
4,3333,762,405,0.0,5500000.0,2005,12.0,3-4-1-2,28.0,0.0,...,0.035714,0.25,0.0,0.0625,Defender,Left-Back,left,1000000.0,19.0,-5500000.0


In [3]:
def process_formation_data(dataframe, training_year_threshold):
    """
    Standardize the ``playing_formation`` column.

    Formations that occur fewer than 10 times among the training rows
    (``season < training_year_threshold``) are replaced by ``'Other'``.
    Missing formations are replaced by ``'Unknown'``. Frequencies are counted
    on training rows only, so later seasons do not influence which formations
    are kept.

    Args:
        dataframe: Transfer data DataFrame with ``season`` and
            ``playing_formation`` columns.
        training_year_threshold: Year to split training data

    Returns:
        A copy of ``dataframe`` with processed formation data; the input is
        not modified.
    """
    processed_df = dataframe.copy()
    formations = processed_df['playing_formation']

    # Identify common formations (10+ occurrences) in training data
    is_training_row = processed_df['season'] < training_year_threshold
    formation_frequency = formations[is_training_row].value_counts()
    frequent_formations = formation_frequency[formation_frequency >= 10].index

    # Missing values must be separated from rare ones *before* relabelling:
    # NaN is never in `frequent_formations`, so a plain `~isin(...)` mask would
    # turn every missing formation into 'Other' and leave nothing for 'Unknown'.
    is_missing = formations.isna()
    is_rare = ~is_missing & ~formations.isin(frequent_formations)

    processed_df.loc[is_rare, 'playing_formation'] = 'Other'
    processed_df.loc[is_missing, 'playing_formation'] = 'Unknown'

    return processed_df

def prepare_model_data(dataframe, training_year_threshold=2023, excluded_features=None):
    """
    Prepare data for model training by splitting and identifying feature types.

    Rows with ``season < training_year_threshold`` form the training set and
    rows with ``season >= training_year_threshold`` form the test set. The
    target is the ``transfer_fee`` column.

    Args:
        dataframe: Transfer data DataFrame
        training_year_threshold: Year to split training/test data (default: 2023)
        excluded_features: Optional iterable of column names to leave out of
            the feature matrices in addition to the target. ``None`` (default)
            keeps every non-target column, as before.

    Returns:
        X_train, X_test, y_train, y_test, categorical_features, numerical_features

    Note:
        Every non-categorical column that is not excluded is treated as a
        numerical feature. NOTE(review): with the default this includes
        identifier-like columns (``Unnamed: 0``, ``player_id``, club ids) and
        ``transfer_diff``, which in the preview rows looks derived from the
        target — confirm these are legitimate inputs and pass them in
        ``excluded_features`` if not. Excluding columns changes the width of
        the transformed feature matrix, so the network input size must be
        adjusted to match.
    """
    processed_df = process_formation_data(dataframe, training_year_threshold)

    excluded = list(excluded_features) if excluded_features is not None else []
    columns_to_drop = ['transfer_fee'] + [col for col in excluded if col != 'transfer_fee']

    # Split data by year
    train_data = processed_df[processed_df['season'] < training_year_threshold].copy()
    test_data = processed_df[processed_df['season'] >= training_year_threshold].copy()

    # Separate features and target
    X_train = train_data.drop(columns_to_drop, axis=1)
    y_train = np.array(train_data['transfer_fee'])
    X_test = test_data.drop(columns_to_drop, axis=1)
    y_test = np.array(test_data['transfer_fee'])

    # Identify feature types (excluded columns are removed from both lists)
    all_categorical = ['position', 'sub_position', 'foot', 'playing_formation']
    categorical_features = [col for col in all_categorical if col not in excluded]
    numerical_features = [col for col in X_train.columns if col not in categorical_features]

    return X_train, X_test, y_train, y_test, categorical_features, numerical_features

In [4]:
def transform_features(features_df, categorical_columns, numerical_columns, existing_transformer=None):
    """
    Transform features using preprocessing pipeline for model training/inference.

    Numerical columns are median-imputed and standardized; categorical columns
    are imputed with the constant ``'missing'`` and one-hot encoded (first
    level dropped, categories unseen at fit time ignored).

    Args:
        features_df: Feature DataFrame
        categorical_columns: List of categorical feature names
        numerical_columns: List of numerical feature names
        existing_transformer: Optional fitted transformer for inference. When
            given, it is applied as-is and the column lists are not used.

    Returns:
        transformed_features: Preprocessed feature matrix
        transformer: Fitted transformer pipeline (``existing_transformer``
            itself when one was supplied)
    """
    # Inference path: re-use the fitted transformer without building a
    # throwaway pipeline that would never be fitted or used.
    if existing_transformer is not None:
        return existing_transformer.transform(features_df), existing_transformer

    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])

    feature_preprocessor = ColumnTransformer([
        ('numerical', numerical_pipeline, numerical_columns),
        ('categorical', categorical_pipeline, categorical_columns)
    ])

    preprocessing_pipeline = Pipeline([
        ('preprocessor', feature_preprocessor),
    ])

    transformed_features = preprocessing_pipeline.fit_transform(features_df)
    return transformed_features, preprocessing_pipeline

### Data Preparation Pipeline

1. Split data into training/test sets (Using transfers before 2021 as training and rest as test data)
2. Transform features using preprocessing pipeline
3. Ensure consistent preprocessing across train/test data

In [5]:
print("\nInitiating data preparation pipeline...")

# Chronological split: seasons before 2021 are used for training, the rest for testing
(
    X_train,
    X_test,
    y_train,
    y_test,
    categorical_features,
    numerical_features,
) = prepare_model_data(transfer_data, training_year_threshold=2021)

print("\nData preparation pipeline initiated successfully!")
print(f"✓ Data split complete: {len(X_train)} training samples, {len(X_test)} test samples")

# Fit the feature preprocessing on the training split only
X_train_transformed, feature_transformer = transform_features(
    X_train, categorical_features, numerical_features
)
print(f"✓ Training data transformed: {X_train_transformed.shape[1]} features generated")

# Apply the already-fitted transformer to the test split
X_test_transformed, _ = transform_features(
    X_test, categorical_features, numerical_features, existing_transformer=feature_transformer
)

# Standardize the target with statistics from the training targets only;
# StandardScaler expects 2D input, hence the reshape to a single column.
target_scaler = StandardScaler().fit(y_train.reshape(-1, 1))
y_train_transformed = target_scaler.transform(y_train.reshape(-1, 1))
y_test_transformed = target_scaler.transform(y_test.reshape(-1, 1))

print(f"✓ Test data transformed: {X_test_transformed.shape[1]} features generated")

print("\nData preparation completed successfully!")


Initiating data preparation pipeline...

Data preparation pipeline initiated successfully!
✓ Data split complete: 46502 training samples, 15067 test samples
✓ Training data transformed: 66 features generated
✓ Test data transformed: 66 features generated

Data preparation completed successfully!


# Defining neural network

## Defining **Layer**

In [6]:
class Layer(ABC):
    """Abstract interface shared by the building blocks of the NN model."""

    def __init__(self):
        super().__init__()

    @abstractmethod
    def forward(self, x):
        """Compute the layer output for input ``x``; subclasses must override."""
        raise NotImplementedError()

    @abstractmethod
    def backward(self, dout):
        """Propagate the upstream gradient ``dout``; subclasses must override."""
        raise NotImplementedError()



## Defining **Linear** layer

In [7]:
class Linear(Layer):
    """Fully connected layer computing ``x · W + b``."""

    def __init__(self, input_dim, output_dim):
        # Weights are standard-normal draws scaled by 1/sqrt(input_dim);
        # the bias is an unscaled standard-normal draw. W is drawn before b.
        weight = np.random.randn(input_dim, output_dim) / np.sqrt(input_dim)
        bias = np.random.randn(output_dim)

        self.params = {'W': weight, 'b': bias}
        self.grads = {}  # filled by backward()

    def forward(self, x):
        """Return ``x · W + b`` and remember ``x`` for the backward pass."""
        self.x = x
        projected = np.dot(x, self.params['W'])
        return projected + self.params['b']

    def backward(self, dout):
        """Store gradients for W and b and return the gradient w.r.t. the input."""
        grad_weight = np.dot(self.x.T, dout)
        grad_bias = np.sum(dout, axis=0)
        self.grads['W'] = grad_weight
        self.grads['b'] = grad_bias

        return np.dot(dout, self.params['W'].T)

## Defining **ReLU** activation function

In [8]:
class ReLU(Layer):
    """Rectified linear unit: keeps positive inputs and zeroes the rest."""

    def __init__(self):
        # No trainable parameters
        self.params = None

    def forward(self, x):
        """Return a new array equal to ``x`` with entries <= 0 replaced by 0."""
        # The mask is kept so backward() can block the same positions
        self.mask = x <= 0
        return np.where(self.mask, 0, x)

    def backward(self, dout):
        """Return a new gradient array, zeroed where the forward input was <= 0."""
        return np.where(self.mask, 0, dout)

## Defining Output layer with linear activation and Mean squared error loss

In [9]:
class LinearWithMSE(Layer):
    """Output layer with identity (linear) activation and mean squared error loss."""

    def __init__(self):
        # No trainable parameters
        self.params = None

    def forward(self, out, y):
        '''
        Compute the MSE loss and pass the predictions through unchanged.

            out: predicted values
            y: true labels (expected to have the same shape as ``out``)

        Returns a tuple ``(loss, out)`` where ``loss`` is the mean of the
        squared residuals over all elements.
        '''
        self.out = out
        self.y = y

        loss = np.mean((out - y) ** 2)
        return loss, out

    def backward(self, dout):
        '''
        Return the gradient of the loss with respect to ``out``.

        ``dout`` is accepted for interface compatibility and ignored: this is
        the terminal layer, so there is no upstream gradient to combine.
        '''
        residual = self.out - self.y
        # The loss is a mean over *all* residual elements, so the gradient is
        # normalized by that element count. For single-column outputs this
        # equals the batch size.
        return (2 * residual) / residual.size

## Defining **Neural Network**

In [10]:
class FNN(Layer):
    """
    Feed-forward network: Linear/ReLU hidden blocks, a single-unit Linear
    output, and an MSE loss layer stored as the last entry of ``self.layers``.

    Args:
        input_dim: Number of input features (default 66).
        hidden_dims: Sizes of the hidden Linear layers, in order
            (default ``(40, 20)``).
    """

    def __init__(self, input_dim=66, hidden_dims=(40, 20)):
        self.input_dim = input_dim
        self.layers = []

        previous_dim = input_dim
        for hidden_dim in hidden_dims:
            self.layers.append(Linear(previous_dim, hidden_dim))
            self.layers.append(ReLU())
            previous_dim = hidden_dim

        self.layers.append(Linear(previous_dim, 1))
        self.layers.append(LinearWithMSE())

    def _run_network(self, x):
        """Run ``x`` through every layer except the loss layer."""
        batch_size = x.shape[0]
        x = x.reshape(batch_size, self.input_dim)

        for layer in self.layers[:-1]:
            x = layer.forward(x)

        return x

    def forward(self, x, y):
        """Return ``(loss, predictions)`` for inputs ``x`` and targets ``y``."""
        return self.layers[-1].forward(self._run_network(x), y)

    def predict(self, x):
        """Return the network predictions for ``x`` without computing a loss."""
        return self._run_network(x)

    def backward(self):
        """Back-propagate from the loss layer through all preceding layers."""
        dout = self.layers[-1].backward(1)
        # The loss layer has already been handled above, so it is skipped here;
        # walking the full reversed list would invoke its backward() twice.
        for layer in reversed(self.layers[:-1]):
            dout = layer.backward(dout)

## Defining base **Optimizer** class

In [11]:
class Optimizer(ABC):
    """The base class for optimizer.

    Args:
        learning_rate: Step size used by the update rule.
        layers: The model layers whose parameters are to be updated.
    """
    def __init__(self, learning_rate, layers):
        super().__init__()
        # Keep the arguments instead of discarding them, so subclasses that
        # delegate to this constructor get usable state.
        self.learning_rate = learning_rate
        self.layers = layers

    @abstractmethod
    def update(self):
        """Apply one parameter update; subclasses must override."""
        raise NotImplementedError

## Defining **Stochastic Gradient Descent** optimizer 

In [12]:
class SGD(Optimizer):
    """Plain stochastic gradient descent: ``param -= learning_rate * grad``."""

    def __init__(self, learning_rate, layers):
        self.layers = layers
        self.learning_rate = learning_rate

    def update(self):
        """Update, in place, the parameters of every layer that has any."""
        # Layers without a `params` attribute, or with params set to None,
        # have nothing to train and are skipped.
        trainable_layers = (
            layer for layer in self.layers
            if getattr(layer, 'params', None) is not None
        )
        for layer in trainable_layers:
            for name in layer.params:
                layer.params[name] -= self.learning_rate * layer.grads[name]


In [13]:
# Mini-batch generator that feeds the data to the model in batches

def data_generator(data, label, batch_size):
    """
    Generates batches of data and labels.

    Batches are consecutive slices of ``batch_size`` samples; when the number
    of samples is not a multiple of ``batch_size`` the final batch is smaller.
    Empty input yields nothing.

    Args:
        data: The input data (must support ``len()`` and slicing).
        label: The corresponding labels (must support slicing).
        batch_size: The size of each batch; must be positive.

    Yields:
        A tuple of (data_batch, label_batch).

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Stepping by batch_size covers the trailing partial batch as well,
    # since slicing past the end is clamped.
    for start in range(0, len(data), batch_size):
        end = start + batch_size
        yield data[start:end], label[start:end]


In [14]:
def train(model, optimizer, train_data, train_label, target_scaler, epochs, batch_size):
    """
    Trains a neural network model with mini-batch updates.

    Each epoch reshuffles the training set, runs a forward pass, a backward
    pass and an optimizer update per batch, then prints the sample-weighted
    mean training loss for the epoch. Nothing is returned.

    Args:
        model: The neural network model.
        optimizer: The optimizer used for training.
        train_data: The training data.
        train_label: The training labels.
        target_scaler: Not used inside this function.
        epochs: The number of training epochs.
        batch_size: The batch size for training.
    """
    for epoch in range(epochs):
        # New sample order every epoch; data and labels are shuffled together
        train_data, train_label = shuffle(train_data, train_label)

        running_loss = 0
        batches = data_generator(train_data, train_label, batch_size)
        for batch_inputs, batch_targets in batches:
            batch_loss, _ = model.forward(batch_inputs, batch_targets)
            # Weight by batch length so a smaller final batch is not over-counted
            running_loss += batch_loss * len(batch_inputs)

            model.backward()
            optimizer.update()

        mean_loss = running_loss / len(train_data)
        print(f'Epoch: {epoch + 1}/{epochs}, Train Loss: {mean_loss:.6f}')

In [15]:
def test(model, test_data, test_label, target_scaler, batch_size=256):
    """
    Tests the neural network model on the test dataset.

    The loss is reported on the scaled targets; MAE, RMSE and R2 are computed
    after mapping predictions and labels back through
    ``target_scaler.inverse_transform``.

    Args:
        model: The neural network model.
        test_data: The test data.
        test_label: The test labels.
        target_scaler: The scaler used to scale the target variable.
        batch_size: The batch size for testing.

    Returns:
        Tuple ``(mae, rmse, r2)``.

    Raises:
        ValueError: If ``test_data`` is empty.
    """
    num_samples = len(test_data)
    if num_samples == 0:
        raise ValueError("test_data is empty; nothing to evaluate")

    batch_predictions = []
    batch_actuals = []
    total_loss = 0  # Sample-weighted sum of batch losses

    for start in range(0, num_samples, batch_size):
        end = min(start + batch_size, num_samples)

        data_batch = test_data[start:end]
        label_batch = test_label[start:end]

        # A single forward pass yields both the loss and the predictions, so
        # the batch is not pushed through the network a second time.
        loss, output = model.forward(data_batch, label_batch)
        total_loss += loss * len(data_batch)

        # Map predictions and labels back to the original target scale
        batch_predictions.append(target_scaler.inverse_transform(output.reshape(-1, 1)).flatten())
        batch_actuals.append(target_scaler.inverse_transform(label_batch.reshape(-1, 1)).flatten())

    predictions = np.concatenate(batch_predictions)
    actuals = np.concatenate(batch_actuals)

    # Calculate Metrics
    mae = metrics.mean_absolute_error(actuals, predictions)
    mse = metrics.mean_squared_error(actuals, predictions)
    r2 = metrics.r2_score(actuals, predictions)
    rmse = np.sqrt(mse)

    print(f"Test Loss (MSE): {total_loss / num_samples:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"R-squared (R2): {r2:.4f}")

    return mae, rmse, r2

In [16]:
# Seed NumPy's global random state so that weight initialisation
# (np.random.randn in Linear) and the per-epoch shuffling are repeatable on
# "Restart & Run All". Seeding changes the exact numbers relative to earlier
# unseeded runs.
SEED = 42
np.random.seed(SEED)

model_fnn = FNN()
learning_rate = 0.1
epochs = 20
batch_size = 256
optimizer = SGD(learning_rate, model_fnn.layers)

train(model_fnn, optimizer, X_train_transformed, np.array(y_train_transformed).reshape(-1,1), target_scaler, epochs, batch_size)


Epoch: 1/20, Train Loss: 0.569629
Epoch: 2/20, Train Loss: 0.004092
Epoch: 3/20, Train Loss: 0.002410
Epoch: 4/20, Train Loss: 0.001686
Epoch: 5/20, Train Loss: 0.001385
Epoch: 6/20, Train Loss: 0.000997
Epoch: 7/20, Train Loss: 0.000882
Epoch: 8/20, Train Loss: 0.000886
Epoch: 9/20, Train Loss: 0.000710
Epoch: 10/20, Train Loss: 0.000607
Epoch: 11/20, Train Loss: 0.000572
Epoch: 12/20, Train Loss: 0.000510
Epoch: 13/20, Train Loss: 0.000427
Epoch: 14/20, Train Loss: 0.000417
Epoch: 15/20, Train Loss: 0.000385
Epoch: 16/20, Train Loss: 0.000313
Epoch: 17/20, Train Loss: 0.000308
Epoch: 18/20, Train Loss: 0.000267
Epoch: 19/20, Train Loss: 0.000294
Epoch: 20/20, Train Loss: 0.000255


In [17]:
# Evaluate on the held-out split; labels are passed as a single column
mae, rmse, r2 = test(
    model_fnn,
    X_test_transformed,
    np.array(y_test_transformed).reshape(-1, 1),
    target_scaler,
)

Test Loss (MSE): 0.0004
Mean Absolute Error (MAE): 33600.3867
Root Mean Squared Error (RMSE): 86187.7947
R-squared (R2): 0.9998


In [18]:
def format_currency(number):
    """
    Render a number with no decimals and comma thousands separators
    (e.g., 35,000,000).
    """
    return f"{number:,.0f}"

In [19]:
def predict_player_transfer(player_id, data, model, transformer, categorical_cols, numerical_cols, scaler=None):
    """
    Predict transfer fee for a specific player

    The first row of ``data`` whose ``player_id`` matches is used. Its
    features are passed through the fitted ``transformer``, the model
    prediction is mapped back to the original fee scale, and a short
    comparison with the recorded fee is printed.

    Args:
        player_id: Identifier to look up in ``data['player_id']``.
        data: DataFrame containing ``player_id``, ``transfer_fee`` and the
            feature columns the transformer was fitted on.
        model: Trained model exposing ``predict``.
        transformer: Fitted feature transformer.
        categorical_cols: List of categorical feature names.
        numerical_cols: List of numerical feature names.
        scaler: Fitted target scaler used to undo the target scaling. When
            ``None`` (default), the notebook-level ``target_scaler`` is used.

    Returns:
        Tuple ``(predicted_fee, actual_fee)``.

    Raises:
        IndexError: If no row in ``data`` matches ``player_id``.

    Note:
        ``data`` is used as given; no formation standardization is applied
        here. NOTE(review): confirm whether the same formation processing used
        for training should be applied before prediction.
    """
    if scaler is None:
        # Backward-compatible fallback to the scaler fitted in the notebook
        scaler = target_scaler

    # Get player data
    player_rows = data[data["player_id"] == player_id]
    if player_rows.empty:
        raise IndexError(f"No transfer record found for player_id={player_id}")

    # Keep the row as a one-row DataFrame so every column retains its dtype
    player_row = player_rows.iloc[[0]]
    actual_fee = player_row["transfer_fee"].iloc[0]

    # Prepare features
    features = player_row.drop(columns="transfer_fee")
    transformed_features, _ = transform_features(
        features,
        categorical_cols,
        numerical_cols,
        existing_transformer=transformer
    )

    # Predict on the scaled data
    y_pred_scaled = model.predict(transformed_features)

    # The scaler expects a single-column 2D array
    y_pred_original = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).reshape(-1)[0]

    print(f"\nTransfer Fee Prediction:")
    print(f"Predicted: {format_currency(y_pred_original)} euros")
    print(f"Actual: {format_currency(actual_fee)} euros")
    print(f"Difference: {format_currency(abs(y_pred_original - actual_fee))} euros")

    return y_pred_original, actual_fee

In [20]:
# Predict the transfer fee for one example player and compare it with the recorded fee
predicted_fee, actual_fee = predict_player_transfer(
    player_id=12589,
    data=transfer_data,
    model=model_fnn,
    transformer=feature_transformer,
    categorical_cols=categorical_features,
    numerical_cols=numerical_features,
)


Transfer Fee Prediction:
Predicted: 2,829,583 euros
Actual: 2,705,000 euros
Difference: 124,583 euros
