# Representation learning and predictive analysis for Football managers

In this chapter we are going to try to model the members of an amateur football club. This is related to one of the author's contributions to the wellbeing of a local football community. The ultimate goal is to split members into reasonable teams so that the game is as enjoyable and competitive from a sporting perspective as possible.

## The challenge

For every Sunday's game we organise a poll to see who is joining. The common agreement is that we need a minimum of 10 and a maximum of 18 players, which means the teams vary in size.

- Even though we have started to collect data for our games, we still have only around 20 games recorded. The distribution of the players also changed over the course of the recording period, so we have even less data for every single player.

## The plan


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model, Input, initializers
from datetime import date
import numpy as np
import os

    
from competition_manager import *

In [None]:
import random


def set_seed(seed=42):
    """Seed every random number generator this notebook draws from.

    Args:
        seed: Integer seed passed to Python's ``random`` module, NumPy's
            global generator and TensorFlow's global generator.
    """
    # Optional but recommended: set the flag that asks TensorFlow for deterministic ops
    os.environ.update(TF_DETERMINISTIC_OPS='1')

    # The three generators are independent of each other, so seed them in one pass.
    for seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
        seed_fn(seed)


# Seed once, up front, before any model is built or trained.
seed_value = 42
set_seed(seed_value)

## Generate synthetic teams and games

### Calculating a team's strength based on players' individual strengths and rule-based interactions

In [None]:
def calculate_team_strength(team_players):
    """Return the rule-based strength of one team, printing every contribution.

    The score starts as the sum of the members' entries in the module-level
    ``player_strengths`` array. It is then increased once for every friend or
    skilled pair/triplet whose members are all on the team:

    * friend groups add their boost value unchanged;
    * skilled groups add their boost factor multiplied by the mean individual
      strength of the group's members.

    Args:
        team_players: Player ids of a single team; must be usable as an index
            into ``player_strengths``.

    Returns:
        The accumulated team strength.
    """
    def whole_group_on_team(group):
        # A group only counts when every one of its members plays for this team.
        return all(member in team_players for member in group)

    strength = player_strengths[team_players].sum()
    print(f"Base strength of team {team_players}: {strength:.4f}")

    # Friend groups: fixed, strength-independent bonuses.
    for i, pair in enumerate(friend_pairs):
        if not whole_group_on_team(pair):
            continue
        boost = friend_pairs_boost[i]
        strength += boost
        print(f"Favorite pair boost applied for players {pair}: +{boost:.4f}, total strength now {strength:.4f}")

    for i, triplet in enumerate(friend_triplets):
        if not whole_group_on_team(triplet):
            continue
        boost = friend_triplets_boost[i]
        strength += boost
        print(f"Favorite triplet boost applied for players {triplet}: +{boost:.4f}, total strength now {strength:.4f}")

    # Skilled groups: the bonus scales with the members' average strength.
    for i, pair in enumerate(skilled_pairs):
        if not whole_group_on_team(pair):
            continue
        avg_strength = player_strengths[list(pair)].mean()
        boost = skilled_pairs_boost[i] * avg_strength
        strength += boost
        print(f"Skills pair boost for players {pair}: avg strength {avg_strength:.4f} * boost factor {skilled_pairs_boost[i]:.4f} = +{boost:.4f}, total strength now {strength:.4f}")

    for i, triplet in enumerate(skilled_triplets):
        if not whole_group_on_team(triplet):
            continue
        avg_strength = player_strengths[list(triplet)].mean()
        boost = skilled_triplets_boost[i] * avg_strength
        strength += boost
        print(f"Skills triplet boost for players {triplet}: avg strength {avg_strength:.4f} * boost factor {skilled_triplets_boost[i]:.4f} = +{boost:.4f}, total strength now {strength:.4f}")

    return strength

### Generating the teams, friend pairs/triplets and skills pairs/triplets

In [None]:
np.random.seed(42)  # reseed so the synthetic league below is reproducible

# League dimensions. Real player ids run from 1 to NUM_PLAYERS; id 0 is the
# ignore/masked id, which is why the arrays below have NUM_PLAYERS + 1 slots.
NUM_PLAYERS = 30
MIN_TEAM_SIZE = 5
MAX_TEAM_SIZE = 9
NUM_GAMES = 100

# 1. Individual strengths: one float in [0, 1) per slot (slot 0 is the masked id).
player_strengths = np.random.rand(NUM_PLAYERS + 1)


def _draw_player_groups(count, group_size):
    """Draw ``count`` groups of ``group_size`` distinct ids from 1..NUM_PLAYERS.

    Ids are distinct within a group; separate groups are drawn independently.
    """
    groups = []
    for _ in range(count):
        members = np.random.choice(np.arange(1, NUM_PLAYERS + 1), size=group_size, replace=False)
        groups.append(tuple(members))
    return groups


# 2. Friend pairs and triplets, each followed by its fixed boost values.
num_friend_pairs = 10
num_friend_triplets = 5

friend_pairs = _draw_player_groups(num_friend_pairs, 2)
friend_pairs_boost = np.random.uniform(0.05, 0.15, size=num_friend_pairs)  # small boost

friend_triplets = _draw_player_groups(num_friend_triplets, 3)
friend_triplets_boost = np.random.uniform(0.1, 0.25, size=num_friend_triplets)  # larger boost

# 3. Skilled pairs and triplets (high skill synergy) with their boost factors.
num_skilled_pairs = 8
num_skilled_triplets = 4

skilled_pairs = _draw_player_groups(num_skilled_pairs, 2)
skilled_pairs_boost = np.random.uniform(0.1, 0.2, size=num_skilled_pairs)  # moderate boost

skilled_triplets = _draw_player_groups(num_skilled_triplets, 3)
skilled_triplets_boost = np.random.uniform(0.15, 0.3, size=num_skilled_triplets)  # strong boost

# 4. Zero-filled buffers for the dataset: one row of player ids per game and team,
#    plus one label per game.
teamA_data = np.zeros((NUM_GAMES, MAX_TEAM_SIZE), dtype=int)
teamB_data = np.zeros_like(teamA_data)
labels = np.zeros(NUM_GAMES)

def drop_zeroes_for_sum(players_strengths):
    """Return only the non-zero entries of ``players_strengths``.

    Args:
        players_strengths: NumPy array of strength values.

    Returns:
        The entries that differ from zero, selected with a boolean mask.
    """
    is_nonzero = players_strengths != 0
    return players_strengths[is_nonzero]

for game_i in range(NUM_GAMES):
    # Both teams of a game share one size, drawn from [MIN_TEAM_SIZE, MAX_TEAM_SIZE]
    # (randint's upper bound is exclusive, hence the + 1).

    team_size = np.random.randint(MIN_TEAM_SIZE, MAX_TEAM_SIZE + 1)

    # Shuffle the real player ids (1..NUM_PLAYERS) and cut two consecutive,
    # non-overlapping slices, so no player can appear on both teams.
    all_players = np.random.permutation(np.arange(1, NUM_PLAYERS + 1))
    teamA_players = all_players[:team_size]
    teamB_players = all_players[team_size:2*team_size]
    
    print(f"Game # {game_i} evaluation: ")
    # Team strength = summed individual strengths plus any pair/triplet boosts
    # (calculate_team_strength also prints each contribution).
    teamA_strength = calculate_team_strength(teamA_players)
    teamB_strength = calculate_team_strength(teamB_players)
    print(f"=" * 50)
    
    # The label is a continuous margin, not a 0/1 win flag: Team A's strength minus
    # Team B's, plus Gaussian noise (scale 0.1) to simulate unpredictability.
    # Positive values favour Team A, negative values favour Team B.
    outcome = teamA_strength - teamB_strength + np.random.normal(scale=0.1)
    labels[game_i] = outcome

    # Left-align the ids in the row; the remaining slots keep the 0 the buffers
    # were initialised with, which is the masked/padding player id.
    teamA_data[game_i, :team_size] = teamA_players
    teamB_data[game_i, :team_size] = teamB_players

# Quick sanity check of the generated dataset.
print("player_strengths shape:", player_strengths.shape)
print("teamA_data shape:", teamA_data.shape)
print("teamB_data shape:", teamB_data.shape)
print("labels shape:", labels.shape)

# Show the first few games (at most three, fewer if the dataset is smaller).
for i in range(min(3, NUM_GAMES)):
    print(f"Game {i}:")
    teamA_pls = teamA_data[i]
    print(" Team A players: ", teamA_pls)
    # NOTE: padded slots (id 0) show player_strengths[0] here, which is a random
    # value and does not belong to any real player.
    print(" Team A palyers' stregths: ", player_strengths[teamA_pls])

    teamB_pls = teamB_data[i]
    print(" Team B players: ", teamB_pls)
    # Padding must be removed by player id, not by strength value:
    # player_strengths[0] is a random non-zero number, so filtering the strengths
    # for "!= 0" would still count every padded slot in the sum.
    teamB_real_players = teamB_pls[teamB_pls != 0]
    print(f"Team B strengths {player_strengths[teamB_real_players].sum()}")
    print(" Team B palyers' stregths: ", player_strengths[teamB_pls])
    # The label is the noisy strength margin (Team A minus Team B), not a 0/1 flag.
    print(" Label (Team A minus Team B strength margin, >0 favours Team A):", labels[i])


## Building a model

In [None]:

# Constants (adjust as needed)
PLAYER_EMB_DIM = 32

# Inputs: one row of player ids per team, padded with 0 up to MAX_TEAM_SIZE.
# Only the per-sample shape is declared. The batch dimension is left unspecified so
# the model is not tied to one particular batch size (training, validation and
# prediction all feed batches of different sizes).
teamA_input = Input(shape=(MAX_TEAM_SIZE,), dtype='int32', name='teamA')
teamB_input = Input(shape=(MAX_TEAM_SIZE,), dtype='int32', name='teamB')

# One embedding table shared by both teams. It has NUM_PLAYERS + 1 rows because
# id 0 is reserved for padding; mask_zero=True makes the layer emit a mask that
# marks those padded positions.
player_embedding = layers.Embedding(
    input_dim=NUM_PLAYERS + 1,
    output_dim=PLAYER_EMB_DIM,
    embeddings_initializer=initializers.GlorotUniform(seed=seed_value),
    mask_zero=True,
    name='player_embedding'
)

# Embed team players
teamA_embeds = player_embedding(teamA_input)  # shape: (batch, MAX_TEAM_SIZE, PLAYER_EMB_DIM)
teamB_embeds = player_embedding(teamB_input)
# Self-attention block: multi-head attention, residual connection, layer normalisation
def self_attention_block(x, name_prefix=''):
    """Run ``x`` through one self-attention sub-block.

    ``x`` is fed to a 4-head ``MultiHeadAttention`` layer as both of its inputs
    (``key_dim=PLAYER_EMB_DIM``, dropout 0.1). The attention output is added
    back to ``x`` and the sum is layer-normalised.

    Args:
        x: Tensor to attend over (presumably the embedded players of one team,
            judging by the call sites).
        name_prefix: Prefix for the names of the three layers created here
            (``<prefix>_attn``, ``<prefix>_residual``, ``<prefix>_norm``).

    Returns:
        The normalised output tensor.
    """
    attention = layers.MultiHeadAttention(
        num_heads=4,
        key_dim=PLAYER_EMB_DIM,
        dropout=0.1,
        name=f'{name_prefix}_attn',
    )
    attended = attention(x, x)
    summed = layers.Add(name=f'{name_prefix}_residual')([x, attended])
    return layers.LayerNormalization(name=f'{name_prefix}_norm')(summed)

# Each team's embeddings go through their own attention block
teamA_attn = self_attention_block(teamA_embeds, 'teamA')
teamB_attn = self_attention_block(teamB_embeds, 'teamB')

# Collapse each team to a single vector by averaging over the player axis.
# NOTE(review): this relies on the padding mask from the embedding still being
# attached to the attention output - confirm for the Keras version in use.
teamA_vector = layers.GlobalAveragePooling1D(name='teamA_avgpool')(teamA_attn)
teamB_vector = layers.GlobalAveragePooling1D(name='teamB_avgpool')(teamB_attn)

# Matchup features: both team vectors plus their difference
matchup_vector = layers.Subtract(name='matchup_diff')([teamA_vector, teamB_vector])
match_input = layers.Concatenate(name='match_features')(
    [teamA_vector, teamB_vector, matchup_vector]
)

# Feedforward regression head: two L2-regularised ReLU layers, each followed by
# dropout, then a single linear unit.
x = match_input
for units in (64, 32):
    x = layers.Dense(units, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
    x = layers.Dropout(0.3)(x)
output = layers.Dense(1, activation='linear', name='regression_output')(x)

# Final model: trained with mean squared error, mean absolute error reported as metric
model = Model(inputs=[teamA_input, teamB_input], outputs=output)
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)

model.summary()


## Training the model

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Inputs to this cell: teamA_data, teamB_data and labels from the dataset
# generation cell, and the compiled `model` from the model-building cell.

# 1. Hold out 20% of the games for validation. All three arrays are split in one
#    call so that every row stays aligned across them.
(X_trainA, X_valA,
 X_trainB, X_valB,
 y_train, y_val) = train_test_split(
    teamA_data,
    teamB_data,
    labels,
    test_size=0.2,
    random_state=42,
)

# 2. Early stopping: stop once val_loss has not improved for 5 epochs and
#    restore the weights of the best epoch.
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Step-decay learning rate schedule
def lr_schedule(epoch, lr):
    """Halve the learning rate every 10th epoch, leaving it unchanged otherwise.

    Args:
        epoch: Zero-based epoch index.
        lr: Learning rate currently in use.

    Returns:
        ``lr * 0.5`` when ``epoch`` is a positive multiple of 10, else ``lr``.
    """
    decay_interval = 10
    decay_factor = 0.5
    # Epoch 0 is a multiple of 10 as well, but must not trigger a decay.
    at_decay_step = epoch > 0 and epoch % decay_interval == 0
    return lr * decay_factor if at_decay_step else lr

# Step-decay callback built from lr_schedule
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_schedule)

# Adaptive reduction on plateau (halve the LR when val_loss stalls for 5 epochs)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                                                 patience=5, min_lr=1e-6)


# Snapshot the embedding table before training so the row of the padding id (0)
# can be compared afterwards.
player_embedding_layer = model.get_layer("player_embedding")
embeddings_before_fold = player_embedding_layer.get_weights()[0]
print("Embedding vector for player zero before training:", embeddings_before_fold[0])

print(X_trainA)

# 4. Train the model
history = model.fit(
    [X_trainA, X_trainB],                 # Inputs as a list
    y_train,                              # Targets
    epochs=50,
    batch_size=32,
    validation_data=([X_valA, X_valB], y_val),
    callbacks=[lr_scheduler, reduce_lr, early_stop],
)

pe_layer = model.get_layer("player_embedding")
embeddings_after_fold = pe_layer.get_weights()[0]
print("Embedding vector for player zero after training:", embeddings_after_fold[0])

# L2 distance between the padding row before and after training.
change = np.linalg.norm(embeddings_after_fold[0] - embeddings_before_fold[0])
print("Change in player zero embedding vector:", change)

# 5. Evaluate model performance. The model is compiled for regression with
#    mean_absolute_error as its only metric, so the second value returned by
#    evaluate() is the MAE - there is no accuracy to report.
loss, mae = model.evaluate([X_valA, X_valB], y_val)
print(f"Validation loss (MSE): {loss:.3f}")
print(f"Validation MAE: {mae:.3f}")

In [None]:
import matplotlib.pyplot as plt

# `history` is the object returned by model.fit(...) in the training cell.

# One panel per tracked quantity:
# (history key, training legend, validation legend, y-axis label, panel title)
curve_panels = [
    ('mean_absolute_error', 'Training MAE', 'Validation MAE',
     'mean_absolute_error', 'Training vs Validation MAE'),
    ('loss', 'Training Loss', 'Validation Loss',
     'Loss', 'Training vs Validation Loss'),
]

plt.figure(figsize=(14, 5))

for panel_no, (key, train_label, val_label, y_label, panel_title) in enumerate(curve_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.plot(history.history[key], label=train_label)
    # Keras stores the validation curve under the same key prefixed with 'val_'.
    plt.plot(history.history[f'val_{key}'], label=val_label)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.legend()

plt.tight_layout()
plt.show()

## Extracting embeddings of players

In [None]:
# Read the learned embedding table straight from the layer object.
embedding_weights = player_embedding.get_weights()
player_embeddings = embedding_weights[0]
print(player_embeddings.shape)  # (NUM_PLAYERS + 1, PLAYER_EMB_DIM): row 0 is the padding id

### UMAP and visualizing players in 3D

In [None]:
import numpy as np
import umap
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.io as pio
pio.renderers.default = 'notebook'

# Reseed NumPy's global generator before the projection.
np.random.seed(42)

# Hover text per embedding row: the player id followed by that player's base strength.
labels_points = [
    f'Player {idx}:{name}'
    for idx, name in enumerate(player_strengths)
]

# Project the embedding table down to three dimensions.
reducer = umap.UMAP(n_components=3, random_state=None, n_jobs=-1)
embeddings_3d = reducer.fit_transform(player_embeddings)

# Marker styling: colour each point by the player's base strength.
marker_style = dict(
    size=7,
    color=player_strengths,
    colorscale='Viridis',
    colorbar=dict(title='Strength'),
    opacity=0.8,
)

umap_x, umap_y, umap_z = embeddings_3d[:, 0], embeddings_3d[:, 1], embeddings_3d[:, 2]
scatter = go.Scatter3d(
    x=umap_x,
    y=umap_y,
    z=umap_z,
    mode='markers',
    marker=marker_style,
    text=labels_points,   # shown on hover
    hoverinfo='text',
)

# Interactive 3D scatter plot
fig = go.Figure(data=[scatter])

fig.update_layout(
    title="3D UMAP projection of player embeddings",
    width=1000,   # figure width in pixels
    height=800,   # figure height in pixels
    scene=dict(
        xaxis_title="UMAP-1",
        yaxis_title="UMAP-2",
        zaxis_title="UMAP-3",
    ),
)

fig

### Calculate correlation of the embeddings with the original base strengths

In [None]:
import numpy as np
from scipy.stats import pearsonr

# Correlate every UMAP axis with the ground-truth base strengths.
# Inputs from earlier cells:
#   player_strengths: 1-D array with one entry per player id (index 0 = padding id)
#   embeddings_3d:    UMAP projection with one 3-D row per player id

num_players = player_strengths.shape[0]  # NOTE: this count still includes the padding slot

# Index 0 is the ignore/masked padding id, not a real player: its strength is an
# arbitrary random draw that no game ever used. Leave it out so it cannot distort
# the correlation.
real_strengths = player_strengths[1:]
real_embeddings_3d = embeddings_3d[1:]

correlations = []
print(real_strengths.shape)
print(real_embeddings_3d[:, 0].shape)
for dim in range(3):
    corr, p_value = pearsonr(real_embeddings_3d[:, dim], real_strengths)
    correlations.append((corr, p_value))
    print(f"Dimension {dim + 1} correlation with base strengths: r = {corr:.4f}, p-value = {p_value:.4g}")

# Average absolute correlation across all 3 dimensions (the sign of a UMAP axis is arbitrary)
avg_abs_corr = np.mean([abs(c[0]) for c in correlations])
print(f"Average absolute correlation across 3 components: {avg_abs_corr:.4f}")