In [164]:
import pandas as pd
import numpy as np

import joblib
import json

In [165]:
def rearrange_players(input_dataset):
    """Move the ``home_4`` player into whichever home slot holds ``'?'``.

    For every row, each of ``home_0`` .. ``home_3`` whose value is the string
    ``'?'`` is overwritten with that row's ``home_4`` value; the ``home_4``
    column is then dropped so that four home columns remain.

    The frame is modified IN PLACE and the same object is returned, so running
    this twice on one frame raises ``KeyError`` (``home_4`` is already gone).

    Unlike a ``range(len(df))`` + ``.loc[i, ...]`` loop, the column-wise mask
    used here does not require the default 0..n-1 index, so filtered or
    re-indexed frames are handled as well.

    Parameters
    ----------
    input_dataset : pandas.DataFrame
        Must contain the columns ``home_0`` .. ``home_4``.

    Returns
    -------
    pandas.DataFrame
        ``input_dataset`` itself, without the ``home_4`` column.
    """
    home_slots = ['home_0', 'home_1', 'home_2', 'home_3']
    for slot in home_slots:
        is_placeholder = input_dataset[slot] == '?'
        if is_placeholder.any():
            # .to_numpy() makes the assignment positional, so duplicate index
            # labels cannot trigger an alignment error.
            input_dataset.loc[is_placeholder, slot] = (
                input_dataset.loc[is_placeholder, 'home_4'].to_numpy()
            )
    input_dataset.drop(columns=['home_4'], inplace=True)
    return input_dataset

In [166]:
def get_players_list_for_current_team(season, home_team_name):
    """Return the stored player list of ``home_team_name`` for ``season``.

    Seasons later than 2015 are served from the 2015 file. The JSON file is
    expected to be keyed first by the season (as a string) and then by the
    team name.

    Raises ``FileNotFoundError`` when the season file is missing and
    ``KeyError`` when the season or team key is absent from it.
    """
    # Anything after 2015 falls back to the 2015 roster file.
    effective_season = 2015 if season > 2015 else season
    season_key = str(effective_season)
    roster_path = 'models/players_list_by_model/player_list_' + season_key + '.json'

    with open(roster_path, 'r') as roster_file:
        rosters_by_season = json.load(roster_file)

    rosters_by_team = rosters_by_season[season_key]
    return rosters_by_team[home_team_name]

In [167]:
def get_player_mappings_for_current_season(season):
    """Load and return the player-mapping JSON document for ``season``.

    Seasons later than 2015 are served from the 2015 file. The parsed JSON is
    returned unchanged; other code in this notebook looks player names up in it
    with ``.get(name)``.

    Raises ``FileNotFoundError`` when the season file does not exist.
    """
    # Anything after 2015 falls back to the 2015 mapping file.
    effective_season = 2015 if season > 2015 else season
    mapping_path = 'models/player_mappings/player_mapping_' + str(effective_season) + '.json'

    with open(mapping_path, 'r') as mapping_file:
        return json.load(mapping_file)

In [168]:
def read_model_based_on_season(season):
    """Load the seven persisted artifacts that belong to ``season``.

    Seasons later than 2015 are served from the 2015 artifacts.

    Returns
    -------
    tuple
        ``(embedding_model, rf_model, rf_scalar, rf_pca,
        knn_model, knn_scalar, knn_pca)`` in exactly that order.

    NOTE(review): ``joblib.load`` unpickles the files, so the ``models/``
    directory should only contain trusted artifacts.
    """
    effective_season = 2015 if season > 2015 else season
    suffix = str(effective_season) + '.pkl'

    # Order matters: it is the order of the returned tuple.
    artifact_prefixes = (
        'models/player_embeddings/player_embeddings_',
        'models/random_forest/rf_model_',
        'models/random_forest/rf_scalar_',
        'models/random_forest/rf_pca_',
        'models/knn/knn_model_',
        'models/knn/knn_scalar_',
        'models/knn/knn_pca_',
    )
    return tuple(joblib.load(prefix + suffix) for prefix in artifact_prefixes)

In [170]:
def get_player_embeddings(embedding_model, home_team_players, away_team_players, player_mappings):
    """Look up the embedding vector of every listed home and away player.

    Home players are resolved against the first weight array of the model's
    ``'home_embedding'`` layer, away players against that of the
    ``'away_embedding'`` layer. Row ``i`` of a weight array is used as the
    vector of the player whose code in ``player_mappings`` is ``i``.
    (The original author's note describes these rows as 32-dimensional feature
    vectors; the size is not checked here.)

    Parameters
    ----------
    embedding_model
        Object exposing ``get_layer(name).get_weights()``.
    home_team_players, away_team_players : iterable
        Player identifiers to resolve; either may be empty.
    player_mappings : mapping
        Player identifier -> row index.

    Returns
    -------
    tuple of (list, list)
        Home vectors and away vectors, each in input order.

    Raises
    ------
    ValueError
        If a player's code has no row in the corresponding weight array.
    """
    home_rows = embedding_model.get_layer('home_embedding').get_weights()[0]
    away_rows = embedding_model.get_layer('away_embedding').get_weights()[0]
    home_table = dict(enumerate(home_rows))
    away_table = dict(enumerate(away_rows))

    def collect_vectors(player_names, table):
        vectors = []
        for name in player_names:
            code = player_mappings.get(name)
            if code is None:
                # FIXME: players missing from the mapping (e.g. debutants with
                # no history in the training data) silently borrow the vector
                # stored at code 0 instead of raising.
                code = 0
            vector = table.get(code)
            if vector is None:
                raise ValueError(f"No embedding found for player: {name}")
            vectors.append(vector)
        return vectors

    home_vectors = collect_vectors(home_team_players, home_table)
    away_vectors = collect_vectors(away_team_players, away_table)
    return home_vectors, away_vectors

In [171]:
def predict_5_highest_probable_players(season, home_team, home_players_selected, away_players_selected):
    """Rank the remaining home-roster players by predicted win probability.

    This function was a line-for-line copy of ``predict`` with an extra debug
    ``print`` of the probability/pool lengths and two unused ``.predict()``
    calls. It now delegates to ``predict`` so the two cannot drift apart, and
    the debug output is gone.

    NOTE(review): a later cell defines another function with this same name
    but a ``(rf_results, knn_results)`` signature. Once that cell has run, this
    definition is shadowed and can no longer be reached by name; consider
    renaming or deleting one of the two.

    Parameters
    ----------
    season, home_team, home_players_selected, away_players_selected
        Forwarded unchanged to ``predict``.

    Returns
    -------
    tuple of (list, list)
        ``(rf_results, knn_results)`` exactly as returned by ``predict``.
    """
    return predict(season, home_team, home_players_selected, away_players_selected)

In [172]:
def _rank_candidates_by_win_probability(player_pool, features, scaler, pca, model):
    """Run one scaler -> PCA -> classifier pipeline and rank the candidates."""
    reduced = pca.transform(scaler.transform(features))
    probabilities = model.predict_proba(reduced)
    ranked = [
        # Column 1 of predict_proba is taken as the win probability.
        # NOTE(review): this assumes the second class of the classifier is
        # "win" - confirm against the training notebook.
        {'player_name': player, 'win_probability': row[1]}
        for player, row in zip(player_pool, probabilities)
    ]
    # list.sort is stable, so tied players keep their player_pool order.
    ranked.sort(key=lambda entry: entry['win_probability'], reverse=True)
    return ranked


def predict(season, home_team, home_players_selected, away_players_selected):
    """Score every unselected home-roster player as the missing home player.

    For each candidate a feature row is built by concatenating the embeddings
    of the selected home players, the candidate, and the away players (in that
    order). The matrix is then pushed through the random-forest pipeline and
    the KNN pipeline, each with its own pre-fitted scaler and PCA.

    Parameters
    ----------
    season : int
        Season used to pick the roster, mapping and model files.
    home_team : str
        Key of the home team in the roster file.
    home_players_selected : list
        Home players already on the floor; they are excluded from the pool.
    away_players_selected : list
        Away players on the floor.

    Returns
    -------
    tuple of (list of dict, list of dict)
        ``(rf_results, knn_results)``. Each entry is
        ``{'player_name': ..., 'win_probability': ...}``; both lists are sorted
        by ``win_probability`` in descending order.

    Raises
    ------
    ValueError
        If no candidate is left after removing the selected home players, or
        if the assembled feature width is not the expected one.
    """
    roster = get_players_list_for_current_team(season, home_team)
    already_selected = set(home_players_selected)
    # Keep roster order (and drop duplicates) instead of iterating a set, so
    # the order of tied candidates is reproducible from run to run.
    player_pool = [player for player in dict.fromkeys(roster) if player not in already_selected]
    if not player_pool:
        # np.vstack([]) would raise an opaque ValueError further down.
        raise ValueError(
            f"No candidate players left for team {home_team!r} in season {season}"
        )

    player_mapping = get_player_mappings_for_current_season(season)
    (embedding_model,
     rf_model, rf_scalar, rf_pca,
     knn_model, knn_scalar, knn_pca) = read_model_based_on_season(season)

    home_player_embeddings, away_player_embeddings = get_player_embeddings(
        embedding_model, home_players_selected, away_players_selected, player_mapping)
    candidate_player_embeddings, _ = get_player_embeddings(
        embedding_model, player_pool, [], player_mapping)

    X_input_flat = np.vstack([
        np.concatenate(home_player_embeddings + [candidate] + away_player_embeddings)
        for candidate in candidate_player_embeddings
    ])

    # Summing the individual lengths avoids indexing [0] on a list that may be
    # empty (e.g. no away players supplied).
    expected_shape = (
        sum(len(vector) for vector in home_player_embeddings)
        + sum(len(vector) for vector in away_player_embeddings)
        + len(candidate_player_embeddings[0])
    )
    if X_input_flat.shape[1] != expected_shape:
        raise ValueError(f"Incorrect feature dimension. Expected {expected_shape}, got {X_input_flat.shape[1]}")

    # Both pipelines use pre-fitted scaler and PCA objects loaded from disk.
    # (Original note: the PCA was fitted to preserve 95% of the variance -
    # not verifiable from this notebook.)
    rf_results = _rank_candidates_by_win_probability(
        player_pool, X_input_flat, rf_scalar, rf_pca, rf_model)
    knn_results = _rank_candidates_by_win_probability(
        player_pool, X_input_flat, knn_scalar, knn_pca, knn_model)
    return rf_results, knn_results

In [173]:
def predict_highest_probable_player(rf_results, knn_results):
    """Return the top-ranked player name(s) of each model, ties included.

    Both inputs must already be sorted by ``'win_probability'`` in descending
    order. For each list the first player is returned together with every
    directly following player whose probability is exactly equal.

    An empty ranking yields an empty list instead of raising ``IndexError``.

    Parameters
    ----------
    rf_results, knn_results : list of dict
        Entries with the keys ``'player_name'`` and ``'win_probability'``.

    Returns
    -------
    tuple of (list, list)
        ``(rf_output, knn_output)`` - the tied-for-first player names.
    """
    def top_with_ties(results):
        if not results:
            return []
        names = [results[0]['player_name']]
        # Walk neighbouring pairs and stop at the first probability drop.
        for previous, current in zip(results, results[1:]):
            if previous['win_probability'] != current['win_probability']:
                break
            names.append(current['player_name'])
        return names

    return top_with_ties(rf_results), top_with_ties(knn_results)

In [174]:
def predict_5_highest_probable_players(rf_results, knn_results, top_n=5):
    """Return the ``top_n`` best player names of each model, ties included.

    Both inputs must already be sorted by ``'win_probability'`` in descending
    order. For each list the first ``top_n`` names are returned, followed by
    every directly following player whose probability is exactly equal to that
    of the last selected one.

    Fixes over the previous version: the result is actually returned (it used
    to fall off the end and yield ``None``), and rankings shorter than
    ``top_n`` no longer raise ``IndexError``.

    NOTE(review): an earlier cell defines a function with this same name but a
    ``(season, home_team, home_players_selected, away_players_selected)``
    signature; this definition replaces it once the cell runs.

    Parameters
    ----------
    rf_results, knn_results : list of dict
        Entries with the keys ``'player_name'`` and ``'win_probability'``.
    top_n : int, optional
        Number of leading players to keep before ties are added (default 5).

    Returns
    -------
    tuple of (list, list)
        ``(rf_output, knn_output)``.
    """
    def top_n_with_ties(results):
        if top_n <= 0:
            return []
        names = [entry['player_name'] for entry in results[:top_n]]
        # Pairs (last selected, next), (next, next+1), ...; empty when the
        # ranking has at most top_n entries.
        for previous, current in zip(results[top_n - 1:], results[top_n:]):
            if previous['win_probability'] != current['win_probability']:
                break
            names.append(current['player_name'])
        return names

    return top_n_with_ties(rf_results), top_n_with_ties(knn_results)

In [None]:
def generate_outputs(input_dataset, expected_output):
    """Evaluate both models row by row and print their hit counts.

    For every row of ``input_dataset`` the candidate rankings are obtained from
    ``predict`` and reduced to the top (tied) players with
    ``predict_highest_probable_player``. A model scores a hit when the row's
    ``'removed_value'`` from ``expected_output`` is among its top players.

    One tab-separated line is printed per row (row number, RF picks, KNN picks,
    RF hit, KNN hit), followed by a final line with the two hit totals.
    Nothing is returned.

    Rows are addressed with ``.loc`` and the labels ``0 .. n-1``, so both
    frames need a default integer index.
    """
    home_columns = ['home_0', 'home_1', 'home_2', 'home_3']
    away_columns = ['away_0', 'away_1', 'away_2', 'away_3', 'away_4']
    rf_hits = 0
    knn_hits = 0

    for row in range(input_dataset.shape[0]):
        season = input_dataset.loc[row, 'season']
        home_team = input_dataset.loc[row, 'home_team']
        home_players = [input_dataset.loc[row, column] for column in home_columns]
        away_players = [input_dataset.loc[row, column] for column in away_columns]

        rf_ranking, knn_ranking = predict(season, home_team, home_players, away_players)
        rf_best, knn_best = predict_highest_probable_player(rf_ranking, knn_ranking)

        removed_player = expected_output.loc[row, 'removed_value']
        rf_correct = removed_player in rf_best
        knn_correct = removed_player in knn_best
        if rf_correct:
            rf_hits += 1
        if knn_correct:
            knn_hits += 1

        print(row, rf_best, knn_best, rf_correct, knn_correct, sep='\t')

    print(rf_hits, knn_hits, sep='\t')

In [176]:
# Evaluation run: load the test rows and their labels, fill the '?' home slot
# from 'home_4', then print per-row predictions and the final hit counts.
input_dataset, expected_output = (
    pd.read_csv("testing_dataset/NBA_test.csv"),
    pd.read_csv("testing_dataset/NBA_test_labels.csv"),
)
# rearrange_players modifies the frame and drops its 'home_4' column.
input_dataset = rearrange_players(input_dataset)
generate_outputs(input_dataset, expected_output)

TypeError: predict_highest_probable_player() takes 2 positional arguments but 4 were given