In [28]:
import pandas as pd
import numpy as np

import joblib
import json

In [29]:
def rearrange_players(input_dataset):
    """Fill every '?' in home_0..home_3 with the row's 'home_4' value, then drop 'home_4'.

    The frame is modified in place and the very same object is returned.
    Rows are addressed through ``.loc`` with the labels 0..n-1, so the frame
    is expected to carry a default integer index.
    """
    fillable_slots = ('home_0', 'home_1', 'home_2', 'home_3')
    row_count = input_dataset.shape[0]
    for row_label in range(row_count):
        for slot in fillable_slots:
            if input_dataset.loc[row_label, slot] != '?':
                continue
            # The open slot takes over the player currently listed in 'home_4'.
            input_dataset.loc[row_label, slot] = input_dataset.loc[row_label, 'home_4']
    input_dataset.drop(columns=['home_4'], inplace=True)
    return input_dataset

In [30]:
def get_players_list_for_current_season(season):
    """Read the player list JSON for ``season`` and return the entry stored under that season.

    Any season later than 2015 is served from the 2015 file. The file is
    expected at ``models/players_list_by_model/player_list_<season>.json``
    and must contain a top-level key equal to the season written as a string.
    """
    season_key = str(2015 if season > 2015 else season)
    list_path = 'models/players_list_by_model/player_list_' + season_key + '.json'
    with open(list_path, 'r') as handle:
        return json.load(handle)[season_key]

In [31]:
def get_player_mappings_for_current_season(season):
    """Read and return the whole player-mapping JSON document for ``season``.

    Any season later than 2015 is served from the 2015 file. The file is
    expected at ``models/player_mappings/player_mapping_<season>.json``.
    """
    season_key = str(2015 if season > 2015 else season)
    mapping_path = 'models/player_mappings/player_mapping_' + season_key + '.json'
    with open(mapping_path, 'r') as handle:
        return json.load(handle)

In [32]:
def read_model_based_on_season(season):
    """Load the seven saved artifacts for ``season`` with ``joblib.load``.

    Any season later than 2015 is served from the 2015 artifacts.

    Returns a 7-tuple in this order: embedding model, random-forest model,
    random-forest scaler, random-forest PCA, KNN model, KNN scaler, KNN PCA.
    """
    season_tag = str(2015 if season > 2015 else season)
    # Path stems, listed in the exact order of the returned tuple.
    artifact_stems = (
        'models/player_embeddings/player_embeddings_',
        'models/random_forest/rf_model_',
        'models/random_forest/rf_scalar_',
        'models/random_forest/rf_pca_',
        'models/knn/knn_model_',
        'models/knn/knn_scalar_',
        'models/knn/knn_pca_',
    )
    return tuple(joblib.load(stem + season_tag + '.pkl') for stem in artifact_stems)

In [33]:
def load_all_saved_models_and_player_details(start_year, end_year):
    """Load models, player mappings and player pools for each season in [start_year, end_year).

    ``end_year`` is exclusive. The result maps each season to a dict with
    three entries: ``'models'`` (the seven artifacts keyed by name),
    ``'player_mappings'`` and ``'player_pool'``.
    """
    saved_data = {}
    for year in range(start_year, end_year):
        (embedding_model,
         rf_model, rf_scalar, rf_pca,
         knn_model, knn_scalar, knn_pca) = read_model_based_on_season(year)
        saved_data[year] = {
            'models': {
                'embedding_model': embedding_model,
                'rf_model': rf_model,
                'rf_scalar': rf_scalar,
                'rf_pca': rf_pca,
                'knn_model': knn_model,
                'knn_scalar': knn_scalar,
                'knn_pca': knn_pca,
            },
            'player_mappings': get_player_mappings_for_current_season(year),
            'player_pool': get_players_list_for_current_season(year),
        }
    return saved_data

In [34]:
def handle_unknown_players(player_mappings, unknown_player_name):
    """Give an unknown player the code of the first available placeholder entry.

    Placeholder keys are named ``'player_0'`` .. ``'player_124'``. The first
    one present in ``player_mappings`` is removed and its value is re-registered
    under ``unknown_player_name``. The dictionary is modified in place and the
    same object is returned.

    Raises:
        ValueError: if no placeholder key is left in ``player_mappings``.
    """
    placeholder_count = 125
    known_player_name = next(
        (
            'player_' + str(i)
            for i in range(placeholder_count)
            if 'player_' + str(i) in player_mappings
        ),
        None,
    )
    if known_player_name is None:
        # The original message interpolated an undefined name, which raised
        # NameError instead of the intended ValueError.
        raise ValueError(f"Player not found in mapping: {unknown_player_name}")
    player_mappings[unknown_player_name] = player_mappings[known_player_name]
    del player_mappings[known_player_name]
    return player_mappings

In [None]:
def get_player_embeddings(embedding_model, team_players, player_mappings, layer_name):
    """Look up the embedding row of every player in ``team_players``.

    The first weight array of the layer called ``layer_name`` is read from
    ``embedding_model``; row ``k`` of that array is treated as the embedding
    of the player whose code in ``player_mappings`` is ``k``.

    Returns:
        ``(embeddings, player_mappings)`` where ``embeddings`` follows the
        order of ``team_players`` and ``player_mappings`` is the object that
        was passed in.

    Raises:
        ValueError: if a player has no entry in ``player_mappings`` or the
            player's code has no row in the weight array.
    """
    layer_weights = embedding_model.get_layer(layer_name).get_weights()[0]
    embedding_by_code = dict(enumerate(layer_weights))

    def lookup(player):
        code = player_mappings.get(player)
        if code is None:
            # FIXME: debutant players have no mapping yet; re-assigning a
            # placeholder via handle_unknown_players is currently disabled.
            raise ValueError(f"Player not found in mapping: {player}")
        vector = embedding_by_code.get(code)
        if vector is None:
            raise ValueError(f"No embedding found for player: {player}")
        return vector

    return [lookup(player) for player in team_players], player_mappings

In [36]:
def create_group_of_5_home_players(home_players_selected, player_from_pool):
    """Return a copy of the lineup with its first '?' replaced by ``player_from_pool``.

    Only the first '?' is substituted; later ones are kept. When the lineup
    contains no '?', an empty list is returned.
    """
    for slot, name in enumerate(home_players_selected):
        if name == '?':
            return home_players_selected[:slot] + [player_from_pool] + home_players_selected[slot + 1:]
    return []

In [38]:
def _rank_pool_by_win_probability(model, scaler, pca, features, player_pool):
    """Score one feature row per candidate and rank candidates by win probability.

    ``features`` is scaled with ``scaler``, projected with ``pca`` and passed to
    ``model.predict_proba``. Column 1 of the returned probabilities is reported
    as the win probability.
    NOTE(review): presumably column 1 is the "home team wins" class - confirm
    against the fitted model's ``classes_``.

    Returns a list of ``{'player_name', 'win_probability'}`` dicts sorted by
    descending probability (stable, so ties keep the order of ``player_pool``).
    """
    reduced = pca.transform(scaler.transform(features))
    probabilities = model.predict_proba(reduced)
    results = [
        {'player_name': player, 'win_probability': probabilities[row][1]}
        for row, player in enumerate(player_pool)
    ]
    results.sort(key=lambda entry: entry['win_probability'], reverse=True)
    return results


def predict(saved_data, season, home_team, home_players_selected, away_players_selected):
    """Rank every eligible home-team player for the open ('?') lineup slot.

    Args:
        saved_data: per-season dict as built by
            ``load_all_saved_models_and_player_details`` (keys ``'models'``,
            ``'player_mappings'``, ``'player_pool'``).
        season: season of the game; seasons later than 2015 use the 2015 data.
        home_team: key into the season's ``'player_pool'``.
        home_players_selected: home lineup as a list containing one '?' entry.
        away_players_selected: away lineup.

    Returns:
        ``(rf_results, knn_results)``: two lists of
        ``{'player_name', 'win_probability'}`` dicts, each sorted by
        descending win probability.

    Raises:
        ValueError: if the lineup has no '?' slot, if no candidate is left in
            the pool, if a player cannot be mapped to an embedding, or if the
            assembled feature matrix has an unexpected width.
    """
    if season > 2015:
        season = 2015
    season_data = saved_data[season]
    models = season_data['models']
    embedding_model = models['embedding_model']

    if '?' not in home_players_selected:
        # Without an open slot no candidate lineup can be built; fail early
        # with a clear message instead of an IndexError further down.
        raise ValueError("home_players_selected must contain a '?' entry marking the open slot")

    # Sorted so the candidate order - and therefore the order among equal
    # probabilities - is reproducible; plain set iteration order is not.
    player_pool = sorted(set(season_data['player_pool'][home_team]) - set(home_players_selected))
    if not player_pool:
        raise ValueError(f"No candidate players available for team {home_team} in season {season}")

    player_mapping = season_data['player_mappings']
    away_player_embeddings, player_mapping = get_player_embeddings(
        embedding_model, away_players_selected, player_mapping, 'away_embedding')
    # Keep whatever mapping the lookup handed back as the season's mapping.
    season_data['player_mappings'] = player_mapping

    X_input = []
    for candidate in player_pool:
        combined_home_players_selected = create_group_of_5_home_players(home_players_selected, candidate)
        home_player_embeddings, player_mapping = get_player_embeddings(
            embedding_model, combined_home_players_selected, player_mapping, 'home_embedding')
        season_data['player_mappings'] = player_mapping
        X_input.append(np.concatenate(home_player_embeddings + away_player_embeddings))

    X_input_flat = np.vstack(X_input)
    expected_shape = len(home_player_embeddings[0]) * len(home_player_embeddings) + \
        len(away_player_embeddings[0]) * len(away_player_embeddings)
    if X_input_flat.shape[1] != expected_shape:
        raise ValueError(f"Incorrect feature dimension. Expected {expected_shape}, got {X_input_flat.shape[1]}")

    # Only predict_proba is needed for ranking; the hard class predictions the
    # original also computed were never used.
    rf_results = _rank_pool_by_win_probability(
        models['rf_model'], models['rf_scalar'], models['rf_pca'], X_input_flat, player_pool)
    knn_results = _rank_pool_by_win_probability(
        models['knn_model'], models['knn_scalar'], models['knn_pca'], X_input_flat, player_pool)
    return rf_results, knn_results

In [39]:
def predict_highest_probable_player(rf_results, knn_results):
    """Return the top-ranked player name(s) of each ranking, ties included.

    Both arguments are lists of ``{'player_name', 'win_probability'}`` dicts
    already sorted by descending probability. For each list the first name is
    taken, followed by every directly following entry whose probability equals
    that of the entry before it.

    Returns ``(rf_output, knn_output)``, two lists of names.
    """
    def leader_and_ties(ranked):
        names = [ranked[0]['player_name']]
        for previous, current in zip(ranked, ranked[1:]):
            if previous['win_probability'] != current['win_probability']:
                break
            names.append(current['player_name'])
        return names

    rf_output = leader_and_ties(rf_results)
    knn_output = leader_and_ties(knn_results)
    return rf_output, knn_output

In [68]:
def predict_3_highest_probable_players(rf_results, knn_results):
    """Return the three top-ranked player names of each ranking, plus ties with 3rd place.

    Both arguments are lists of ``{'player_name', 'win_probability'}`` dicts
    already sorted by descending probability. After the first three names,
    every directly following entry whose probability equals that of the entry
    before it is appended as well. Rankings with fewer than three entries
    yield all their names.

    Returns ``(rf_output, knn_output)``, two lists of names.
    """
    top_n = 3

    def top_with_ties(ranked):
        names = [entry['player_name'] for entry in ranked[:top_n]]
        # The tie scan starts at the last selected position (index 2); the
        # original started at index 4 and so skipped ranks 4 and 5.
        i = top_n - 1
        while (i < len(ranked) - 1) and (ranked[i]['win_probability'] == ranked[i + 1]['win_probability']):
            names.append(ranked[i + 1]['player_name'])
            i += 1
        return names

    return top_with_ties(rf_results), top_with_ties(knn_results)

In [40]:
def predict_5_highest_probable_players(rf_results, knn_results):
    """Return the five top-ranked player names of each ranking, plus ties with 5th place.

    Both arguments are lists of ``{'player_name', 'win_probability'}`` dicts
    already sorted by descending probability. After the first five names,
    every directly following entry whose probability equals that of the entry
    before it is appended as well. Rankings with fewer than five entries
    yield all their names instead of raising ``IndexError``.

    Returns ``(rf_output, knn_output)``, two lists of names.
    """
    top_n = 5

    def top_with_ties(ranked):
        names = [entry['player_name'] for entry in ranked[:top_n]]
        # Extend past the cut-off while the next entry ties with the previous one.
        i = top_n - 1
        while (i < len(ranked) - 1) and (ranked[i]['win_probability'] == ranked[i + 1]['win_probability']):
            names.append(ranked[i + 1]['player_name'])
            i += 1
        return names

    return top_with_ties(rf_results), top_with_ties(knn_results)

In [41]:
def generate_outputs(input_dataset, saved_data):
    """Predict the single most probable missing home player (ties included) for every row.

    Rows are read through ``.loc`` with the labels 0..n-1. Each row supplies
    'season', 'home_team', 'home_0'..'home_4' and 'away_0'..'away_4'.

    Returns one ``{'random_forest': [...], 'knn': [...]}`` dict per row.
    """
    home_columns = ['home_0', 'home_1', 'home_2', 'home_3', 'home_4']
    away_columns = ['away_0', 'away_1', 'away_2', 'away_3', 'away_4']

    predicted_outputs = []
    for row in range(input_dataset.shape[0]):
        season = input_dataset.loc[row, 'season']
        home_team = input_dataset.loc[row, 'home_team']
        home_players = [input_dataset.loc[row, column] for column in home_columns]
        away_players = [input_dataset.loc[row, column] for column in away_columns]

        rf_ranked, knn_ranked = predict(saved_data, season, home_team, home_players, away_players)
        rf_best, knn_best = predict_highest_probable_player(rf_ranked, knn_ranked)
        predicted_outputs.append({'random_forest': rf_best, 'knn': knn_best})
    return predicted_outputs

In [69]:
def generate_3_highest_players_outputs(input_dataset, saved_data):
    """Collect, for every row, the names picked by ``predict_3_highest_probable_players``.

    Rows are read through ``.loc`` with the labels 0..n-1. Each row supplies
    'season', 'home_team', 'home_0'..'home_4' and 'away_0'..'away_4'.

    Returns one ``{'random_forest': [...], 'knn': [...]}`` dict per row.
    """
    home_columns = ['home_0', 'home_1', 'home_2', 'home_3', 'home_4']
    away_columns = ['away_0', 'away_1', 'away_2', 'away_3', 'away_4']

    predicted_outputs = []
    for row in range(input_dataset.shape[0]):
        season = input_dataset.loc[row, 'season']
        home_team = input_dataset.loc[row, 'home_team']
        home_players = [input_dataset.loc[row, column] for column in home_columns]
        away_players = [input_dataset.loc[row, column] for column in away_columns]

        rf_ranked, knn_ranked = predict(saved_data, season, home_team, home_players, away_players)
        rf_top, knn_top = predict_3_highest_probable_players(rf_ranked, knn_ranked)
        predicted_outputs.append({'random_forest': rf_top, 'knn': knn_top})
    return predicted_outputs

In [42]:
def generate_5_highest_players_outputs(input_dataset, saved_data):
    """Collect, for every row, the names picked by ``predict_5_highest_probable_players``.

    Rows are read through ``.loc`` with the labels 0..n-1. Each row supplies
    'season', 'home_team', 'home_0'..'home_4' and 'away_0'..'away_4'.

    Returns one ``{'random_forest': [...], 'knn': [...]}`` dict per row.
    """
    home_columns = ['home_0', 'home_1', 'home_2', 'home_3', 'home_4']
    away_columns = ['away_0', 'away_1', 'away_2', 'away_3', 'away_4']

    predicted_outputs = []
    for row in range(input_dataset.shape[0]):
        season = input_dataset.loc[row, 'season']
        home_team = input_dataset.loc[row, 'home_team']
        home_players = [input_dataset.loc[row, column] for column in home_columns]
        away_players = [input_dataset.loc[row, column] for column in away_columns]

        rf_ranked, knn_ranked = predict(saved_data, season, home_team, home_players, away_players)
        rf_top, knn_top = predict_5_highest_probable_players(rf_ranked, knn_ranked)
        predicted_outputs.append({'random_forest': rf_top, 'knn': knn_top})
    return predicted_outputs

In [43]:
def test_accuracy(predicted_outputs, expected_output):
    rf_count, knn_count = 0, 0
    for i in range(len(predicted_outputs)):
        if expected_output.loc[i, 'removed_value'] in predicted_outputs[i]['random_forest']:
            rf_count += 1
        if expected_output.loc[i, 'removed_value'] in predicted_outputs[i]['knn']:
            knn_count += 1
    print("Random Forest accuracy = ", rf_count / 10, "%\nKNN accuracy = ", knn_count / 10, "%", sep='')

In [65]:
def test_accuracy_by_year(input_dataset, predicted_outputs, expected_output):
    results = {}
    for i in range(len(predicted_outputs)):
        if input_dataset.loc[i, 'season'] not in results:
            results[input_dataset.loc[i, 'season']] = {}
            results[input_dataset.loc[i, 'season']]['size'] = 0
            results[input_dataset.loc[i, 'season']]['random_forest'] = 0
            results[input_dataset.loc[i, 'season']]['knn'] = 0
        results[input_dataset.loc[i, 'season']]['size'] += 1
        if expected_output.loc[i, 'removed_value'] in predicted_outputs[i]['random_forest']:
            results[input_dataset.loc[i, 'season']]['random_forest'] += 1
        results[input_dataset.loc[i, 'season']]['rf_accuracy'] = str(float((results[input_dataset.loc[i, 'season']]['random_forest'] * 100) / results[input_dataset.loc[i, 'season']]['size'])) + '%'
        if expected_output.loc[i, 'removed_value'] in predicted_outputs[i]['knn']:
            results[input_dataset.loc[i, 'season']]['knn'] += 1
        results[input_dataset.loc[i, 'season']]['knn_accuracy'] = str(float((results[input_dataset.loc[i, 'season']]['knn'] * 100) / results[input_dataset.loc[i, 'season']]['size'])) + '%'

    return results

Predict only 1 player with highest win probability

In [45]:
# Load the test inputs and the saved per-season artifacts for 2007..2015
# (the end year 2016 is exclusive).
input_dataset = pd.read_csv("testing_dataset/NBA_test.csv")
saved_data = load_all_saved_models_and_player_details(2007, 2016)
# One dict per row holding the top-ranked player name(s) of each model.
predicted_outputs = generate_outputs(input_dataset, saved_data)
print(predicted_outputs)

[{'random_forest': ['Troy Murphy'], 'knn': ['Troy Murphy']}, {'random_forest': ['Chuck Hayes'], 'knn': ['Chuck Hayes']}, {'random_forest': ['Brent Barry'], 'knn': ['Brent Barry']}, {'random_forest': ['Craig Smith'], 'knn': ['Craig Smith', 'Justin Reed']}, {'random_forest': ['Pau Gasol'], 'knn': ['Tarence Kinsey']}, {'random_forest': ['Charlie Bell'], 'knn': ['Charlie Bell', 'Ruben Patterson', 'Julius Hodge']}, {'random_forest': ['Dwyane Wade'], 'knn': ['Dwyane Wade']}, {'random_forest': ['Anderson Varejao'], 'knn': ['Anderson Varejao']}, {'random_forest': ['Keith McLeod'], 'knn': ["Patrick O'Bryant", 'Mike Dunleavy', 'Josh Powell']}, {'random_forest': ['Yakhouba Diawara'], 'knn': ['Anthony Carter', 'Yakhouba Diawara']}, {'random_forest': ['Scott Padgett'], 'knn': ['Damon Stoudamire']}, {'random_forest': ['Paul Millsap'], 'knn': ['Gordan Giricek']}, {'random_forest': ['Rafael Araujo'], 'knn': ['Roger Powell']}, {'random_forest': ['Jarrett Jack'], 'knn': ['Ime Udoka']}, {'random_forest':

In [46]:
# Load the held-out labels and report hit rates, overall and per season.
expected_output = pd.read_csv("testing_dataset/NBA_test_labels.csv")
test_accuracy(predicted_outputs, expected_output)
print(test_accuracy_by_year(input_dataset, predicted_outputs, expected_output))

Random Forest accuracy = 51.7%
KNN accuracy = 56.7%
{np.int64(2007): {'size': 100, 'random_forest': 63, 'knn': 66}, np.int64(2008): {'size': 100, 'random_forest': 60, 'knn': 68}, np.int64(2009): {'size': 100, 'random_forest': 60, 'knn': 56}, np.int64(2010): {'size': 100, 'random_forest': 58, 'knn': 70}, np.int64(2011): {'size': 100, 'random_forest': 56, 'knn': 61}, np.int64(2012): {'size': 100, 'random_forest': 58, 'knn': 62}, np.int64(2013): {'size': 100, 'random_forest': 53, 'knn': 53}, np.int64(2014): {'size': 100, 'random_forest': 55, 'knn': 61}, np.int64(2015): {'size': 100, 'random_forest': 52, 'knn': 60}, np.int64(2016): {'size': 100, 'random_forest': 2, 'knn': 10}}


Predict top 5 players with highest win probability

In [180]:
# Reload the test inputs and artifacts, then keep the five top-ranked
# candidates per model for every row.
input_dataset = pd.read_csv("testing_dataset/NBA_test.csv")
saved_data = load_all_saved_models_and_player_details(2007, 2016)
predicted_outputs = generate_5_highest_players_outputs(input_dataset, saved_data)
print(predicted_outputs)

[{'random_forest': ['Troy Murphy', 'Orien Greene', 'Shawne Williams', 'David Harrison', 'Ike Diogu'], 'knn': ['Troy Murphy', 'Jeff Foster', 'Stephen Jackson', 'Marquis Daniels', 'Al Harrington']}, {'random_forest': ['Chuck Hayes', 'Jake Tsakalidis', 'Yao Ming', 'Dikembe Mutombo', 'Shane Battier'], 'knn': ['Dikembe Mutombo', 'Chuck Hayes', 'Scott Padgett', 'Shane Battier', 'Yao Ming']}, {'random_forest': ['Brent Barry', 'Melvin Ely', 'Jacque Vaughn', 'Francisco Elson', 'Robert Horry'], 'knn': ['Brent Barry', 'Fabricio Oberto', 'Tony Parker', 'Jacque Vaughn', 'Jackie Butler']}, {'random_forest': ['Craig Smith', 'Justin Reed', 'Rashad McCants', 'Bracey Wright', 'Mark Blount'], 'knn': ['Justin Reed', 'Craig Smith', 'Mark Blount', 'Marko Jaric', 'Troy Hudson']}, {'random_forest': ['Kyle Lowry', 'Lawrence Roberts', 'Pau Gasol', 'Dahntay Jones', 'Brian Cardinal'], 'knn': ['Lawrence Roberts', 'Kyle Lowry', 'Pau Gasol', 'Dahntay Jones', 'Tarence Kinsey']}, {'random_forest': ['Charlie Bell', 'Da

In [181]:
# A row counts as a hit when the expected player is among the predicted names.
expected_output = pd.read_csv("testing_dataset/NBA_test_labels.csv")
test_accuracy(predicted_outputs, expected_output)
print(test_accuracy_by_year(input_dataset, predicted_outputs, expected_output))

Random Forest accuracy = 71.6%
KNN accuracy = 69.8%
{np.int64(2007): {'size': 100, 'random_forest': 84, 'knn': 78}, np.int64(2008): {'size': 100, 'random_forest': 80, 'knn': 80}, np.int64(2009): {'size': 100, 'random_forest': 82, 'knn': 79}, np.int64(2010): {'size': 100, 'random_forest': 82, 'knn': 80}, np.int64(2011): {'size': 100, 'random_forest': 72, 'knn': 77}, np.int64(2012): {'size': 100, 'random_forest': 77, 'knn': 78}, np.int64(2013): {'size': 100, 'random_forest': 73, 'knn': 67}, np.int64(2014): {'size': 100, 'random_forest': 70, 'knn': 70}, np.int64(2015): {'size': 100, 'random_forest': 74, 'knn': 70}, np.int64(2016): {'size': 100, 'random_forest': 22, 'knn': 19}}


Predictions for the 2016 samples show very low accuracy.
This is caused by unknown players in the 2016 season: players who have no track record in the earlier seasons' data or who had not played before.

In [193]:
# Restrict labels and inputs to their last 100 rows, re-indexed from 0.
# NOTE(review): presumably these are the 2016 samples - confirm against the CSV row order.
expected_output_2016 = expected_output.tail(100).reset_index(drop = True)
input_dataset_2016 = input_dataset.tail(100).reset_index(drop = True)
# Count target players that are absent from the home team's player pool
# stored under (season - 1) in saved_data.
unknown_target_player = 0
for i in range(input_dataset_2016.shape[0]):
    print(i, expected_output_2016.loc[i, 'removed_value'])
    if expected_output_2016.loc[i, 'removed_value'] not in saved_data[int(input_dataset_2016.loc[i, 'season']) - 1]['player_pool'][input_dataset_2016.loc[i, 'home_team']]:
        unknown_target_player += 1

print(unknown_target_player)

0 Elfrid Payton
1 T.J. McConnell
2 Tony Parker
3 Aron Baynes
4 Allen Crabbe
5 Kevin Durant
6 Devin Booker
7 Thomas Robinson
8 Shabazz Muhammad
9 Nerlens Noel
10 Norris Cole
11 Nene Hilario
12 P.J. Tucker
13 Klay Thompson
14 C.J. Miles
15 LaMarcus Aldridge
16 Charlie Villanueva
17 Andrew Bogut
18 Ian Mahinmi
19 Boris Diaw
20 Bradley Beal
21 Kosta Koufos
22 Joe Ingles
23 Damian Lillard
24 Jonas Valanciunas
25 Leandro Barbosa
26 Joakim Noah
27 Josh Smith
28 Jarrett Jack
29 Paul Pierce
30 Ricky Rubio
31 Elfrid Payton
32 Gerald Green
33 Shane Larkin
34 J.J. Barea
35 Dion Waiters
36 Jordan Clarkson
37 Jonas Valanciunas
38 P.J. Tucker
39 Anthony Brown
40 Tristan Thompson
41 Damian Lillard
42 Derrick Williams
43 Kendrick Perkins
44 Carl Landry
45 Tyus Jones
46 Greg Monroe
47 Kevin Durant
48 Joe Johnson
49 Dwight Howard
50 Derrick Favors
51 Gerald Henderson
52 Arron Afflalo
53 Chris Bosh
54 Lavoy Allen
55 Taj Gibson
56 Tristan Thompson
57 J.R. Smith
58 George Hill
59 Tiago Splitter
60 Tyler Zel

In [57]:
# Reload inputs, labels and artifacts so this cell starts from unmodified data.
input_dataset = pd.read_csv("testing_dataset/NBA_test.csv")
expected_output = pd.read_csv("testing_dataset/NBA_test_labels.csv")
saved_data = load_all_saved_models_and_player_details(2007, 2016)

# Keep only those of the last 100 samples in which every named player
# (lineups and the expected player) appears in the season's player mapping.
filtered_dataset = input_dataset.tail(100).reset_index(drop = True)
# The label is appended as the last column so rows and labels are dropped together.
filtered_dataset['removed_value'] = expected_output.tail(100).reset_index(drop = True)['removed_value']
player_columns = ['home_0', 'home_1', 'home_2', 'home_3', 'home_4', 'away_0', 'away_1', 'away_2', 'away_3', 'away_4', 'removed_value']
for i in range(filtered_dataset.shape[0]):
    # Season 2016 is looked up in the 2015 data.
    season = 2015 if int(filtered_dataset.loc[i, 'season']) == 2016 else int(filtered_dataset.loc[i, 'season'])
    for player_column in player_columns:
        if (filtered_dataset.loc[i, player_column] != '?') and (filtered_dataset.loc[i, player_column] not in saved_data[season]['player_mappings']):
            # Label i is dropped at most once (break follows), so later labels stay addressable.
            filtered_dataset.drop([i], inplace = True)
            break
filtered_dataset.reset_index(drop = True, inplace = True)
print(filtered_dataset.shape)

(33, 15)


In [58]:
# Rebuild the inputs: the first 900 rows plus the filtered samples without
# their last column (the appended 'removed_value' label).
input_dataset = pd.concat([input_dataset.head(900), filtered_dataset[filtered_dataset.columns[:-1]]])
# Restore labels 0..n-1, which the row-wise .loc lookups rely on.
input_dataset.reset_index(drop = True, inplace = True)
predicted_outputs = generate_outputs(input_dataset, saved_data)
print(predicted_outputs)

[{'random_forest': ['Troy Murphy'], 'knn': ['Troy Murphy']}, {'random_forest': ['Chuck Hayes'], 'knn': ['Chuck Hayes']}, {'random_forest': ['Brent Barry'], 'knn': ['Brent Barry']}, {'random_forest': ['Craig Smith'], 'knn': ['Craig Smith', 'Justin Reed']}, {'random_forest': ['Pau Gasol'], 'knn': ['Tarence Kinsey']}, {'random_forest': ['Charlie Bell'], 'knn': ['Charlie Bell', 'Ruben Patterson', 'Julius Hodge']}, {'random_forest': ['Dwyane Wade'], 'knn': ['Dwyane Wade']}, {'random_forest': ['Anderson Varejao'], 'knn': ['Anderson Varejao']}, {'random_forest': ['Keith McLeod'], 'knn': ["Patrick O'Bryant", 'Mike Dunleavy', 'Josh Powell']}, {'random_forest': ['Yakhouba Diawara'], 'knn': ['Anthony Carter', 'Yakhouba Diawara']}, {'random_forest': ['Scott Padgett'], 'knn': ['Damon Stoudamire']}, {'random_forest': ['Paul Millsap'], 'knn': ['Gordan Giricek']}, {'random_forest': ['Rafael Araujo'], 'knn': ['Roger Powell']}, {'random_forest': ['Jarrett Jack'], 'knn': ['Ime Udoka']}, {'random_forest':

In [64]:
# Rebuild the labels the same way: the first 900 labels plus the labels of
# the filtered samples, re-indexed to line up with input_dataset.
expected_output = pd.concat([expected_output.head(900), filtered_dataset['removed_value']])
expected_output.reset_index(drop = True, inplace = True)
test_accuracy(predicted_outputs, expected_output)
print(test_accuracy_by_year(input_dataset, predicted_outputs, expected_output))

Random Forest accuracy = 51.5%
KNN accuracy = 56.4%
{np.int64(2007): {'size': 100, 'random_forest': 63, 'knn': 66, 'rf_accuracy': '63.0%', 'knn_accuracy': '66.0%'}, np.int64(2008): {'size': 100, 'random_forest': 60, 'knn': 68, 'rf_accuracy': '60.0%', 'knn_accuracy': '68.0%'}, np.int64(2009): {'size': 100, 'random_forest': 60, 'knn': 56, 'rf_accuracy': '61.224489795918366%', 'knn_accuracy': '56.56565656565657%'}, np.int64(2010): {'size': 100, 'random_forest': 58, 'knn': 70, 'rf_accuracy': '58.0%', 'knn_accuracy': '70.0%'}, np.int64(2011): {'size': 100, 'random_forest': 56, 'knn': 61, 'rf_accuracy': '56.0%', 'knn_accuracy': '61.0%'}, np.int64(2012): {'size': 100, 'random_forest': 58, 'knn': 62, 'rf_accuracy': '58.0%', 'knn_accuracy': '62.0%'}, np.int64(2013): {'size': 100, 'random_forest': 53, 'knn': 53, 'rf_accuracy': '53.0%', 'knn_accuracy': '53.0%'}, np.int64(2014): {'size': 100, 'random_forest': 55, 'knn': 61, 'rf_accuracy': '55.0%', 'knn_accuracy': '61.0%'}, np.int64(2015): {'size':

Predict 3 players for 5th position in home team with highest home team win probability

In [None]:
# Top-3 candidates per model for every row of the current input_dataset.
predict_3_highest_players_outputs = generate_3_highest_players_outputs(input_dataset, saved_data)
print(predict_3_highest_players_outputs)

In [None]:
# Hit rates of the top-3 predictions, overall and per season.
test_accuracy(predict_3_highest_players_outputs, expected_output)
print(test_accuracy_by_year(input_dataset, predict_3_highest_players_outputs, expected_output))

Predict 5 players for 5th position in home team with highest home team win probability

In [67]:
# Top-5 candidates per model for every row of the current input_dataset.
predict_5_highest_players_outputs = generate_5_highest_players_outputs(input_dataset, saved_data)
print(predict_5_highest_players_outputs)

[{'random_forest': ['Troy Murphy', 'Orien Greene', 'Shawne Williams', 'Al Harrington', 'Josh Powell'], 'knn': ['Troy Murphy', 'Jamaal Tinsley', 'Josh Powell', 'Stephen Jackson', 'Rawle Marshall']}, {'random_forest': ['Chuck Hayes', 'Dikembe Mutombo', 'Yao Ming', 'Shane Battier', 'Rafer Alston'], 'knn': ['Chuck Hayes', 'Dikembe Mutombo', 'Yao Ming', 'Rafer Alston', 'John Lucas III']}, {'random_forest': ['Brent Barry', 'Robert Horry', 'Francisco Elson', 'Melvin Ely', 'Jacque Vaughn'], 'knn': ['Brent Barry', 'Melvin Ely', 'James White', 'Robert Horry', 'Tony Parker']}, {'random_forest': ['Craig Smith', 'Mark Blount', 'Troy Hudson', 'Mike James', 'Rashad McCants'], 'knn': ['Craig Smith', 'Justin Reed', 'Mark Blount', 'Eddie Griffin', 'Marko Jaric']}, {'random_forest': ['Pau Gasol', 'Stromile Swift', 'Lawrence Roberts', 'Junior Harrington', 'Alexander Johnson'], 'knn': ['Tarence Kinsey', 'Alexander Johnson', 'Kyle Lowry', 'Dahntay Jones', 'Damon Stoudamire']}, {'random_forest': ['Charlie Be

In [66]:
# Hit rates of the top-5 predictions, overall and per season.
test_accuracy(predict_5_highest_players_outputs, expected_output)
print(test_accuracy_by_year(input_dataset, predict_5_highest_players_outputs, expected_output))

Random Forest accuracy = 70.0%
KNN accuracy = 69.4%
{np.int64(2007): {'size': 100, 'random_forest': 81, 'knn': 80, 'rf_accuracy': '81.0%', 'knn_accuracy': '80.0%'}, np.int64(2008): {'size': 100, 'random_forest': 83, 'knn': 78, 'rf_accuracy': '83.0%', 'knn_accuracy': '78.0%'}, np.int64(2009): {'size': 100, 'random_forest': 80, 'knn': 74, 'rf_accuracy': '80.0%', 'knn_accuracy': '74.0%'}, np.int64(2010): {'size': 100, 'random_forest': 83, 'knn': 77, 'rf_accuracy': '83.0%', 'knn_accuracy': '77.0%'}, np.int64(2011): {'size': 100, 'random_forest': 77, 'knn': 76, 'rf_accuracy': '77.0%', 'knn_accuracy': '76.0%'}, np.int64(2012): {'size': 100, 'random_forest': 72, 'knn': 77, 'rf_accuracy': '72.0%', 'knn_accuracy': '77.0%'}, np.int64(2013): {'size': 100, 'random_forest': 68, 'knn': 69, 'rf_accuracy': '68.0%', 'knn_accuracy': '69.0%'}, np.int64(2014): {'size': 100, 'random_forest': 71, 'knn': 75, 'rf_accuracy': '71.0%', 'knn_accuracy': '75.0%'}, np.int64(2015): {'size': 100, 'random_forest': 75, 