In [81]:
import pandas as pd
import numpy as np

import joblib
import json
import csv

In [82]:
def get_players_list_for_current_season(season):
    """Load the player list saved for ``season``.

    Seasons later than 2015 fall back to the 2015 file.

    Args:
        season: Season year used to pick the JSON file.

    Returns:
        The entry stored under the season's string key in
        ``models/players_list_by_model/player_list_<season>.json``.
    """
    # Anything after 2015 reuses the 2015 file.
    effective_season = min(season, 2015)
    list_path = f'models/players_list_by_model/player_list_{effective_season}.json'
    with open(list_path, 'r') as handle:
        return json.load(handle)[str(effective_season)]

In [83]:
def get_player_mappings_for_current_season(season):
    """Load the player-name mapping saved for ``season``.

    Seasons later than 2015 fall back to the 2015 file.

    Args:
        season: Season year used to pick the JSON file.

    Returns:
        The full parsed content of
        ``models/player_mappings/player_mapping_<season>.json``.
    """
    # Anything after 2015 reuses the 2015 file.
    effective_season = min(season, 2015)
    mapping_path = f'models/player_mappings/player_mapping_{effective_season}.json'
    with open(mapping_path, 'r') as handle:
        return json.load(handle)

In [84]:
def read_model_based_on_season(season):
    """Load every saved artefact for ``season`` from the ``models/`` tree.

    Seasons later than 2015 fall back to the 2015 artefacts.

    Args:
        season: Season year used to pick the ``.pkl`` files.

    Returns:
        A 7-tuple, in this order: embedding model, random-forest model,
        random-forest scaler, random-forest PCA, KNN model, KNN scaler, KNN PCA.
    """
    effective_season = min(season, 2015)
    # The order of this table defines the order of the returned tuple.
    path_prefixes = (
        'models/player_embeddings/player_embeddings_',
        'models/random_forest/rf_model_',
        'models/random_forest/rf_scalar_',
        'models/random_forest/rf_pca_',
        'models/knn/knn_model_',
        'models/knn/knn_scalar_',
        'models/knn/knn_pca_',
    )
    return tuple(
        joblib.load(prefix + str(effective_season) + '.pkl')
        for prefix in path_prefixes
    )

In [85]:
def load_all_saved_models_and_player_details(start_year, end_year):
    """Load models, player mappings and player pools for a range of seasons.

    Args:
        start_year: First season to load (inclusive).
        end_year: Season at which to stop (exclusive, as in ``range``).

    Returns:
        Dict keyed by season. Each value holds ``'models'`` (a dict of the seven
        loaded artefacts by name), ``'player_mappings'`` and ``'player_pool'``.
    """
    saved_data = {}
    for season in range(start_year, end_year):
        (embedding_model, rf_model, rf_scalar, rf_pca,
         knn_model, knn_scalar, knn_pca) = read_model_based_on_season(season)
        saved_data[season] = {
            'models': {
                'embedding_model': embedding_model,
                'rf_model': rf_model,
                'rf_scalar': rf_scalar,
                'rf_pca': rf_pca,
                'knn_model': knn_model,
                'knn_scalar': knn_scalar,
                'knn_pca': knn_pca,
            },
            'player_mappings': get_player_mappings_for_current_season(season),
            'player_pool': get_players_list_for_current_season(season),
        }
    return saved_data

In [86]:
def handle_unknown_players(player_mappings, unknown_player_name):
    """Give an unmapped player the code of the first free placeholder entry.

    Looks for the lowest-numbered ``'player_<i>'`` key (``i`` in 0..124) in the
    mapping, assigns its value to ``unknown_player_name`` and removes the
    placeholder key. The mapping is modified in place.

    Args:
        player_mappings: Dict from player name to player code.
        unknown_player_name: Name that has no entry in the mapping.

    Returns:
        The same ``player_mappings`` object, after modification.

    Raises:
        ValueError: If none of ``'player_0'`` .. ``'player_124'`` is present.
    """
    placeholder_keys = ('player_' + str(slot) for slot in range(125))
    known_player_name = next(
        (key for key in placeholder_keys if key in player_mappings), None
    )
    if known_player_name is None:
        raise ValueError("Player not found in mapping")

    # Move the placeholder's code over to the real name, then retire the placeholder.
    player_mappings[unknown_player_name] = player_mappings[known_player_name]
    del player_mappings[known_player_name]
    return player_mappings

In [87]:
def get_player_embeddings(embedding_model, team_players, player_mappings, layer_name):
    """Look up the embedding vector of each player in ``team_players``.

    Args:
        embedding_model: Model exposing ``get_layer(name).get_weights()``.
        team_players: Iterable of player names.
        player_mappings: Dict from player name to player code; may be modified
            when a name is missing (see below).
        layer_name: Name of the layer whose first weight array is used.

    Returns:
        ``(embeddings, player_mappings)``: the embeddings in ``team_players``
        order, and the (possibly updated) mapping.

    Raises:
        ValueError: If a player's code has no row in the weight array, or if no
            placeholder entry is left for an unmapped player.
    """
    # Row index in the layer's first weight array is the player code.
    embedding_matrix = embedding_model.get_layer(layer_name).get_weights()[0]
    vectors_by_code = dict(enumerate(embedding_matrix))

    collected = []
    for name in team_players:
        code = player_mappings.get(name)
        if code is None:
            # FIXME: players missing from the mapping (e.g. debutants) are given
            # a placeholder player's code rather than an embedding of their own.
            player_mappings = handle_unknown_players(player_mappings, name)
            code = player_mappings[name]

        vector = vectors_by_code.get(code)
        if vector is None:
            raise ValueError(f"No embedding found for player: {name}")
        collected.append(vector)

    return collected, player_mappings

In [88]:
def create_group_of_5_home_players(home_players_selected, player_from_pool):
    """Return a copy of the lineup with its first ``'?'`` slot filled.

    Args:
        home_players_selected: List of player names in which ``'?'`` marks the
            open slot.
        player_from_pool: Name to put into the open slot.

    Returns:
        A new list with the first ``'?'`` replaced by ``player_from_pool``, or
        an empty list when the lineup contains no ``'?'``.
    """
    for slot, occupant in enumerate(home_players_selected):
        if occupant == '?':
            return (
                home_players_selected[:slot]
                + [player_from_pool]
                + home_players_selected[slot + 1:]
            )
    # No open slot: there is nothing to fill.
    return []

In [89]:
def _rank_pool_by_win_probability(model, scaler, pca, features, player_pool):
    """Score every candidate row with one classifier and rank the candidates.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        scaler: Fitted transformer applied to the raw features first.
        pca: Fitted transformer applied to the scaled features.
        features: 2-D array with one row per candidate, in ``player_pool`` order.
        player_pool: Candidate player names.

    Returns:
        List of ``{'player_name', 'win_probability'}`` dicts sorted by
        descending probability (the sort is stable, so ties keep pool order).
    """
    reduced = pca.transform(scaler.transform(features))
    probabilities = model.predict_proba(reduced)
    # Column 1 is taken as the home-win probability.
    # NOTE(review): assumes model.classes_ is [0, 1] — confirm against training.
    ranked = [
        {'player_name': player, 'win_probability': probabilities[row][1]}
        for row, player in enumerate(player_pool)
    ]
    ranked.sort(key=lambda entry: entry['win_probability'], reverse=True)
    return ranked


def predict(saved_data, season, home_team, home_players_selected, away_players_selected):
    """Rank every candidate for the open home-team slot by predicted win probability.

    Each player in the home team's pool who is not already selected is placed
    in the ``'?'`` slot in turn. The resulting lineup is scored together with
    the away lineup by both the random-forest and the KNN pipeline.

    Args:
        saved_data: Output of ``load_all_saved_models_and_player_details``.
        season: Season of the game; seasons after 2015 use the 2015 artefacts.
        home_team: Key into the season's player pool.
        home_players_selected: Home lineup with ``'?'`` marking the open slot.
        away_players_selected: Away lineup.

    Returns:
        ``(rf_results, knn_results)``: two lists of
        ``{'player_name', 'win_probability'}`` dicts, best candidate first.

    Raises:
        ValueError: If no candidate players remain, if a player has no
            embedding, or if the assembled feature width is inconsistent.
    """
    if season > 2015:
        season = 2015
    season_data = saved_data[season]
    models = season_data['models']

    # Sort the candidates so that row order, and therefore the order of tied
    # results, is reproducible; plain set iteration order varies between runs.
    player_pool = sorted(
        set(season_data['player_pool'][home_team]) - set(home_players_selected),
        key=str,
    )
    if not player_pool:
        raise ValueError(
            f"No candidate players available for home team {home_team!r} in season {season}"
        )

    embedding_model = models['embedding_model']
    player_mapping = season_data['player_mappings']

    away_player_embeddings, player_mapping = get_player_embeddings(
        embedding_model, away_players_selected, player_mapping, 'away_embedding')
    # Keep any mapping entries added for previously unknown players.
    season_data['player_mappings'] = player_mapping

    X_input = []
    for candidate in player_pool:
        combined_home_players_selected = create_group_of_5_home_players(
            home_players_selected, candidate)
        home_player_embeddings, player_mapping = get_player_embeddings(
            embedding_model, combined_home_players_selected, player_mapping, 'home_embedding')
        season_data['player_mappings'] = player_mapping
        X_input.append(np.concatenate(home_player_embeddings + away_player_embeddings))

    X_input_flat = np.vstack(X_input)
    expected_shape = len(home_player_embeddings[0]) * len(home_player_embeddings) + \
                    len(away_player_embeddings[0]) * len(away_player_embeddings)

    if X_input_flat.shape[1] != expected_shape:
        raise ValueError(f"Incorrect feature dimension. Expected {expected_shape}, got {X_input_flat.shape[1]}")

    rf_results = _rank_pool_by_win_probability(
        models['rf_model'], models['rf_scalar'], models['rf_pca'], X_input_flat, player_pool)
    knn_results = _rank_pool_by_win_probability(
        models['knn_model'], models['knn_scalar'], models['knn_pca'], X_input_flat, player_pool)
    return rf_results, knn_results

In [90]:
def _top_n_with_ties(results, n):
    """Return the first ``n`` ranked entries, plus any tied with the last one kept.

    Args:
        results: List of ``{'player_name', 'win_probability'}`` dicts sorted by
            descending probability.
        n: Number of entries to keep before extending over ties.

    Returns:
        Dict from player name to ``float`` win probability, in rank order.
        Empty when ``n <= 0`` or ``results`` is empty.
    """
    selected = {}
    if n <= 0:
        return selected

    cutoff = min(n, len(results))
    for entry in results[:cutoff]:
        selected[entry['player_name']] = float(entry['win_probability'])

    # Extend past the cut-off while each next entry ties with the one before it.
    idx = cutoff
    while idx < len(results) and \
            results[idx - 1]['win_probability'] == results[idx]['win_probability']:
        selected[results[idx]['player_name']] = float(results[idx]['win_probability'])
        idx += 1
    return selected


def predict_n_highest_probable_player(rf_results, knn_results, n):
    """Keep the ``n`` best candidates of each model, including ties at the cut-off.

    Args:
        rf_results: Random-forest ranking from ``predict`` (best first).
        knn_results: KNN ranking from ``predict`` (best first).
        n: Number of top candidates to keep per model.

    Returns:
        ``(rf_output, knn_output)``: dicts from player name to win probability.
        Each list is bounded by its own length, so rankings of different
        lengths are handled independently.
    """
    return _top_n_with_ties(rf_results, n), _top_n_with_ties(knn_results, n)

In [91]:
def generate_n_outputs(input_dataset, saved_data, n):
    """Predict the top-``n`` candidates for every sample in ``input_dataset``.

    Args:
        input_dataset: DataFrame indexed 0..len-1 with columns ``season``,
            ``home_team``, ``home_0``..``home_4`` and ``away_0``..``away_4``.
        saved_data: Output of ``load_all_saved_models_and_player_details``.
        n: Number of top candidates to keep per model.

    Returns:
        One ``{'random_forest': ..., 'knn': ...}`` dict per row, in row order.
    """
    home_columns = ['home_0', 'home_1', 'home_2', 'home_3', 'home_4']
    away_columns = ['away_0', 'away_1', 'away_2', 'away_3', 'away_4']

    predicted_outputs = []
    for row in range(input_dataset.shape[0]):
        home_lineup = [input_dataset.loc[row, column] for column in home_columns]
        away_lineup = [input_dataset.loc[row, column] for column in away_columns]
        rf_ranked, knn_ranked = predict(
            saved_data,
            input_dataset.loc[row, 'season'],
            input_dataset.loc[row, 'home_team'],
            home_lineup,
            away_lineup,
        )
        rf_top, knn_top = predict_n_highest_probable_player(rf_ranked, knn_ranked, n)
        predicted_outputs.append({'random_forest': rf_top, 'knn': knn_top})
    return predicted_outputs

In [92]:
def test_accuracy(predicted_outputs, expected_output):
    rf_count, knn_count = 0, 0
    for i in range(len(predicted_outputs)):
        if expected_output.loc[i, 'removed_value'] in predicted_outputs[i]['random_forest'].keys():
            rf_count += 1
        if expected_output.loc[i, 'removed_value'] in predicted_outputs[i]['knn'].keys():
            knn_count += 1
    print("Random Forest accuracy = ", rf_count / 10, "%\nKNN accuracy = ", knn_count / 10, "%", sep='')

In [93]:
def test_accuracy_by_year(input_dataset, predicted_outputs, expected_output):
    results = {}
    for i in range(len(predicted_outputs)):
        if input_dataset.loc[i, 'season'] not in results:
            results[input_dataset.loc[i, 'season']] = {}
            results[input_dataset.loc[i, 'season']]['size'] = 0
            results[input_dataset.loc[i, 'season']]['random_forest'] = 0
            results[input_dataset.loc[i, 'season']]['knn'] = 0
        results[input_dataset.loc[i, 'season']]['size'] += 1
        if expected_output.loc[i, 'removed_value'] in predicted_outputs[i]['random_forest'].keys():
            results[input_dataset.loc[i, 'season']]['random_forest'] += 1
        results[input_dataset.loc[i, 'season']]['rf_accuracy'] = str(float((results[input_dataset.loc[i, 'season']]['random_forest'] * 100) / results[input_dataset.loc[i, 'season']]['size'])) + '%'
        if expected_output.loc[i, 'removed_value'] in predicted_outputs[i]['knn'].keys():
            results[input_dataset.loc[i, 'season']]['knn'] += 1
        results[input_dataset.loc[i, 'season']]['knn_accuracy'] = str(float((results[input_dataset.loc[i, 'season']]['knn'] * 100) / results[input_dataset.loc[i, 'season']]['size'])) + '%'

    return results

Predict only the single player with the highest win probability

In [94]:
# Load the test samples and the saved per-season artefacts; the end year is
# exclusive, so this loads seasons 2007 through 2015.
input_dataset = pd.read_csv("testing_dataset/NBA_test.csv")
saved_data = load_all_saved_models_and_player_details(2007, 2016)
# n=1: keep the top-ranked candidate of each model for every sample.
predicted_outputs = generate_n_outputs(input_dataset, saved_data, 1)
print(predicted_outputs)

[{'random_forest': {'Troy Murphy': 0.5804240864655632}, 'knn': {'Troy Murphy': 0.9999997620319616}}, {'random_forest': {'Chuck Hayes': 0.580096025989145}, 'knn': {'John Lucas III': 1.0, 'Chuck Hayes': 1.0}}, {'random_forest': {'Brent Barry': 0.6090127209513871}, 'knn': {'Brent Barry': 1.0}}, {'random_forest': {'Craig Smith': 0.6654653424848411}, 'knn': {'Justin Reed': 1.0, 'Craig Smith': 1.0}}, {'random_forest': {'Jake Tsakalidis': 0.5104136044978457}, 'knn': {'Tarence Kinsey': 0.822564349530652}}, {'random_forest': {'Charlie Bell': 0.7275857174645598}, 'knn': {'Ruben Patterson': 1.0, 'Jared Reiner': 1.0, 'Julius Hodge': 1.0, 'Chris McCray': 1.0, 'Ersan Ilyasova': 1.0}}, {'random_forest': {'Dwyane Wade': 0.666114413962628}, 'knn': {'Dwyane Wade': 0.9999992044016969}}, {'random_forest': {'LeBron James': 0.5709072118908987}, 'knn': {'Anderson Varejao': 0.7230501016935682}}, {'random_forest': {'Sarunas Jasikevicius': 0.5361489907855126}, 'knn': {'Kelenna Azubuike': 1.0, 'Anthony Roberson'

In [95]:
# Compare the predictions with the true removed players: overall accuracy is
# printed by test_accuracy, per-season statistics come from test_accuracy_by_year.
expected_output = pd.read_csv("testing_dataset/NBA_test_labels.csv")
test_accuracy(predicted_outputs, expected_output)
print(test_accuracy_by_year(input_dataset, predicted_outputs, expected_output))

Random Forest accuracy = 53.0%
KNN accuracy = 55.8%
{np.int64(2007): {'size': 100, 'random_forest': 65, 'knn': 68, 'rf_accuracy': '65.0%', 'knn_accuracy': '68.0%'}, np.int64(2008): {'size': 100, 'random_forest': 63, 'knn': 62, 'rf_accuracy': '63.0%', 'knn_accuracy': '62.0%'}, np.int64(2009): {'size': 100, 'random_forest': 60, 'knn': 66, 'rf_accuracy': '60.0%', 'knn_accuracy': '66.0%'}, np.int64(2010): {'size': 100, 'random_forest': 66, 'knn': 71, 'rf_accuracy': '66.0%', 'knn_accuracy': '71.0%'}, np.int64(2011): {'size': 100, 'random_forest': 60, 'knn': 57, 'rf_accuracy': '60.0%', 'knn_accuracy': '57.0%'}, np.int64(2012): {'size': 100, 'random_forest': 56, 'knn': 62, 'rf_accuracy': '56.0%', 'knn_accuracy': '62.0%'}, np.int64(2013): {'size': 100, 'random_forest': 50, 'knn': 51, 'rf_accuracy': '50.0%', 'knn_accuracy': '51.0%'}, np.int64(2014): {'size': 100, 'random_forest': 53, 'knn': 60, 'rf_accuracy': '53.0%', 'knn_accuracy': '60.0%'}, np.int64(2015): {'size': 100, 'random_forest': 55, 

Predictions for the 2016 samples show very low accuracy.
This is because the 2016 season includes unknown players who have no track record in the saved data or have not played yet.

In [99]:
# Reload the inputs, the labels and the saved artefacts. Reloading saved_data
# gives fresh player mappings; an earlier prediction run may have modified the
# loaded mappings in place when it met unknown players.
input_dataset = pd.read_csv("testing_dataset/NBA_test.csv")
expected_output = pd.read_csv("testing_dataset/NBA_test_labels.csv")
saved_data = load_all_saved_models_and_player_details(2007, 2016)

# From the last 100 samples, keep only those whose players (including the
# removed one) all appear in the player mapping used for their season.
filtered_dataset = input_dataset.tail(100).reset_index(drop = True)
filtered_dataset['removed_value'] = expected_output.tail(100).reset_index(drop = True)['removed_value']
player_columns = ['home_0', 'home_1', 'home_2', 'home_3', 'home_4', 'away_0', 'away_1', 'away_2', 'away_3', 'away_4', 'removed_value']
for i in range(filtered_dataset.shape[0]):
    # 2016 samples are checked against the 2015 mapping.
    season = 2015 if int(filtered_dataset.loc[i, 'season']) == 2016 else int(filtered_dataset.loc[i, 'season'])
    for player_column in player_columns:
        # '?' marks the open slot and is not a player name, so it is skipped.
        if (filtered_dataset.loc[i, player_column] != '?') and (filtered_dataset.loc[i, player_column] not in saved_data[season]['player_mappings']):
            # One unknown player is enough to discard the sample.
            filtered_dataset.drop([i], inplace = True)
            break
filtered_dataset.reset_index(drop = True, inplace = True)
print(filtered_dataset.shape)

(33, 15)


This means that only 33 of the last 100 samples consist entirely of known players

In [100]:
# Rebuild the evaluation set: the first 900 samples plus the filtered samples,
# without the last column ('removed_value', which was added only for filtering).
input_dataset = pd.concat([input_dataset.head(900), filtered_dataset[filtered_dataset.columns[:-1]]])
input_dataset.reset_index(drop = True, inplace = True)
# n=1: keep the top-ranked candidate of each model for every sample.
predicted_outputs = generate_n_outputs(input_dataset, saved_data, 1)
print(predicted_outputs)

[{'random_forest': {'Troy Murphy': 0.6054112497512029}, 'knn': {'Troy Murphy': 0.9999995151847904}}, {'random_forest': {'Chuck Hayes': 0.5563748206894819}, 'knn': {'Chuck Hayes': 0.999999282551993}}, {'random_forest': {'Brent Barry': 0.554355789939431}, 'knn': {'Brent Barry': 1.0}}, {'random_forest': {'Craig Smith': 0.6781660514557802}, 'knn': {'Justin Reed': 1.0, 'Craig Smith': 1.0}}, {'random_forest': {'Pau Gasol': 0.5653664418594262}, 'knn': {'Tarence Kinsey': 0.712561243490133}}, {'random_forest': {'Charlie Bell': 0.6945989745576645}, 'knn': {'Ruben Patterson': 1.0, 'Julius Hodge': 1.0, 'Charlie Bell': 1.0}}, {'random_forest': {'Dwyane Wade': 0.6866301488282308}, 'knn': {'Dwyane Wade': 0.9999993778642526}}, {'random_forest': {'Anderson Varejao': 0.5773166479137535}, 'knn': {'Anderson Varejao': 0.6866022295075925}}, {'random_forest': {'Keith McLeod': 0.5302839600993962}, 'knn': {"Patrick O'Bryant": 1.0, 'Josh Powell': 1.0, 'Mike Dunleavy': 1.0}}, {'random_forest': {'Yakhouba Diawara

In [101]:
# Align the labels with the rebuilt evaluation set (first 900 labels plus the
# labels of the filtered samples), then evaluate overall and per season.
expected_output = pd.concat([expected_output.head(900), filtered_dataset['removed_value']])
expected_output.reset_index(drop = True, inplace = True)
test_accuracy(predicted_outputs, expected_output)
accuracy_by_year = test_accuracy_by_year(input_dataset, predicted_outputs, expected_output)

Random Forest accuracy = 51.5%
KNN accuracy = 56.7%


In [113]:
# Show predictions in tabular format
def show_table(accuracy_by_year):
    table_rows = {
        'Year': [],
        'RF Correct Predictions': [],
        'RF Accuracy': [],
        'KNN Correct Predictions': [],
        'KNN Accuracy': []
    }
    for i in accuracy_by_year:
        table_rows['Year'].append(i)
        table_rows['RF Correct Predictions'].append(str(accuracy_by_year[i]['random_forest']) + '/' + str(accuracy_by_year[i]['size']))
        table_rows['RF Accuracy'].append(str(round(((accuracy_by_year[i]['random_forest']/ accuracy_by_year[i]['size']) * 100), 2)) + ' %')
        table_rows['KNN Correct Predictions'].append(str(accuracy_by_year[i]['knn']) + '/' + str(accuracy_by_year[i]['size']))
        table_rows['KNN Accuracy'].append(str(round(((accuracy_by_year[i]['knn'] / accuracy_by_year[i]['size']) * 100), 2)) + ' %')

    output_table = pd.DataFrame(table_rows)
    return output_table.style.format(precision=3).format_index(str.upper, axis=1).hide()

In [114]:
# Per-season accuracy of the single-player (n=1) predictions.
show_table(accuracy_by_year)

YEAR,RF CORRECT PREDICTIONS,RF ACCURACY,KNN CORRECT PREDICTIONS,KNN ACCURACY
2007,63/100,63.0 %,68/100,68.0 %
2008,60/100,60.0 %,69/100,69.0 %
2009,60/100,60.0 %,56/100,56.0 %
2010,58/100,58.0 %,69/100,69.0 %
2011,56/100,56.0 %,62/100,62.0 %
2012,58/100,58.0 %,62/100,62.0 %
2013,53/100,53.0 %,52/100,52.0 %
2014,55/100,55.0 %,63/100,63.0 %
2015,52/100,52.0 %,59/100,59.0 %
2016,0/33,0.0 %,7/33,21.21 %


In [115]:
# Export one CSV row per evaluated sample, with the same three columns per model.
# Each spec is (key in predicted_outputs, player column, confidence column, correctness column).
model_column_specs = (
    ('random_forest', 'RF_predicted_player', 'RF_confidence', 'RF_is_correct'),
    ('knn', 'KNN_predicted_player', 'KNN_confidence', 'KNN_is_correct'),
)

csv_data = []
for i in range(input_dataset.shape[0]):
    true_player = expected_output.loc[i, 'removed_value']
    csv_data_row = {
        'season': input_dataset.loc[i, 'season'],
        'home_team': input_dataset.loc[i, 'home_team'],
        'away_team': input_dataset.loc[i, 'away_team'],
        'true_player': true_player,
    }
    for model_key, player_column, confidence_column, correct_column in model_column_specs:
        model_predictions = predicted_outputs[i][model_key]
        # Report the true player whenever it is among the model's predictions
        # (ties can yield several); otherwise report the first-listed one.
        if true_player in model_predictions.keys():
            reported_player = true_player
        else:
            reported_player = list(model_predictions.keys())[0]
        csv_data_row[player_column] = reported_player
        csv_data_row[confidence_column] = model_predictions[reported_player]
        csv_data_row[correct_column] = (reported_player == true_player)
    csv_data.append(csv_data_row)

field_names = ['season', 'home_team', 'away_team', 'true_player', 'RF_predicted_player', 'RF_confidence', 'RF_is_correct', 'KNN_predicted_player', 'KNN_confidence', 'KNN_is_correct']
with open('single_highest_player_predictions.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(csv_data)


Predict the 3 players for the 5th position in the home team that give the highest home-team win probability

In [116]:
# n=3: keep the three top-ranked candidates of each model for every sample.
predict_3_highest_players_outputs = generate_n_outputs(input_dataset, saved_data, 3)
print(predict_3_highest_players_outputs)

[{'random_forest': {'Troy Murphy': 0.6054112497512029, 'Orien Greene': 0.5607790651616843, 'Shawne Williams': 0.540394468092685}, 'knn': {'Troy Murphy': 0.9999995151847904, 'Jamaal Tinsley': 0.8939025074884799, 'Josh Powell': 0.8686370081537802}}, {'random_forest': {'Chuck Hayes': 0.5563748206894817, 'Dikembe Mutombo': 0.5292311663663507, 'Yao Ming': 0.5048172168623435}, 'knn': {'Chuck Hayes': 0.999999282551993, 'Dikembe Mutombo': 0.8495588563330522, 'Yao Ming': 0.7230955122348769}}, {'random_forest': {'Brent Barry': 0.554355789939431, 'Robert Horry': 0.4586447129549892, 'Francisco Elson': 0.4565467931702527}, 'knn': {'Brent Barry': 1.0, 'Melvin Ely': 0.7830810983319547, 'James White': 0.652862256030461}}, {'random_forest': {'Craig Smith': 0.6781660514557801, 'Mark Blount': 0.5957801292801009, 'Troy Hudson': 0.5678163840162509}, 'knn': {'Justin Reed': 1.0, 'Craig Smith': 1.0, 'Mark Blount': 0.9279703832399397}}, {'random_forest': {'Pau Gasol': 0.5653664418594262, 'Stromile Swift': 0.55

In [117]:
# Top-3 evaluation: a sample is a hit when the true player is among the kept candidates.
test_accuracy(predict_3_highest_players_outputs, expected_output)
accuracy_3_by_year = test_accuracy_by_year(input_dataset, predict_3_highest_players_outputs, expected_output)
show_table(accuracy_3_by_year)

Random Forest accuracy = 61.6%
KNN accuracy = 64.3%


YEAR,RF CORRECT PREDICTIONS,RF ACCURACY,KNN CORRECT PREDICTIONS,KNN ACCURACY
2007,71/100,71.0 %,75/100,75.0 %
2008,72/100,72.0 %,75/100,75.0 %
2009,68/100,68.0 %,67/100,67.0 %
2010,72/100,72.0 %,74/100,74.0 %
2011,67/100,67.0 %,68/100,68.0 %
2012,67/100,67.0 %,70/100,70.0 %
2013,62/100,62.0 %,62/100,62.0 %
2014,65/100,65.0 %,71/100,71.0 %
2015,69/100,69.0 %,70/100,70.0 %
2016,3/33,9.09 %,11/33,33.33 %


Predict the 5 players for the 5th position in the home team that give the highest home-team win probability

In [118]:
# n=5: keep the five top-ranked candidates of each model for every sample.
predict_5_highest_players_outputs = generate_n_outputs(input_dataset, saved_data, 5)
print(predict_5_highest_players_outputs)

[{'random_forest': {'Troy Murphy': 0.6054112497512029, 'Orien Greene': 0.5607790651616843, 'Shawne Williams': 0.5403944680926849, 'Al Harrington': 0.5167115809704178, 'Josh Powell': 0.5152615085039385}, 'knn': {'Troy Murphy': 0.9999995151847904, 'Jamaal Tinsley': 0.8939025074884799, 'Josh Powell': 0.8686370081537802, 'Stephen Jackson': 0.8525672819436569, 'Rawle Marshall': 0.8456262068919751}}, {'random_forest': {'Chuck Hayes': 0.5563748206894819, 'Dikembe Mutombo': 0.5292311663663507, 'Yao Ming': 0.5048172168623435, 'Shane Battier': 0.4976838849777764, 'Rafer Alston': 0.4873136373002136}, 'knn': {'Chuck Hayes': 0.999999282551993, 'Dikembe Mutombo': 0.8495588563330522, 'Yao Ming': 0.7230955122348769, 'Rafer Alston': 0.645183361181787, 'John Lucas III': 0.5926018950141988}}, {'random_forest': {'Brent Barry': 0.5543557899394309, 'Robert Horry': 0.45864471295498915, 'Francisco Elson': 0.4565467931702527, 'Melvin Ely': 0.41634424053225283, 'Jacque Vaughn': 0.4090543657071686}, 'knn': {'Bre

In [119]:
# Top-5 evaluation: a sample is a hit when the true player is among the kept candidates.
test_accuracy(predict_5_highest_players_outputs, expected_output)
accuracy_5_by_year = test_accuracy_by_year(input_dataset, predict_5_highest_players_outputs, expected_output)
show_table(accuracy_5_by_year)

Random Forest accuracy = 69.9%
KNN accuracy = 69.0%


YEAR,RF CORRECT PREDICTIONS,RF ACCURACY,KNN CORRECT PREDICTIONS,KNN ACCURACY
2007,81/100,81.0 %,80/100,80.0 %
2008,83/100,83.0 %,78/100,78.0 %
2009,80/100,80.0 %,73/100,73.0 %
2010,83/100,83.0 %,77/100,77.0 %
2011,77/100,77.0 %,76/100,76.0 %
2012,72/100,72.0 %,77/100,77.0 %
2013,67/100,67.0 %,67/100,67.0 %
2014,71/100,71.0 %,75/100,75.0 %
2015,75/100,75.0 %,73/100,73.0 %
2016,10/33,30.3 %,14/33,42.42 %
