Gabriel Marcelino
September 2024
Artificial Neural Network (ANN)

## Import dependencies and load data

In [80]:
import sys
print(sys.path)


['C:\\Program Files\\JetBrains\\PyCharm 2023.3.3\\plugins\\python\\helpers-pro\\jupyter_debug', 'C:\\Program Files\\JetBrains\\PyCharm 2023.3.3\\plugins\\python\\helpers\\pydev', 'C:\\Users\\grant\\PycharmProjects\\neural-networks', 'C:\\Users\\grant\\AppData\\Local\\Programs\\Python\\Python312\\python312.zip', 'C:\\Users\\grant\\AppData\\Local\\Programs\\Python\\Python312\\DLLs', 'C:\\Users\\grant\\AppData\\Local\\Programs\\Python\\Python312\\Lib', 'C:\\Users\\grant\\AppData\\Local\\Programs\\Python\\Python312', '', 'C:\\Users\\grant\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages', 'C:\\Users\\grant\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\win32', 'C:\\Users\\grant\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\win32\\lib', 'C:\\Users\\grant\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\Pythonwin', 'C:\\Users\\grant\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\setuptools\\_vendor']


In [81]:
import csv
import random
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

pool = []
training_data = []
# create pool with players from 2019-2022
with open('all_seasons.csv', mode = 'r') as file:
    csvFile = csv.reader(file)
    # ignore first line
    next(csvFile)
    for lines in csvFile:
        year = int(lines[21][:4])
        if 2018 < year < 2023 and len(pool) < 100 and lines[1] not in pool:
            pool.append(lines)
        elif len(training_data) < 5000:
            training_data.append(lines)


## Optimal Team : Considerations
For my optimal team, I will aim for:
- 2 or more players in top 20% of shooting percentage.
- Player in the top 5% out of the 100 for rebounds.
- Player with a Defensive Rebound percentage bigger than 0.2
- 3 or more players in top 20% of best net rating
- 2 or more players with better than average assists

## Model

In [82]:
def extract_features(pool):
    features_list = []
    for player in pool:
        # extract relevant features based on considerations above
        features = {
            'name': player[1],
            'ts_pct': player[19],
            'reb': player[13],
            'dreb_pct': player[17],
            'rating': player[15],
            'ast': player[14]
        }
        features_list.append(features)
    return features_list


## Simulate Training Data to train model

In [83]:


def simulate_data(sample, num_iter=10000):
    X = []
    y = []
    # calculate average assists
    assists = np.array([float(player[14]) for player in sample])
    average_assists = np.mean(assists)
    # calculate top 20% of shooting percentage
    top_20_ts = np.percentile([float(player[19]) for player in sample], 80)
    # calculate top 10% of rebound
    top_10_reb = np.percentile([float(player[13]) for player in sample], 90)
    # calculate top 20% net rating
    top_20_rating = np.percentile([float(player[15]) for player in sample], 80)

    for i in range(num_iter):
        # select 5 random players from list
        selected_players = random.sample(sample, 5)
        features = extract_features(selected_players)
        X.append(features)
        """
        - 2 or more players in top 20% of shooting percentage.
        - Player in the top 10% for rebounds.
        - Player with a Defensive Rebound percentage bigger than 0.2
        - 3 or more players in top 20% of best net rating
        - 2 or more players with better than average assists
        """
        label = 0
        # check if there are 2 players with better than average assists
        players_ast = [player for player in features if float(player['ast']) > average_assists]
        if len(players_ast) >= 3:
            # check if 2 or more players in top 20% of shooting percentage
            players_ts = [player for player in features if float(player['ts_pct']) > top_20_ts]
            if len(players_ts) >= 2:
                # check if any player is in the top 10% for rebounds
                players_reb = [player for player in features if float(player['reb']) > 1]
                if len(players_reb) >=5:
                    # check if any player on team has dreb pct > 0.2
                    players_dreb = [player for player in features if float(player['dreb_pct'])>0.2]
                    if len(players_dreb) >=2:
                        # check if 3 or more players in top 20% of net rating
                        players_rating = [player for player in features if float(player['rating']) > top_20_rating]
                        if len(players_rating) >= 3:
                            # Optimal Team Found
                            label = 1
        y.append(label)                   
    X = np.array(X)
    y = np.array(y)
    
    # output shapes
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape}")
    
    # Output number of y = 1 occurances and output
    count_ones = np.sum(y == 1)
    print(f"Number of 1's in y: {count_ones}")

    return X, y




## Train Model
Now that we have the training data, we can train the model.

In [84]:
X_train, y_train = simulate_data(training_data)

# Extract numerical features from dictionaries
X_train = np.array([
    [
        float(player['ts_pct']),
        float(player['reb']),
        float(player['dreb_pct']),
        float(player['rating']),
        float(player['ast'])
    ]
    for team in X_train
    for player in team
]).reshape(len(X_train), -1)

# Build the model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)
# TO-DO: SPLIT DATA FOR TESTING?

Shape of X: (10000, 5)
Shape of y: (10000,)
Number of 1's in y: 17
Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 927us/step - accuracy: 0.9918 - loss: 0.0387
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 767us/step - accuracy: 0.9985 - loss: 0.0101
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 764us/step - accuracy: 0.9981 - loss: 0.0116
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 799us/step - accuracy: 0.9983 - loss: 0.0100
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 759us/step - accuracy: 0.9979 - loss: 0.0097
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 754us/step - accuracy: 0.9979 - loss: 0.0082
Epoch 7/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 753us/step - accuracy: 0.9988 - loss: 0.0056
Epoch 8/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75

<keras.src.callbacks.history.History at 0x26363fc3950>

TO DO:

Explain your architecture and how the basketball player characteristics are used as inputs:

Data Splitting for Testing:
Split the data and try to predict the testing data



Only 100 ish positives out of the sample with 10000 when simulating data
Maybe change requirements for optimal team so one can be more commom ?

Double check algorithm and outputs

Expand documentation

Interpret the output of your MLP in the context of selecting an optimal basketball team:

