<a href="https://colab.research.google.com/github/d-jenkins/NBA_Champs/blob/dj_branch/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import all libraries needed
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import requests
import json


In [None]:
# Franchise abbreviations used to build each basketball-reference team URL
teams = ['ATL', 'NJN', 'BOS', 'CHA', 'CHI', 
         'CLE', 'DAL', 'DEN', 'DET', 'GSW', 
         'HOU', 'IND', 'LAC', 'LAL', 'MEM', 
         'MIA', 'MIL', 'MIN', 'NOH', 'NYK', 
         'OKC', 'ORL', 'PHI', 'PHO', 'POR', 
         'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

# Columns kept from each franchise history table
STAT_COLUMNS = ["Team", "Season", "Rel Pace", "Rel ORtg", "Rel DRtg", "Playoffs"]

# Number of most-recent rows kept per franchise (rows are listed newest first,
# so this limits the data to seasons since the early 70s)
MAX_SEASONS = 51


def clean_team_stats(raw_stats):
  """Return the modelling columns for one franchise, cleaned and trimmed.

  Parameters
  ----------
  raw_stats : pandas.DataFrame
      Franchise history table containing at least the STAT_COLUMNS columns.

  Returns
  -------
  pandas.DataFrame
      At most MAX_SEASONS rows with the asterisk stripped from "Team" and the
      "Playoffs" text replaced by an integer "Chip?" column (1 when the value
      is exactly "Won Finals", otherwise 0, including missing values).
  """
  # Work on an explicit copy: assigning into a slice of the scraped table via
  # chained indexing (stats["Team"][i] = ...) raises SettingWithCopyWarning and
  # is not guaranteed to modify the frame.
  stats = raw_stats[STAT_COLUMNS].iloc[:MAX_SEASONS].copy()

  # remove asterisk from team names (literal match, not a regex)
  stats["Team"] = stats["Team"].str.replace("*", "", regex=False)

  # change playoff status to a binary flag: did the team win the title?
  stats["Playoffs"] = stats["Playoffs"].eq("Won Finals").astype(int)

  # rename "Playoffs" column to "Chip?"
  return stats.rename(columns={"Playoffs": "Chip?"})


# list holding the cleaned table of every team, one entry per franchise
every_season = []

# iterate through list of all team abbreviations
for team in teams:

  # create url to scrape for team
  url = f'https://www.basketball-reference.com/teams/{team}'

  # the first table on the franchise page holds the season-by-season stats
  raw_stats = pd.read_html(url)[0]

  # add table of team stats to the list holding all teams' stats
  every_season.append(clean_team_stats(raw_stats))

# combine all teams' stats into one dataframe (per-team row labels are kept)
all_stats = pd.concat(every_season)


In [None]:
# Display the combined table of every team's season stats.
# Row labels restart for each team because the per-team frames were
# concatenated without resetting the index.
all_stats


Unnamed: 0,Team,Season,Rel Pace,Rel ORtg,Rel DRtg,Chip?
0,Atlanta Hawks,2020-21,-1.6,3.4,1.0,0
1,Atlanta Hawks,2019-20,2.7,-3.4,4.2,0
2,Atlanta Hawks,2018-19,3.9,-2.3,3.5,0
3,Atlanta Hawks,2017-18,1.0,-3.6,2.0,0
4,Atlanta Hawks,2016-17,1.0,-3.9,-3.1,0
...,...,...,...,...,...,...
46,Washington Bullets,1974-75,1.6,0.3,-6.4,0
47,Capital Bullets,1973-74,-1.5,-2.1,-3.5,0
48,Baltimore Bullets,1972-73,-2.0,-0.5,-3.5,0
49,Baltimore Bullets,1971-72,-1.5,-1.4,-0.3,0


In [None]:
# select/adjust data to train ml model
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# features: every column except the identifiers and the target
X = all_stats.drop(["Team", "Season", "Chip?"], axis=1)

# target as a flat 1-D integer array; passing a column vector
# (reshape(-1, 1)) to LabelEncoder is what produced the column_or_1d warnings
chip_labels = all_stats["Chip?"].astype(int).to_numpy()

# label-encode y data
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(chip_labels)

# One-hot encode y data
# NOTE(review): scikit-learn classifiers also accept the 1-D encoded labels
# directly; the one-hot form is kept so downstream cells behave as before.
y = to_categorical(encoded_y)

y


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [None]:
# Hold out part of the data for evaluation; the fixed seed keeps the split
# reproducible between runs.
from sklearn.model_selection import train_test_split

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)


In [None]:
# Fit a 200-tree random forest on the training split and report its
# score on the held-out split.
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and training can be chained
rf = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_train, y_train)
print(rf.score(X_test, y_test))


0.9558823529411765


In [None]:
# weighted importance of each stat towards winning a championship,
# largest first. Uses `rf`, the forest trained in the previous cell; the
# tuned model `bf` is only created further down the notebook, so referencing
# it here fails on a fresh kernel.
sorted(zip(rf.feature_importances_, X.columns), reverse=True)

[(0.45078943560245877, 'Rel ORtg'),
 (0.4172580486702902, 'Rel DRtg'),
 (0.13195251572725109, 'Rel Pace')]

In [None]:
# tune hyperparameters
from sklearn.model_selection import GridSearchCV

# create new model whose hyperparameters are to be tuned
forest = RandomForestClassifier(random_state=42)

# store some tuning options in a dictionary
hyper = {'n_estimators': [10, 25, 50, 100, 200], 
         'max_depth': [3, 5, 8, 15]}

# create GridSearch model (3-fold cross-validation, all available cores)
grid = GridSearchCV(forest, hyper, cv = 3, verbose = 1, n_jobs = -1)

# Train the model with GridSearch.
# The unscaled training split is used: no scaled copy is ever created in this
# notebook, and tree-based models do not require feature scaling.
fitted = grid.fit(X_train, y_train)

# display best parameters and the mean cross-validated score they get
print(fitted.best_params_)
print(fitted.best_score_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    6.1s


{'max_depth': 3, 'n_estimators': 10}
0.9646480421076985


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    8.1s finished


In [None]:
# Refit a forest with the hyperparameters selected by the grid search above
# and evaluate it on the held-out test split. The unscaled splits are used
# because no scaled copies are defined anywhere in this notebook.
best_forest = RandomForestClassifier(max_depth=3, n_estimators=10, random_state=42)
bf = best_forest.fit(X_train, y_train)
# This score differs from the grid search's best_score_ because best_score_
# is the mean cross-validated score over folds of the training data, whereas
# this is the score on the separate test split.
print(bf.score(X_test, y_test))


0.95


**ETL**

In [None]:
import os
# Spark release to install; the string is used both in the download URL and
# in the name of the unpacked directory.
# Pick a release listed at http://www-us.apache.org/dist/spark/
# NOTE(review): that mirror may only carry current releases; older ones are
# typically kept at archive.apache.org/dist/spark - confirm before relying on it.
spark_version = 'spark-3.1.1'

# Export the version so the shell commands below can read $SPARK_VERSION
os.environ['SPARK_VERSION']=spark_version

# Install Java, download and unpack Spark, and install findspark
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Point Java and Spark at the locations installed above
# (/content is the working directory assumed for the unpacked archive)
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Initialise findspark so pyspark can be imported; the SparkSession itself is
# created in the following cell
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [1 InRelease 0 B/88.7 kB 0% [Waiting for headers] [Connecting to cloud.r-project.org] [Waiting for heade0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Connecting to cloud.r-proj                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Connecting to cloud.r-proj                                                                               Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [1 InRelease gpgv 88.7 kB] [Connecting to cloud.r-project.org] [Waiting for                                                                                Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
0% [1 InRelease gpg

In [None]:
# Create the Spark session for the ETL steps, or reuse one that already exists
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("NbaChamps")
    .getOrCreate()
)