<a href="https://colab.research.google.com/github/d-jenkins/NBA_Champs/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# import all libraries needed
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import requests
import json


In [28]:
# list holding abbviations for all nba teams to create urls
teams = ['ATL', 'NJN', 'BOS', 'CHA', 'CHI', 
         'CLE', 'DAL', 'DEN', 'DET', 'GSW', 
         'HOU', 'IND', 'LAC', 'LAL', 'MEM', 
         'MIA', 'MIL', 'MIN', 'NOH', 'NYK', 
         'OKC', 'ORL', 'PHI', 'PHO', 'POR', 
         'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

# array to hold all tables for all teams for all seasons
every_season = []

# iterate through list of all team abbreviations
for team in teams:

  print(team)

  # create url to scrape for team
  url = f'https://www.basketball-reference.com/teams/{team}'

  # scrape teams stats from their bball reference page
  stats = pd.read_html(url)[0]

  # select only the desired columns 
  stats = stats[["Team", "Season", "Rel Pace", "Rel ORtg", "Rel DRtg", "Playoffs"]]

  # select only seasons since 80s except for 2020-21 and reset index
  stats = stats.iloc[1:42, :].reset_index(drop=True)

  # remove asterisk from team name
  def rename(team): return(team.replace("*", ""))
  stats["Team"] = stats["Team"].apply(rename)


  categories = ['Age', 'Ht.', 'Wt.', 'FGA', 
                'FG%', '3PA', '3P%', '2PA', 
                '2P%', 'FTA', 'FT%', 'ORB', 
                'DRB', 'AST', 'STL', 'BLK', 
                'TOV', 'PF', 'PTS']



  url = f'https://www.basketball-reference.com/teams/{team}/stats_basic_totals.html'

  playoffs = {np.nan: 0, 
              'Lost E. Conf. 1st Rnd.': 1,
              'Lost W. Conf. 1st Rnd.': 1,
              'Won E. Conf. 1st Rnd.' : 2,
              'Won W. Conf. 1st Rnd.' : 2,
              'Lost Quarterfinals': 2,
              'Lost E. Conf. Semis': 2,
              'Lost E. Div. Semis': 2,
              'Lost W. Conf. Semis': 2,  
              'Lost W. Div. Semis': 2, 
              'Lost E. Conf. Finals': 3,
              'Lost E. Div. Finals': 3,
              'Lost W. Conf. Finals': 3,
              'Lost W. Div. Finals': 3, 
              'Lost Finals': 4, 
              'Won Finals': 5}

  stats["Playoffs"] = stats["Playoffs"].map(playoffs)

  more_stats = pd.read_html(url)[0][categories]

  # select only seasons since 80s except for 2020-21
  more_stats = more_stats.iloc[1:43, :]

  # remove dumb rows that restate stat categories and set index
  more_stats = more_stats.loc[more_stats['PTS'] != 'PTS', :].reset_index(drop=True)

  more_stats['Ht.'] = more_stats['Ht.'].apply(lambda h: round(int(h.replace('6-', ''))/12 + 6, 3))

  stats = pd.merge(stats, more_stats, left_index=True, right_index=True)


  # add table of teams stats to a list of tables holding all teams stats
  every_season.append(stats)

# combine all teams stats into one dataframe
all_stats = pd.concat(every_season).reset_index(drop=True)



ATL
NJN
BOS
CHA
CHI
CLE
DAL
DEN
DET
GSW
HOU
IND
LAC
LAL
MEM
MIA
MIL
MIN
NOH
NYK
OKC
ORL
PHI
PHO
POR
SAC
SAS
TOR
UTA
WAS


In [60]:
seasons = all_stats["Season"].unique()

all_rel_stats = all_stats.copy()
all_rel_stats.set_index('Season', inplace=True)

for season in seasons:
  for category in categories:
    avg = round(pd.to_numeric(all_stats.loc[all_stats["Season"] == season, :][category]).mean(), 3)
    all_rel_stats.loc[season, category] = all_rel_stats.loc[season, category].apply(lambda x: float(x)- avg)
    # print(f'{season} {category}: {avg}')

all_rel_stats



Unnamed: 0_level_0,Team,Rel Pace,Rel ORtg,Rel DRtg,Playoffs,Age,Ht.,Wt.,FGA,FG%,3PA,3P%,2PA,2P%,FTA,FT%,ORB,DRB,AST,STL,BLK,TOV,PF,PTS
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2019-20,Atlanta Hawks,2.7,-3.4,4.2,0,-1.92,0.017,-1.733,-203.533,-0.011,7.6,-0.025,-211.133,0.001,-65.433,0.018,-50.333,-216.9,-116.633,-17,-4.933,59.267,81.2,-404.933
2018-19,Atlanta Hawks,3.9,-2.3,3.5,0,-1.177,0.036,-2.733,208.733,-0.009,409.267,-0.003,-200.533,-0.002,26.067,-0.015,106.533,-30.1,101.9,49.033,12.833,242.2,217.833,174.9
2017-18,Atlanta Hawks,1.0,-3.6,2.0,0,-1.047,-0.041,-6.2,-41.967,-0.014,166,-0.002,-207.967,-0.016,-123.5,0.018,-53.367,-79.033,40.567,5.233,-46.867,106.267,-21.9,-244.333
2016-17,Atlanta Hawks,1.0,-3.9,-3.1,1,1.313,-0.044,-0.8,-85.833,-0.006,-77.067,-0.016,-8.767,-0.003,143.833,-0.044,10.767,56.033,82.667,40.333,8.033,150,-140.667,-199.433
2015-16,Atlanta Hawks,1.3,-1.3,-5.0,2,1.563,-0.044,-3.333,-11.967,0.006,351.3,-0.003,-363.267,0.02,-277.633,0.025,-175.133,37.967,272.267,103.567,79.567,46.533,-91.8,13.933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1984-85,Washington Bullets,-1.8,-3.6,-3.3,1,0.652,-0.029,3.043,76.565,-0.012,140.739,0,-64.174,-0.009,-418.87,-0.02,-161.87,3.304,-64.826,8.043,-42.609,-182.957,-176.043,-433.739
1983-84,Washington Bullets,-4.0,-3.4,-0.4,1,-0.183,-0.029,9.783,-338.13,-0.008,87.043,0.01,-425.174,-0.005,-233.435,-0.003,-137,22.739,44.304,-141.435,-114.957,-19.609,-122.87,-606.043
1982-83,Washington Bullets,-3.7,-5.6,-5.4,0,-0.3,-0.022,10.043,-293.087,-0.017,52.304,0.064,-345.391,-0.018,-259.957,-0.034,-117.087,0.304,-78.435,3.522,-59.391,21.13,-145.174,-764.174
1981-82,Washington Bullets,-1.5,-3.6,-4.4,2,-0.161,-0.011,7.304,-67.565,-0.017,48.696,-0.003,-116.261,-0.015,-238,0.027,-128.609,193.957,-79.522,-57.304,-43.478,-63.696,-73.739,-418.304


In [None]:
# select/adjust data to train ml model
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

X = all_stats.drop(["Team", "Season", "Chip?"], axis=1)
y = all_stats["Chip?"].values.reshape(-1, 1)

# label-encode y data
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)

# One-hot encodde y data
y = to_categorical(encoded_y)

y


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [None]:
# split data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [None]:
# train random forest classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))


0.9558823529411765


In [None]:
# weighted importance of each stat towards winning a basketball game
sorted(zip(bf.feature_importances_, X.columns), reverse=True)

[(0.45078943560245877, 'Rel ORtg'),
 (0.4172580486702902, 'Rel DRtg'),
 (0.13195251572725109, 'Rel Pace')]

In [None]:
# create array to hold binary model predictions
predictions = []

# convert hot encoded preditions back to binary values
for pred in rf.predict(X):
  if pred[1] == 0.:
    predictions.append(0)
  else:
    predictions.append(1)

# create new dataframe to hold stats with predictions
champs = all_stats
champs["Predicted"] = predictions

# show dataframe with all stats and championship predictions
champs

Unnamed: 0,Team,Season,Rel Pace,Rel ORtg,Rel DRtg,Chip?,Predicted
0,Atlanta Hawks,2020-21,-1.6,3.4,1.0,0,0
1,Atlanta Hawks,2019-20,2.7,-3.4,4.2,0,0
2,Atlanta Hawks,2018-19,3.9,-2.3,3.5,0,0
3,Atlanta Hawks,2017-18,1.0,-3.6,2.0,0,0
4,Atlanta Hawks,2016-17,1.0,-3.9,-3.1,0,0
...,...,...,...,...,...,...,...
46,Washington Bullets,1974-75,1.6,0.3,-6.4,0,0
47,Capital Bullets,1973-74,-1.5,-2.1,-3.5,0,0
48,Baltimore Bullets,1972-73,-2.0,-0.5,-3.5,0,0
49,Baltimore Bullets,1971-72,-1.5,-1.4,-0.3,0,0


In [None]:
# create lists to hold overachieving and disappointing teams
overachievers = []
disappointments = []

# iterate through all seasons for all teams
for i in range(0, len(champs["Season"])):

  # if model said a team wouldnt win the chip but they do then add them to overachievers
  if (champs["Chip?"].values[i] == 1) and (champs["Predicted"].values[i] == 0):
    overachievers.append(f'{champs["Season"].values[i]} {champs["Team"].values[i]}')

  # if model said a team would win the chip but the don't then add them to disappointments
  elif (champs["Chip?"].values[i] == 0) and (champs["Predicted"].values[i] == 1):
    disappointments.append(f'{champs["Season"].values[i]} {champs["Team"].values[i]}')

# show both lists
print(overachievers)
print(disappointments)


['1975-76 New York Nets', '2007-08 Boston Celtics', '1975-76 Boston Celtics', '2015-16 Cleveland Cavaliers', '2010-11 Dallas Mavericks', '1988-89 Detroit Pistons', '2016-17 Golden State Warriors', '1987-88 Los Angeles Lakers', '1972-73 New York Knicks', '1978-79 Seattle SuperSonics', '2004-05 San Antonio Spurs', '1998-99 San Antonio Spurs', '2018-19 Toronto Raptors', '1977-78 Washington Bullets']
['1971-72 Milwaukee Bucks']


In [None]:
type(0)

int

In [None]:
# # tune hyperparameters
# from sklearn.model_selection import GridSearchCV

# # create new model whose hyperparamaters are to be tuned
# forest = RandomForestClassifier(random_state=42)

# # store some tuning options in a dictionary
# hyper = {'n_estimators': [10, 25, 50, 100, 200], 
#          'max_depth': [3, 5, 8, 15]}

# # create GridSearch model
# grid = GridSearchCV(forest, hyper, cv = 3, verbose = 1, n_jobs = -1)

# # Train the model with GridSearch
# fitted = grid.fit(X_train_scaled, y_train)

# # display best parameters and the score they get
# print(fitted.best_params_)
# print(fitted.best_score_)


In [None]:
# best_forest = RandomForestClassifier(max_depth=3, n_estimators=10, random_state=42)
# bf = best_forest.fit(X_train_scaled, y_train)
# # **(why is score different than the "fitted" score in the cell above)
# print(bf.score(X_test_scaled, y_test))


In [None]:
#import pandas as pd
#import numpy as np
#import matplotlib.pyplot as plt

#from matplotlib import style
#style.use("ggplot")
#from matplotlib import rcParams
#rcParams['figure.figsize'] = 10, 8

In [None]:
#df = pd.dataFrame(os.path.join("..", "Chip?", "Season", "Team"))
#df.head()

In [None]:
#target = df["Chip?"]
#target_names = ["Season", "team"]

In [None]:
#data = df.drop("Chip?", axis=1)
#feature_names = data.columns
#data.head()

In [None]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
#from sklearn.svm import SVC 
#model = SVC(kernel='linear')
#model.fit(X_train, y_train)

In [None]:
#print('Test Acc: %.3f' % model.score(X_test, y_test))

In [None]:
#from sklearn.metrics import classification_report
#predictions = model.predict(X_test)
#print(classification_report(y_test, predictions,
                            #target_names=target_names))

**ETL**

In [None]:
import os
# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-3.1.1'

os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:7 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:10 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [61.8 kB]
Get:11 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:13 htt

In [None]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2021-06-10 00:15:45--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2021-06-10 00:15:45 (4.33 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [None]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NbaChamps").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

In [None]:
all_stats.dtypes

Team         object
Season       object
Rel Pace    float64
Rel ORtg    float64
Rel DRtg    float64
Chip?        object
dtype: object

In [None]:
# Rename Columns for postgres consumption
all_stats_copy = all_stats.copy()
all_stats_py = all_stats_copy.rename(columns={"Rel Pace": "Rel_Pace", "Rel ORtg": "Rel_ORtg", "Rel DRtg": "Rel_DRtg", "Chip?": "Chip"})

# Convert Pandas df to Pyspark df
all_stats_py = spark.createDataFrame(all_stats_py)
all_stats_py.printSchema()
# all_stats_py

root
 |-- Team: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Rel_Pace: double (nullable = true)
 |-- Rel_ORtg: double (nullable = true)
 |-- Rel_DRtg: double (nullable = true)
 |-- Chip: long (nullable = true)



In [None]:
# Configure settings for RDS
mode = "append"
jdbc_url="jdbc:postgresql://nba-champs.c6ka6apltccn.us-east-2.rds.amazonaws.com:5432/nbaChamps"
config = {"user":"postgres",
          "password": "MildredChase84!",
          "driver":"org.postgresql.Driver"}

In [None]:
# Write DataFrame to Nba champs table in RDS

all_stats_py.write.jdbc(url=jdbc_url, table='all_stats', mode=mode, properties=config)