<a href="https://colab.research.google.com/github/d-jenkins/NBA_Champs/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
# import all libraries needed
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import requests
import json


In [None]:
# list holding abbviations for all nba teams to create urls
teams = ['ATL', 'NJN', 'BOS', 'CHA', 'CHI', 
         'CLE', 'DAL', 'DEN', 'DET', 'GSW', 
         'HOU', 'IND', 'LAC', 'LAL', 'MEM', 
         'MIA', 'MIL', 'MIN', 'NOH', 'NYK', 
         'OKC', 'ORL', 'PHI', 'PHO', 'POR', 
         'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

# array to hold all tables for all teams for all seasons
every_season = []

# iterate through list of all team abbreviations
for team in teams:

  # create url to scrape for team
  url = f'https://www.basketball-reference.com/teams/{team}'

  # scrape team's stats from their bball reference page
  stats = pd.read_html(url)[0]

  # select only the desired columns 
  stats = stats[["Team", "Season", "Rel Pace", "Rel ORtg", "Rel DRtg", "Playoffs"]]

  # numbers that correspond to playoff results
  playoffs = {np.nan: 0, 
              'Lost E. Conf. 1st Rnd.': 1,
              'Lost W. Conf. 1st Rnd.': 1,
              'Won E. Conf. 1st Rnd.' : 2,
              'Won W. Conf. 1st Rnd.' : 2,
              'Lost Quarterfinals': 2,
              'Lost E. Conf. Semis': 2,
              'Lost E. Div. Semis': 2,
              'Lost W. Conf. Semis': 2,  
              'Lost W. Div. Semis': 2, 
              'Lost E. Conf. Finals': 3,
              'Lost E. Div. Finals': 3,
              'Lost W. Conf. Finals': 3,
              'Lost W. Div. Finals': 3, 
              'Lost Finals': 4, 
              'Won Finals': 5}

  # covert playoff results to numerical values
  stats["Playoffs"] = stats["Playoffs"].map(playoffs)

  # select only seasons since 80s except for 2020-21 and reset index
  stats = stats.iloc[1:42, :].reset_index(drop=True)

  # remove asterisk from team name
  def rename(team): return(team.replace("*", ""))
  stats["Team"] = stats["Team"].apply(rename)



  # create url to scrape other table for team
  url = f'https://www.basketball-reference.com/teams/{team}/stats_basic_totals.html'

  # desired statistical categories
  categories = ['Age', 'Ht.', 'Wt.', 'FGA', 
                'FG%', '3PA', '3P%', '2PA', 
                '2P%', 'FTA', 'FT%', 'ORB', 
                'DRB', 'AST', 'STL', 'BLK', 
                'TOV', 'PF', 'PTS']
  
  # scrape more of team's stats from their bball reference page
  more_stats = pd.read_html(url)[0][categories]

  # select only seasons since 80s except for 2020-21
  more_stats = more_stats.iloc[1:43, :]

  # remove dumb rows that restate stat categories and set index
  more_stats = more_stats.loc[more_stats['PTS'] != 'PTS', :].reset_index(drop=True)

  # convert heights to numerical values
  more_stats['Ht.'] = more_stats['Ht.'].apply(lambda h: round(int(h.replace('6-', ''))/12 + 6, 3))



  # merge both stat dataframes
  stats = pd.merge(stats, more_stats, left_index=True, right_index=True)

  # add table of team's stats to a list of tables holding all teams' stats
  every_season.append(stats)



# combine all teams stats into one dataframe
all_stats = pd.concat(every_season).reset_index(drop=True)



In [None]:
# create list to hold all seasons
seasons = all_stats["Season"].unique()

# create new data frame to hold all stats relative the the average of that season
all_rel_stats = all_stats.copy()
all_rel_stats.set_index('Season', inplace=True)

# calculate relative values for all stats
for season in seasons:
  for category in categories:
    avg = round(pd.to_numeric(all_stats.loc[all_stats["Season"] == season, :][category]).mean(), 3)
    all_rel_stats.loc[season, category] = all_rel_stats.loc[season, category].apply(lambda x: float(x)- avg)
    # print(f'{season} {category}: {avg}')

# show resulting dataframe
all_rel_stats


Unnamed: 0_level_0,Team,Rel Pace,Rel ORtg,Rel DRtg,Playoffs,Age,Ht.,Wt.,FGA,FG%,3PA,3P%,2PA,2P%,FTA,FT%,ORB,DRB,AST,STL,BLK,TOV,PF,PTS
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2019-20,Atlanta Hawks,2.7,-3.4,4.2,0,-1.92,0.017,-1.733,-203.533,-0.011,7.6,-0.025,-211.133,0.001,-65.433,0.018,-50.333,-216.9,-116.633,-17,-4.933,59.267,81.2,-404.933
2018-19,Atlanta Hawks,3.9,-2.3,3.5,0,-1.177,0.036,-2.733,208.733,-0.009,409.267,-0.003,-200.533,-0.002,26.067,-0.015,106.533,-30.1,101.9,49.033,12.833,242.2,217.833,174.9
2017-18,Atlanta Hawks,1.0,-3.6,2.0,0,-1.047,-0.041,-6.2,-41.967,-0.014,166,-0.002,-207.967,-0.016,-123.5,0.018,-53.367,-79.033,40.567,5.233,-46.867,106.267,-21.9,-244.333
2016-17,Atlanta Hawks,1.0,-3.9,-3.1,1,1.313,-0.044,-0.8,-85.833,-0.006,-77.067,-0.016,-8.767,-0.003,143.833,-0.044,10.767,56.033,82.667,40.333,8.033,150,-140.667,-199.433
2015-16,Atlanta Hawks,1.3,-1.3,-5.0,2,1.563,-0.044,-3.333,-11.967,0.006,351.3,-0.003,-363.267,0.02,-277.633,0.025,-175.133,37.967,272.267,103.567,79.567,46.533,-91.8,13.933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1984-85,Washington Bullets,-1.8,-3.6,-3.3,1,0.652,-0.029,3.043,76.565,-0.012,140.739,0,-64.174,-0.009,-418.87,-0.02,-161.87,3.304,-64.826,8.043,-42.609,-182.957,-176.043,-433.739
1983-84,Washington Bullets,-4.0,-3.4,-0.4,1,-0.183,-0.029,9.783,-338.13,-0.008,87.043,0.01,-425.174,-0.005,-233.435,-0.003,-137,22.739,44.304,-141.435,-114.957,-19.609,-122.87,-606.043
1982-83,Washington Bullets,-3.7,-5.6,-5.4,0,-0.3,-0.022,10.043,-293.087,-0.017,52.304,0.064,-345.391,-0.018,-259.957,-0.034,-117.087,0.304,-78.435,3.522,-59.391,21.13,-145.174,-764.174
1981-82,Washington Bullets,-1.5,-3.6,-4.4,2,-0.161,-0.011,7.304,-67.565,-0.017,48.696,-0.003,-116.261,-0.015,-238,0.027,-128.609,193.957,-79.522,-57.304,-43.478,-63.696,-73.739,-418.304


In [33]:
# select/adjust data to train ml model
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# uncomment below to use only data for playoff teams
rel_playoffs = all_rel_stats.loc[all_rel_stats['Playoffs'] > 0, :]
X = rel_playoffs.drop(["Team", "Playoffs"], axis=1)
y = rel_playoffs["Playoffs"].values.reshape(-1, 1)

# uncomment to use all data
# X = all_rel_stats.drop(["Team", "Playoffs"], axis=1)
# y = all_rel_stats["Playoffs"].values.reshape(-1, 1)

# uncomment to one hot encode data
# # label-encode y data
# label_encoder = LabelEncoder()
# label_encoder.fit(y)
# encoded_y = label_encoder.transform(y)
# # One-hot encode y data
# y = to_categorical(encoded_y)






In [34]:
# split data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [45]:
from sklearn.svm import SVC

# train model
clf = SVC()
clf.fit(X_train, y_train)

# test model
this = clf.predict(X)
clf.score(X_test, y_test)


1
2
3
5


  y = column_or_1d(y, warn=True)


0.4585987261146497

In [41]:
# create dataframe to only hold data for playoff teams
champs = all_rel_stats.copy().loc[all_rel_stats['Playoffs'] > 0, :]

# add column to hold playoff success predicted by model
champs['Predicted'] = this

# add column with how wrong the models prediction was
champs['Error'] = champs['Playoffs'].values - champs['Predicted'].values


Unnamed: 0_level_0,Team,Predicted,Playoffs,Error
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-19,Toronto Raptors,1,5,4
1980-81,Boston Celtics,1,5,4
2000-01,Los Angeles Lakers,1,5,4
2001-02,Los Angeles Lakers,1,5,4
2008-09,Los Angeles Lakers,1,5,4
2009-10,Los Angeles Lakers,1,5,4
2003-04,Detroit Pistons,1,5,4
2012-13,Miami Heat,1,5,4
2011-12,Miami Heat,1,5,4
2010-11,Dallas Mavericks,1,5,4


In [None]:
# most overachieving teams
champs.sort_values('Error', ascending=False)[['Team', 'Predicted', 'Playoffs', 'Error']].head(20)


In [42]:
# most disappointing teams
champs.sort_values('Error')[['Team', 'Predicted', 'Playoffs', 'Error']].head(20)


Unnamed: 0_level_0,Team,Predicted,Playoffs,Error
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-08,Phoenix Suns,5,1,-4
1985-86,Los Angeles Lakers,5,3,-2
1982-83,Los Angeles Lakers,5,4,-1
1983-84,Los Angeles Lakers,5,4,-1
2012-13,San Antonio Spurs,5,4,-1
2018-19,Golden State Warriors,5,4,-1
2000-01,Indiana Pacers,1,1,0
2001-02,Indiana Pacers,1,1,0
2002-03,Indiana Pacers,1,1,0
1998-99,Phoenix Suns,1,1,0


In [46]:
# Correctly predicted
champs.loc[champs['Error'] == 0, :].sort_values('Playoffs', ascending=False)[['Team', 'Predicted', 'Playoffs', 'Error']].head(20)


Unnamed: 0_level_0,Team,Predicted,Playoffs,Error
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996-97,Chicago Bulls,5,5,0
2017-18,Golden State Warriors,5,5,0
2016-17,Golden State Warriors,5,5,0
1986-87,Los Angeles Lakers,5,5,0
1984-85,Los Angeles Lakers,5,5,0
1985-86,Boston Celtics,5,5,0
2013-14,San Antonio Spurs,5,5,0
2018-19,Milwaukee Bucks,3,3,0
1987-88,Boston Celtics,3,3,0
1994-95,Phoenix Suns,2,2,0


In [60]:
# list holding abbviations for all nba teams to create urls
teams = ['ATL', 'NJN', 'BOS', 'CHA', 'CHI', 
         'CLE', 'DAL', 'DEN', 'DET', 'GSW', 
         'HOU', 'IND', 'LAC', 'LAL', 'MEM', 
         'MIA', 'MIL', 'MIN', 'NOH', 'NYK', 
         'OKC', 'ORL', 'PHI', 'PHO', 'POR', 
         'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

# array to hold all tables for all teams for all seasons
every_season = []

# iterate through list of all team abbreviations
for team in teams:

  # create url to scrape for team
  url = f'https://www.basketball-reference.com/teams/{team}'

  # scrape team's stats from their bball reference page
  stats = pd.read_html(url)[0]

  # select only the desired columns 
  stats = stats[["Team", "Season", "Rel Pace", "Rel ORtg", "Rel DRtg", "Playoffs"]]

  # numbers that correspond to playoff results
  playoffs = {np.nan: 0, 
              'Lost E. Conf. 1st Rnd.': 1,
              'Lost W. Conf. 1st Rnd.': 1,
              'Won E. Conf. 1st Rnd.' : 2,
              'Won W. Conf. 1st Rnd.' : 2,
              'Lost Quarterfinals': 2,
              'Lost E. Conf. Semis': 2,
              'Lost E. Div. Semis': 2,
              'Lost W. Conf. Semis': 2,  
              'Lost W. Div. Semis': 2, 
              'Lost E. Conf. Finals': 3,
              'Lost E. Div. Finals': 3,
              'Lost W. Conf. Finals': 3,
              'Lost W. Div. Finals': 3, 
              'Lost Finals': 4, 
              'Won Finals': 5}

  # covert playoff results to numerical values
  stats["Playoffs"] = stats["Playoffs"].map(playoffs)

  # select only seasons since 80s except for 2020-21 and reset index
  stats = stats.iloc[0:1, :].reset_index(drop=True)

  # remove asterisk from team name
  def rename(team): return(team.replace("*", ""))
  stats["Team"] = stats["Team"].apply(rename)



  # create url to scrape other table for team
  url = f'https://www.basketball-reference.com/teams/{team}/stats_basic_totals.html'

  # desired statistical categories
  categories = ['Age', 'Ht.', 'Wt.', 'FGA', 
                'FG%', '3PA', '3P%', '2PA', 
                '2P%', 'FTA', 'FT%', 'ORB', 
                'DRB', 'AST', 'STL', 'BLK', 
                'TOV', 'PF', 'PTS']
  
  # scrape more of team's stats from their bball reference page
  more_stats = pd.read_html(url)[0][categories]

  # select only seasons since 80s except for 2020-21
  more_stats = more_stats.iloc[0:1, :]

  # remove dumb rows that restate stat categories and set index
  more_stats = more_stats.loc[more_stats['PTS'] != 'PTS', :].reset_index(drop=True)

  # convert heights to numerical values
  more_stats['Ht.'] = more_stats['Ht.'].apply(lambda h: round(int(h.replace('6-', ''))/12 + 6, 3))



  # merge both stat dataframes
  stats = pd.merge(stats, more_stats, left_index=True, right_index=True)

  # add table of team's stats to a list of tables holding all teams' stats
  every_season.append(stats)



# combine all teams stats into one dataframe
this_year = pd.concat(every_season).reset_index(drop=True)




# create list to hold all seasons
seasons = this_year["Season"].unique()

# create new data frame to hold all stats relative the the average of that season
this_year_rel = this_year.copy()
this_year_rel.set_index('Season', inplace=True)

# calculate relative values for all stats

for category in categories:
  avg = round(pd.to_numeric(this_year.loc[this_year["Season"] == '2020-21', :][category]).mean(), 3)
  this_year_rel.loc['2020-21', category] = this_year_rel.loc['2020-21', category].apply(lambda x: float(x)- avg)
  # print(f'{season} {category}: {avg}')

# show resulting dataframe
this_year_rel



Unnamed: 0_level_0,Team,Rel Pace,Rel ORtg,Rel DRtg,Playoffs,Age,Ht.,Wt.,FGA,FG%,3PA,3P%,2PA,2P%,FTA,FT%,ORB,DRB,AST,STL,BLK,TOV,PF,PTS
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2020-21,Atlanta Hawks,-1.6,3.4,1.0,2.0,-0.693,0.011,-3.0,-85.1,0.002,-92.067,0.007,6.967,-0.005,173.833,0.033,52.267,43.2,-48.9,-42.2,-8.833,-43.233,3.033,115.433
2020-21,Brooklyn Nets,0.3,6.0,1.5,2.0,2.107,0.011,2.0,-77.1,0.028,105.933,0.026,-183.033,0.034,51.833,0.025,-67.733,77.2,143.1,-61.2,28.167,-21.233,-17.967,466.433
2020-21,Boston Celtics,-0.9,1.7,0.2,1.0,-0.993,-0.072,3.0,34.9,0.0,123.933,0.008,-89.033,-0.001,-75.167,-0.004,57.267,-60.8,-96.9,10.8,32.167,15.767,82.033,38.433
2020-21,Charlotte Hornets,-0.9,-1.4,0.5,0.0,-1.493,-0.072,-4.0,-42.1,-0.011,171.933,0.003,-214.033,-0.014,-66.167,-0.018,54.267,-92.8,147.1,19.8,-6.833,72.767,-90.967,-189.567
2020-21,Chicago Bulls,-0.2,-1.2,-0.3,0.0,-0.493,0.094,1.0,13.9,0.01,-48.067,0.004,61.967,0.011,-313.167,0.012,-14.733,62.2,141.1,-63.2,-46.833,92.767,-26.967,-101.567
2020-21,Cleveland Cavaliers,-1.9,-6.5,2.1,0.0,-2.093,0.011,6.0,-191.1,-0.016,-353.067,-0.03,161.967,-0.021,42.833,-0.036,43.267,-154.8,-69.9,13.8,-25.833,117.767,-80.967,-594.567
2020-21,Dallas Mavericks,-1.9,3.1,0.7,1.0,0.207,0.011,2.0,-79.1,0.004,249.933,-0.004,-329.033,0.023,-47.167,-0.001,-50.733,-18.8,-138.9,-95.2,-39.833,-127.233,7.033,25.433
2020-21,Denver Nuggets,-2.1,4.8,-0.2,2.0,0.007,0.011,5.0,55.9,0.019,-32.067,0.011,87.967,0.021,-165.167,0.024,50.267,-39.8,147.1,36.8,-27.833,-24.233,-14.967,213.433
2020-21,Detroit Pistons,-1.3,-4.3,0.2,0.0,-1.593,0.011,0.0,-204.1,-0.014,-124.067,-0.015,-80.033,-0.016,111.833,-0.02,-13.733,-100.8,-42.9,-14.2,20.167,78.767,88.033,-394.567
2020-21,Golden State Warriors,3.0,-1.2,-2.2,0.0,0.607,-0.072,-10.0,-19.1,0.002,294.933,0.01,-314.033,0.01,-51.167,0.006,-133.733,42.2,205.1,41.8,-8.833,83.767,137.033,116.433


In [62]:
ok = this_year_rel.drop(["Team", "Playoffs"], axis=1)
bet = clf.predict(ok)

this_year['Prediction'] = bet
this_year

Unnamed: 0,Team,Season,Rel Pace,Rel ORtg,Rel DRtg,Playoffs,Age,Ht.,Wt.,FGA,FG%,3PA,3P%,2PA,2P%,FTA,FT%,ORB,DRB,AST,STL,BLK,TOV,PF,PTS,Prediction
0,Atlanta Hawks,2020-21,-1.6,3.4,1.0,2.0,25.4,6.5,214,6281,0.468,2402,0.373,3879,0.526,1745,0.812,760,2525,1737,503,342,953,1392,8186,1
1,Brooklyn Nets,2020-21,0.3,6.0,1.5,2.0,28.2,6.5,219,6289,0.494,2600,0.392,3689,0.565,1623,0.804,640,2559,1929,484,379,975,1371,8537,1
2,Boston Celtics,2020-21,-0.9,1.7,0.2,1.0,25.1,6.417,220,6401,0.466,2618,0.374,3783,0.53,1496,0.775,765,2421,1689,556,383,1012,1471,8109,1
3,Charlotte Hornets,2020-21,-0.9,-1.4,0.5,0.0,24.6,6.417,213,6324,0.455,2666,0.369,3658,0.517,1505,0.761,762,2389,1933,565,344,1069,1298,7881,1
4,Chicago Bulls,2020-21,-0.2,-1.2,-0.3,0.0,25.6,6.583,218,6380,0.476,2446,0.37,3934,0.542,1258,0.791,693,2544,1927,482,304,1089,1362,7969,1
5,Cleveland Cavaliers,2020-21,-1.9,-6.5,2.1,0.0,24.0,6.5,223,6175,0.45,2141,0.336,4034,0.51,1614,0.743,751,2327,1716,559,325,1114,1308,7476,1
6,Dallas Mavericks,2020-21,-1.9,3.1,0.7,1.0,26.3,6.5,219,6287,0.47,2744,0.362,3543,0.554,1524,0.778,657,2463,1647,450,311,869,1396,8096,1
7,Denver Nuggets,2020-21,-2.1,4.8,-0.2,2.0,26.1,6.5,222,6422,0.485,2462,0.377,3960,0.552,1406,0.803,758,2442,1933,582,323,972,1374,8284,1
8,Detroit Pistons,2020-21,-1.3,-4.3,0.2,0.0,24.5,6.5,217,6162,0.452,2370,0.351,3792,0.515,1683,0.759,694,2381,1743,531,371,1075,1477,7676,1
9,Golden State Warriors,2020-21,3.0,-1.2,-2.2,0.0,26.7,6.417,207,6347,0.468,2789,0.376,3558,0.541,1520,0.785,574,2524,1991,587,342,1080,1526,8187,1


In [None]:
# # Create the GridSearchCV model
# from sklearn.model_selection import GridSearchCV

# svc = SVC()

# hyper = {'kernel': ['linear', 'rbf'], 
#          'C': [1, 10]}

# grid = GridSearchCV(svc, hyper, cv = 3, verbose = 1, n_jobs = -1)

# # Train the model with GridSearch
# fitted = grid.fit(X_train_scaled, y_train)

# print(fitted.best_params_)
# print(fitted.best_score_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


{'C': 1, 'kernel': 'linear'}
0.6235011990407674


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    1.7s finished
  y = column_or_1d(y, warn=True)


In [None]:
# # Scale your data

# from sklearn.preprocessing import StandardScaler

# X_scaler = StandardScaler().fit(X_train)

# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

In [None]:
# # train random forest classifier
# from sklearn.ensemble import RandomForestClassifier

# rf = RandomForestClassifier(n_estimators=10, max_depth=7, random_state=42)
# rf = rf.fit(X_train, y_train)
# print(rf.score(X_test, y_test))


0.37410071942446044


In [None]:
# # weighted importance of each stat towards winning a basketball game
# sorted(zip(rf.feature_importances_, X.columns), reverse=True)

[(0.22519342369399933, 'Rel DRtg'),
 (0.13991419395076107, 'Rel ORtg'),
 (0.07044155547058217, '2P%'),
 (0.06283118141791252, 'FG%'),
 (0.04968657767911596, 'Age'),
 (0.037998327702769355, 'PTS'),
 (0.03723811458624869, 'TOV'),
 (0.03287534775270163, 'BLK'),
 (0.030751116533333362, 'DRB'),
 (0.027849049420803747, '2PA'),
 (0.027563467493102088, 'AST'),
 (0.02727288300368665, 'STL'),
 (0.02687601837558587, 'PF'),
 (0.024846539757171132, 'Rel Pace'),
 (0.02450003764836123, '3PA'),
 (0.02402596403965316, 'ORB'),
 (0.023450504039110566, 'FGA'),
 (0.023329828375511435, 'Ht.'),
 (0.021614804132599193, 'FT%'),
 (0.02154755126868887, 'Wt.'),
 (0.020589592569737584, 'FTA'),
 (0.019603921088564326, '3P%')]

In [None]:
# # create array to hold binary model predictions
# predictions = []
# nope = []

# # convert hot encoded preditions back to binary values
# for pred in rf.predict(X):
#   if pred[0] == 1.:
#     predictions.append(0)
#   elif pred[1] == 1.:
#     predictions.append(1)
#   elif pred[2] == 1.:
#     predictions.append(2)
#   elif pred[3] == 1.:
#     predictions.append(3)
#   elif pred[4] == 1.:
#     predictions.append(4)
#   elif pred[5] == 1.:
#     predictions.append(5)
#   else: nope.append(pred)


# nope

# # # create new dataframe to hold stats with predictions
# # champs = all_rel_stats
# # champs["Predicted"] = predictions

# # # show dataframe with all stats and championship predictions
# # champs

In [None]:
# # create lists to hold overachieving and disappointing teams
# overachievers = []
# disappointments = []

# # iterate through all seasons for all teams
# for i in range(0, len(champs["Season"])):

#   # if model said a team wouldnt win the chip but they do then add them to overachievers
#   if (champs["Chip?"].values[i] == 1) and (champs["Predicted"].values[i] == 0):
#     overachievers.append(f'{champs["Season"].values[i]} {champs["Team"].values[i]}')

#   # if model said a team would win the chip but the don't then add them to disappointments
#   elif (champs["Chip?"].values[i] == 0) and (champs["Predicted"].values[i] == 1):
#     disappointments.append(f'{champs["Season"].values[i]} {champs["Team"].values[i]}')

# # show both lists
# print(overachievers)
# print(disappointments)


['1975-76 New York Nets', '2007-08 Boston Celtics', '1975-76 Boston Celtics', '2015-16 Cleveland Cavaliers', '2010-11 Dallas Mavericks', '1988-89 Detroit Pistons', '2016-17 Golden State Warriors', '1987-88 Los Angeles Lakers', '1972-73 New York Knicks', '1978-79 Seattle SuperSonics', '2004-05 San Antonio Spurs', '1998-99 San Antonio Spurs', '2018-19 Toronto Raptors', '1977-78 Washington Bullets']
['1971-72 Milwaukee Bucks']


In [None]:
# # tune hyperparameters
# from sklearn.model_selection import GridSearchCV

# # create new model whose hyperparamaters are to be tuned
# forest = RandomForestClassifier(random_state=42)

# # store some tuning options in a dictionary
# hyper = {'n_estimators': [10, 25, 50, 100, 200], 
#          'max_depth': [3, 5, 8, 15],
#          'max_features': ['auto', 'sqrt', 'log2']}

# # create GridSearch model
# grid = GridSearchCV(forest, hyper, cv = 3, verbose = 1, n_jobs = -1)

# # Train the model with GridSearch
# fitted = grid.fit(X_train, y_train)

# # display best parameters and the score they get
# print(fitted.best_params_)
# print(fitted.best_score_)


{'max_depth': 8, 'max_features': 'auto', 'n_estimators': 10}
0.4412470023980815


In [None]:
# best_forest = RandomForestClassifier(max_depth=3, n_estimators=10, random_state=42)
# bf = best_forest.fit(X_train_scaled, y_train)
# # **(why is score different than the "fitted" score in the cell above)
# print(bf.score(X_test_scaled, y_test))


In [None]:
#import pandas as pd
#import numpy as np
#import matplotlib.pyplot as plt

#from matplotlib import style
#style.use("ggplot")
#from matplotlib import rcParams
#rcParams['figure.figsize'] = 10, 8

In [None]:
#df = pd.dataFrame(os.path.join("..", "Chip?", "Season", "Team"))
#df.head()

In [None]:
#target = df["Chip?"]
#target_names = ["Season", "team"]

In [None]:
#data = df.drop("Chip?", axis=1)
#feature_names = data.columns
#data.head()

In [None]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
#from sklearn.svm import SVC 
#model = SVC(kernel='linear')
#model.fit(X_train, y_train)

In [None]:
#print('Test Acc: %.3f' % model.score(X_test, y_test))

In [None]:
#from sklearn.metrics import classification_report
#predictions = model.predict(X_test)
#print(classification_report(y_test, predictions,
                            #target_names=target_names))

**ETL**

In [None]:
import os
# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-3.1.1'

os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:7 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:10 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [61.8 kB]
Get:11 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:13 htt

In [None]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2021-06-10 00:15:45--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2021-06-10 00:15:45 (4.33 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [None]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NbaChamps").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

In [None]:
all_stats.dtypes

Team         object
Season       object
Rel Pace    float64
Rel ORtg    float64
Rel DRtg    float64
Chip?        object
dtype: object

In [None]:
# Rename Columns for postgres consumption
all_stats_copy = all_stats.copy()
all_stats_py = all_stats_copy.rename(columns={"Rel Pace": "Rel_Pace", "Rel ORtg": "Rel_ORtg", "Rel DRtg": "Rel_DRtg", "Chip?": "Chip"})

# Convert Pandas df to Pyspark df
all_stats_py = spark.createDataFrame(all_stats_py)
all_stats_py.printSchema()
# all_stats_py

root
 |-- Team: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Rel_Pace: double (nullable = true)
 |-- Rel_ORtg: double (nullable = true)
 |-- Rel_DRtg: double (nullable = true)
 |-- Chip: long (nullable = true)



In [None]:
# Configure settings for RDS
mode = "append"
jdbc_url="jdbc:postgresql://nba-champs.c6ka6apltccn.us-east-2.rds.amazonaws.com:5432/nbaChamps"
config = {"user":"postgres",
          "password": "MildredChase84!",
          "driver":"org.postgresql.Driver"}

In [None]:
# Write DataFrame to Nba champs table in RDS

all_stats_py.write.jdbc(url=jdbc_url, table='all_stats', mode=mode, properties=config)