<a href="https://colab.research.google.com/github/d-jenkins/NBA_Champs/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [53]:
# import all dependencies needed
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import requests
import json
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


In [54]:
# list holding abbviations for all nba teams to create urls
teams = ['ATL', 'NJN', 'BOS', 'CHA', 'CHI', 
         'CLE', 'DAL', 'DEN', 'DET', 'GSW', 
         'HOU', 'IND', 'LAC', 'LAL', 'MEM', 
         'MIA', 'MIL', 'MIN', 'NOH', 'NYK', 
         'OKC', 'ORL', 'PHI', 'PHO', 'POR', 
         'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

# array to hold all tables for all teams for all seasons
every_season = []




# iterate through list of all team abbreviations
for team in teams:

  # create url to scrape for team
  url = f'https://www.basketball-reference.com/teams/{team}'

  # scrape team's stats from their bball reference page
  stats = pd.read_html(url)[0]

  # select only the desired columns 
  stats = stats[["Team", "Season", "Rel Pace", "Rel ORtg", "Rel DRtg", "Playoffs"]]

  # numbers that correspond to playoff results
  playoffs = {np.nan: 0, 
              'Lost E. Conf. 1st Rnd.': 1,
              'Lost W. Conf. 1st Rnd.': 1,
              'Won E. Conf. 1st Rnd.' : 2,
              'Won W. Conf. 1st Rnd.' : 2,
              'Lost Quarterfinals': 2,
              'Lost E. Conf. Semis': 2,
              'Lost E. Div. Semis': 2,
              'Lost W. Conf. Semis': 2,  
              'Lost W. Div. Semis': 2, 
              'Lost E. Conf. Finals': 3,
              'Lost E. Div. Finals': 3,
              'Lost W. Conf. Finals': 3,
              'Lost W. Div. Finals': 3, 
              'Lost Finals': 4, 
              'Won Finals': 5}

  # covert playoff results to numerical values
  stats["Playoffs"] = stats["Playoffs"].map(playoffs)

  # select only seasons since 80s except for 2020-21 and reset index
  stats = stats.iloc[1:42, :].reset_index(drop=True)

  # remove asterisk from team name
  stats["Team"] = stats["Team"].apply(lambda t: t.replace("*", ""))




  # create url to scrape other table for team
  url = f'https://www.basketball-reference.com/teams/{team}/stats_basic_totals.html'

  # desired statistical categories
  categories = ['Age', 'Ht.', 'Wt.', 'FGA', 
                'FG%', '3PA', '3P%', '2PA', 
                '2P%', 'FTA', 'FT%', 'ORB', 
                'DRB', 'AST', 'STL', 'BLK', 
                'TOV', 'PF', 'PTS']
  
  # scrape more of team's stats from their bball reference page
  more_stats = pd.read_html(url)[0][categories]

  # select only seasons since 80s except for 2020-21
  more_stats = more_stats.iloc[1:43, :]

  # remove dumb rows that restate stat categories and set index
  more_stats = more_stats.loc[more_stats['PTS'] != 'PTS', :].reset_index(drop=True)

  # convert heights to numerical values
  more_stats['Ht.'] = more_stats['Ht.'].apply(lambda h: round(int(h.replace('6-', ''))/12 + 6, 3))




  # merge both stat dataframes
  stats = pd.merge(stats, more_stats, left_index=True, right_index=True)

  # add table of team's stats to a list of tables holding all teams' stats
  every_season.append(stats)




# combine all teams stats into one dataframe
all_stats = pd.concat(every_season).reset_index(drop=True)



In [55]:
# create list to hold all seasons
seasons = all_stats["Season"].unique()

# create new data frame to hold all stats relative the the average of that season
all_rel_stats = all_stats.copy()
all_rel_stats.set_index('Season', inplace=True)

# calculate relative values for all stats
for season in seasons:
  for category in categories:
    avg = round(pd.to_numeric(all_stats.loc[all_stats["Season"] == season, :][category]).mean(), 3)
    all_rel_stats.loc[season, category] = all_rel_stats.loc[season, category].apply(lambda x: float(x)- avg)
    # print(f'{season} {category}: {avg}')

# show resulting dataframe
all_rel_stats


Unnamed: 0_level_0,Team,Rel Pace,Rel ORtg,Rel DRtg,Playoffs,Age,Ht.,Wt.,FGA,FG%,3PA,3P%,2PA,2P%,FTA,FT%,ORB,DRB,AST,STL,BLK,TOV,PF,PTS
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2019-20,Atlanta Hawks,2.7,-3.4,4.2,0.0,-1.92,0.017,-1.733,-203.533,-0.011,7.6,-0.025,-211.133,0.001,-65.433,0.018,-50.333,-216.9,-116.633,-17,-4.933,59.267,81.2,-404.933
2018-19,Atlanta Hawks,3.9,-2.3,3.5,0.0,-1.177,0.036,-2.733,208.733,-0.009,409.267,-0.003,-200.533,-0.002,26.067,-0.015,106.533,-30.1,101.9,49.033,12.833,242.2,217.833,174.9
2017-18,Atlanta Hawks,1.0,-3.6,2.0,0.0,-1.047,-0.041,-6.2,-41.967,-0.014,166,-0.002,-207.967,-0.016,-123.5,0.018,-53.367,-79.033,40.567,5.233,-46.867,106.267,-21.9,-244.333
2016-17,Atlanta Hawks,1.0,-3.9,-3.1,1.0,1.313,-0.044,-0.8,-85.833,-0.006,-77.067,-0.016,-8.767,-0.003,143.833,-0.044,10.767,56.033,82.667,40.333,8.033,150,-140.667,-199.433
2015-16,Atlanta Hawks,1.3,-1.3,-5.0,2.0,1.563,-0.044,-3.333,-11.967,0.006,351.3,-0.003,-363.267,0.02,-277.633,0.025,-175.133,37.967,272.267,103.567,79.567,46.533,-91.8,13.933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1984-85,Washington Bullets,-1.8,-3.6,-3.3,1.0,0.652,-0.029,3.043,76.565,-0.012,140.739,0,-64.174,-0.009,-418.87,-0.02,-161.87,3.304,-64.826,8.043,-42.609,-182.957,-176.043,-433.739
1983-84,Washington Bullets,-4.0,-3.4,-0.4,1.0,-0.183,-0.029,9.783,-338.13,-0.008,87.043,0.01,-425.174,-0.005,-233.435,-0.003,-137,22.739,44.304,-141.435,-114.957,-19.609,-122.87,-606.043
1982-83,Washington Bullets,-3.7,-5.6,-5.4,0.0,-0.3,-0.022,10.043,-293.087,-0.017,52.304,0.064,-345.391,-0.018,-259.957,-0.034,-117.087,0.304,-78.435,3.522,-59.391,21.13,-145.174,-764.174
1981-82,Washington Bullets,-1.5,-3.6,-4.4,2.0,-0.161,-0.011,7.304,-67.565,-0.017,48.696,-0.003,-116.261,-0.015,-238,0.027,-128.609,193.957,-79.522,-57.304,-43.478,-63.696,-73.739,-418.304


In [56]:
# select/adjust data to train ml model

# uncomment below to use only data for playoff teams
rel_playoffs = all_rel_stats.loc[all_rel_stats['Playoffs'] > 0, :]
X = rel_playoffs.drop(["Team", "Playoffs"], axis=1)
y = rel_playoffs["Playoffs"].values

# uncomment to use all data
# X = all_rel_stats.drop(["Team", "Playoffs"], axis=1)
# y = all_rel_stats["Playoffs"].values.reshape(-1, 1)


# uncomment to one hot encode data
# from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras.utils import to_categorical
# label-encode y data
# label_encoder = LabelEncoder()
# label_encoder.fit(y)
# encoded_y = label_encoder.transform(y)
# # One-hot encode y data
# y = to_categorical(encoded_y)


In [57]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# train model
clf = SVC()
clf.fit(X_train, y_train)

# test model
this = clf.predict(X)
clf.score(X_test, y_test)


0.4585987261146497

In [58]:
# create dataframe to only hold data for playoff teams
champs = all_rel_stats.copy().loc[all_rel_stats['Playoffs'] > 0, :]

# add column to hold playoff success predicted by model
champs['Predicted'] = this

# add column with how wrong the models prediction was
champs['Error'] = champs['Playoffs'].values - champs['Predicted'].values

# rename columns to not have spaces
champs = champs.rename(columns={"Rel Pace":"Rel_Pace", "Rel ORtg":"Rel_ORtg",  "Rel DRtg":"Rel_DRtg"})

# reset index back to ascending numbers
champs.reset_index(inplace=True)

# adjust columns to hold desired data types
for column in champs.columns:
  if column in categories:
    champs[column] = pd.to_numeric(champs[column])
  elif (column == 'Playoffs') or (column == 'Predicted') or (column == 'Error'):
    champs[column] = champs[column].astype(int)

# output csv for data use while flask app is being developed
champs.to_csv('champs.csv')

# show resulting dataframe
champs


Unnamed: 0,Season,Team,Rel_Pace,Rel_ORtg,Rel_DRtg,Playoffs,Age,Ht.,Wt.,FGA,FG%,3PA,3P%,2PA,2P%,FTA,FT%,ORB,DRB,AST,STL,BLK,TOV,PF,PTS,Predicted,Error
0,2016-17,Atlanta Hawks,1.0,-3.9,-3.1,1,1.313,-0.044,-0.800,-85.833,-0.006,-77.067,-0.016,-8.767,-0.003,143.833,-0.044,10.767,56.033,82.667,40.333,8.033,150.000,-140.667,-199.433,1,0
1,2015-16,Atlanta Hawks,1.3,-1.3,-5.0,2,1.563,-0.044,-3.333,-11.967,0.006,351.300,-0.003,-363.267,0.020,-277.633,0.025,-175.133,37.967,272.267,103.567,79.567,46.533,-91.800,13.933,1,1
2,2014-15,Atlanta Hawks,0.0,3.3,-2.5,3,1.053,-0.044,0.367,-153.333,0.017,314.100,0.031,-467.433,0.020,-138.267,0.027,-177.700,-46.433,304.267,109.633,-13.233,-10.067,-200.600,207.833,1,2
3,2013-14,Atlanta Hawks,0.7,-0.8,-0.3,1,1.060,-0.039,2.567,-117.733,0.003,350.200,0.004,-467.933,0.013,-152.300,0.024,-181.867,-45.500,236.633,50.167,-60.467,49.867,-120.433,-0.733,1,0
4,2012-13,Atlanta Hawks,0.6,-1.1,-1.5,1,0.440,-0.033,-3.067,-76.300,0.011,265.433,0.013,-341.733,0.017,-198.767,-0.038,-157.200,55.700,193.167,24.733,-51.733,26.767,-152.833,-8.767,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,1986-87,Washington Bullets,-0.6,-2.6,-1.3,1,-0.700,-0.061,-3.957,116.348,-0.026,-169.522,-0.093,285.870,-0.030,33.217,0.002,99.565,-89.130,-380.348,47.870,232.348,-92.043,-237.043,-324.696,1,0
624,1985-86,Washington Bullets,-3.0,-4.2,-2.4,1,0.330,0.032,-2.739,-120.087,-0.024,134.391,0.016,-254.478,-0.021,-195.696,-0.011,-93.174,18.957,-385.261,-94.696,285.130,-116.826,-271.087,-596.043,1,0
625,1984-85,Washington Bullets,-1.8,-3.6,-3.3,1,0.652,-0.029,3.043,76.565,-0.012,140.739,0.000,-64.174,-0.009,-418.870,-0.020,-161.870,3.304,-64.826,8.043,-42.609,-182.957,-176.043,-433.739,1,0
626,1983-84,Washington Bullets,-4.0,-3.4,-0.4,1,-0.183,-0.029,9.783,-338.130,-0.008,87.043,0.010,-425.174,-0.005,-233.435,-0.003,-137.000,22.739,44.304,-141.435,-114.957,-19.609,-122.870,-606.043,1,0


In [59]:
# most overachieving teams
champs.sort_values('Error', ascending=False)[['Season', 'Team', 'Predicted', 'Playoffs', 'Error']].head(20)


Unnamed: 0,Season,Team,Predicted,Playoffs,Error
573,2018-19,Toronto Raptors,1,5,4
74,1980-81,Boston Celtics,1,5,4
291,2000-01,Los Angeles Lakers,1,5,4
290,2001-02,Los Angeles Lakers,1,5,4
284,2008-09,Los Angeles Lakers,1,5,4
283,2009-10,Los Angeles Lakers,1,5,4
182,2003-04,Detroit Pistons,1,5,4
325,2012-13,Miami Heat,1,5,4
326,2011-12,Miami Heat,1,5,4
135,2010-11,Dallas Mavericks,1,5,4


In [61]:
# most disappointing teams
champs.sort_values('Error')[['Season', 'Team', 'Predicted', 'Playoffs', 'Error']].head(20)


Unnamed: 0,Season,Team,Predicted,Playoffs,Error
470,2007-08,Phoenix Suns,5,1,-4
305,1985-86,Los Angeles Lakers,5,3,-2
308,1982-83,Los Angeles Lakers,5,4,-1
307,1983-84,Los Angeles Lakers,5,4,-1
543,2012-13,San Antonio Spurs,5,4,-1
198,2018-19,Golden State Warriors,5,4,-1
254,2000-01,Indiana Pacers,1,1,0
253,2001-02,Indiana Pacers,1,1,0
252,2002-03,Indiana Pacers,1,1,0
477,1998-99,Phoenix Suns,1,1,0


In [62]:
# Correctly predicted
champs.loc[champs['Error'] == 0, :].sort_values('Playoffs', ascending=False)[['Team', 'Predicted', 'Playoffs', 'Error']].head(20)


Unnamed: 0,Team,Predicted,Playoffs,Error
97,Chicago Bulls,5,5,0
199,Golden State Warriors,5,5,0
200,Golden State Warriors,5,5,0
304,Los Angeles Lakers,5,5,0
306,Los Angeles Lakers,5,5,0
69,Boston Celtics,5,5,0
542,San Antonio Spurs,5,5,0
343,Milwaukee Bucks,3,3,0
67,Boston Celtics,3,3,0
481,Phoenix Suns,2,2,0


In [18]:
# list holding abbviations for all nba teams to create urls
teams = ['ATL', 'NJN', 'BOS', 'CHA', 'CHI', 
         'CLE', 'DAL', 'DEN', 'DET', 'GSW', 
         'HOU', 'IND', 'LAC', 'LAL', 'MEM', 
         'MIA', 'MIL', 'MIN', 'NOH', 'NYK', 
         'OKC', 'ORL', 'PHI', 'PHO', 'POR', 
         'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

# array to hold all tables for all teams for all seasons
every_season = []

# iterate through list of all team abbreviations
for team in teams:

  # create url to scrape for team
  url = f'https://www.basketball-reference.com/teams/{team}'

  # scrape team's stats from their bball reference page
  stats = pd.read_html(url)[0]

  # select only the desired columns 
  stats = stats[["Team", "Season", "Rel Pace", "Rel ORtg", "Rel DRtg", "Playoffs"]]

  # numbers that correspond to playoff results
  playoffs = {np.nan: 0, 
              'Lost E. Conf. 1st Rnd.': 1,
              'Lost W. Conf. 1st Rnd.': 1,
              'Won E. Conf. 1st Rnd.' : 2,
              'Won W. Conf. 1st Rnd.' : 2,
              'Lost Quarterfinals': 2,
              'Lost E. Conf. Semis': 2,
              'Lost E. Div. Semis': 2,
              'Lost W. Conf. Semis': 2,  
              'Lost W. Div. Semis': 2, 
              'Lost E. Conf. Finals': 3,
              'Lost E. Div. Finals': 3,
              'Lost W. Conf. Finals': 3,
              'Lost W. Div. Finals': 3, 
              'Lost Finals': 4, 
              'Won Finals': 5}

  # covert playoff results to numerical values
  stats["Playoffs"] = stats["Playoffs"].map(playoffs)

  # select only seasons since 80s except for 2020-21 and reset index
  stats = stats.iloc[0:1, :].reset_index(drop=True)

  # remove asterisk from team name
  def rename(team): return(team.replace("*", ""))
  stats["Team"] = stats["Team"].apply(rename)



  # create url to scrape other table for team
  url = f'https://www.basketball-reference.com/teams/{team}/stats_basic_totals.html'

  # desired statistical categories
  categories = ['Age', 'Ht.', 'Wt.', 'FGA', 
                'FG%', '3PA', '3P%', '2PA', 
                '2P%', 'FTA', 'FT%', 'ORB', 
                'DRB', 'AST', 'STL', 'BLK', 
                'TOV', 'PF', 'PTS']
  
  # scrape more of team's stats from their bball reference page
  more_stats = pd.read_html(url)[0][categories]

  # select only seasons since 80s except for 2020-21
  more_stats = more_stats.iloc[0:1, :]

  # remove dumb rows that restate stat categories and set index
  more_stats = more_stats.loc[more_stats['PTS'] != 'PTS', :].reset_index(drop=True)

  # convert heights to numerical values
  more_stats['Ht.'] = more_stats['Ht.'].apply(lambda h: round(int(h.replace('6-', ''))/12 + 6, 3))



  # merge both stat dataframes
  stats = pd.merge(stats, more_stats, left_index=True, right_index=True)

  # add table of team's stats to a list of tables holding all teams' stats
  every_season.append(stats)



# combine all teams stats into one dataframe
this_year = pd.concat(every_season).reset_index(drop=True)




# create list to hold all seasons
seasons = this_year["Season"].unique()

# create new data frame to hold all stats relative the the average of that season
this_year_rel = this_year.copy()
this_year_rel.set_index('Season', inplace=True)

# calculate relative values for all stats

for category in categories:
  avg = round(pd.to_numeric(this_year.loc[this_year["Season"] == '2020-21', :][category]).mean(), 3)
  this_year_rel.loc['2020-21', category] = this_year_rel.loc['2020-21', category].apply(lambda x: float(x)- avg)
  # print(f'{season} {category}: {avg}')

# show resulting dataframe
this_year_rel



Unnamed: 0_level_0,Team,Rel Pace,Rel ORtg,Rel DRtg,Playoffs,Age,Ht.,Wt.,FGA,FG%,3PA,3P%,2PA,2P%,FTA,FT%,ORB,DRB,AST,STL,BLK,TOV,PF,PTS
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2020-21,Atlanta Hawks,-1.6,3.4,1.0,2.0,-0.693,0.011,-3.0,-85.1,0.002,-92.067,0.007,6.967,-0.005,173.833,0.033,52.267,43.2,-48.9,-42.2,-8.833,-43.233,3.033,115.433
2020-21,Brooklyn Nets,0.3,6.0,1.5,2.0,2.107,0.011,2.0,-77.1,0.028,105.933,0.026,-183.033,0.034,51.833,0.025,-67.733,77.2,143.1,-61.2,28.167,-21.233,-17.967,466.433
2020-21,Boston Celtics,-0.9,1.7,0.2,1.0,-0.993,-0.072,3.0,34.9,0.0,123.933,0.008,-89.033,-0.001,-75.167,-0.004,57.267,-60.8,-96.9,10.8,32.167,15.767,82.033,38.433
2020-21,Charlotte Hornets,-0.9,-1.4,0.5,0.0,-1.493,-0.072,-4.0,-42.1,-0.011,171.933,0.003,-214.033,-0.014,-66.167,-0.018,54.267,-92.8,147.1,19.8,-6.833,72.767,-90.967,-189.567
2020-21,Chicago Bulls,-0.2,-1.2,-0.3,0.0,-0.493,0.094,1.0,13.9,0.01,-48.067,0.004,61.967,0.011,-313.167,0.012,-14.733,62.2,141.1,-63.2,-46.833,92.767,-26.967,-101.567
2020-21,Cleveland Cavaliers,-1.9,-6.5,2.1,0.0,-2.093,0.011,6.0,-191.1,-0.016,-353.067,-0.03,161.967,-0.021,42.833,-0.036,43.267,-154.8,-69.9,13.8,-25.833,117.767,-80.967,-594.567
2020-21,Dallas Mavericks,-1.9,3.1,0.7,1.0,0.207,0.011,2.0,-79.1,0.004,249.933,-0.004,-329.033,0.023,-47.167,-0.001,-50.733,-18.8,-138.9,-95.2,-39.833,-127.233,7.033,25.433
2020-21,Denver Nuggets,-2.1,4.8,-0.2,2.0,0.007,0.011,5.0,55.9,0.019,-32.067,0.011,87.967,0.021,-165.167,0.024,50.267,-39.8,147.1,36.8,-27.833,-24.233,-14.967,213.433
2020-21,Detroit Pistons,-1.3,-4.3,0.2,0.0,-1.593,0.011,0.0,-204.1,-0.014,-124.067,-0.015,-80.033,-0.016,111.833,-0.02,-13.733,-100.8,-42.9,-14.2,20.167,78.767,88.033,-394.567
2020-21,Golden State Warriors,3.0,-1.2,-2.2,0.0,0.607,-0.072,-10.0,-19.1,0.002,294.933,0.01,-314.033,0.01,-51.167,0.006,-133.733,42.2,205.1,41.8,-8.833,83.767,137.033,116.433


In [24]:
this_playoffs = this_year_rel.loc[this_year_rel['Playoffs'] > 0, :]

ok = this_playoffs.drop(["Team", "Playoffs"], axis=1)

bet = clf.predict(ok)


this_playoffs['Prediction'] = bet
this_playoffs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Team,Rel Pace,Rel ORtg,Rel DRtg,Playoffs,Age,Ht.,Wt.,FGA,FG%,3PA,3P%,2PA,2P%,FTA,FT%,ORB,DRB,AST,STL,BLK,TOV,PF,PTS,Prediction
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
2020-21,Atlanta Hawks,-1.6,3.4,1.0,2.0,-0.693,0.011,-3.0,-85.1,0.002,-92.067,0.007,6.967,-0.005,173.833,0.033,52.267,43.2,-48.9,-42.2,-8.833,-43.233,3.033,115.433,1.0
2020-21,Brooklyn Nets,0.3,6.0,1.5,2.0,2.107,0.011,2.0,-77.1,0.028,105.933,0.026,-183.033,0.034,51.833,0.025,-67.733,77.2,143.1,-61.2,28.167,-21.233,-17.967,466.433,1.0
2020-21,Boston Celtics,-0.9,1.7,0.2,1.0,-0.993,-0.072,3.0,34.9,0.0,123.933,0.008,-89.033,-0.001,-75.167,-0.004,57.267,-60.8,-96.9,10.8,32.167,15.767,82.033,38.433,1.0
2020-21,Dallas Mavericks,-1.9,3.1,0.7,1.0,0.207,0.011,2.0,-79.1,0.004,249.933,-0.004,-329.033,0.023,-47.167,-0.001,-50.733,-18.8,-138.9,-95.2,-39.833,-127.233,7.033,25.433,1.0
2020-21,Denver Nuggets,-2.1,4.8,-0.2,2.0,0.007,0.011,5.0,55.9,0.019,-32.067,0.011,87.967,0.021,-165.167,0.024,50.267,-39.8,147.1,36.8,-27.833,-24.233,-14.967,213.433,1.0
2020-21,Los Angeles Clippers,-2.3,5.3,-1.1,2.0,2.707,0.011,0.0,-124.1,0.016,3.933,0.045,-128.033,-0.002,-184.167,0.06,-29.733,19.2,-29.9,-36.2,-55.833,-46.233,-5.967,138.433,1.0
2020-21,Los Angeles Lakers,-0.5,-2.4,-5.2,1.0,2.107,0.011,6.0,-169.1,0.006,-246.067,-0.012,76.967,0.008,107.833,-0.04,-12.733,8.2,-10.9,16.8,35.167,98.767,-11.967,-183.567,1.0
2020-21,Memphis Grizzlies,1.2,-0.3,-1.3,1.0,-1.893,0.011,0.0,241.9,0.001,-236.067,-0.01,477.967,-0.006,-35.167,-0.008,95.267,61.2,152.1,109.8,13.167,-39.233,-39.967,86.433,1.0
2020-21,Miami Heat,-2.6,-1.1,-1.1,1.0,1.307,0.011,0.0,-337.1,0.002,111.933,-0.008,-449.033,0.022,-51.167,0.011,-128.733,-72.8,109.1,23.8,-64.833,16.767,-25.967,-289.567,1.0
2020-21,Milwaukee Bucks,3.0,4.9,-0.9,2.0,2.007,0.011,8.0,243.9,0.021,174.933,0.023,68.967,0.023,-32.167,-0.019,33.267,242.2,48.1,39.8,-16.833,-1.233,-144.967,578.433,2.0


In [None]:
# # Create the GridSearchCV model
# from sklearn.model_selection import GridSearchCV

# svc = SVC()

# hyper = {'kernel': ['linear', 'rbf'], 
#          'C': [1, 10]}

# grid = GridSearchCV(svc, hyper, cv = 3, verbose = 1, n_jobs = -1)

# # Train the model with GridSearch
# fitted = grid.fit(X_train_scaled, y_train)

# print(fitted.best_params_)
# print(fitted.best_score_)

In [None]:
# # Scale your data

# from sklearn.preprocessing import StandardScaler

# X_scaler = StandardScaler().fit(X_train)

# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

In [None]:
# # train random forest classifier
# from sklearn.ensemble import RandomForestClassifier

# rf = RandomForestClassifier(n_estimators=10, max_depth=7, random_state=42)
# rf = rf.fit(X_train, y_train)
# print(rf.score(X_test, y_test))


0.2929936305732484


In [None]:
# this = rf.predict(X)

# [np.argmax(x) for x in this]



In [None]:
# # weighted importance of each stat towards winning a basketball game
# sorted(zip(rf.feature_importances_, X.columns), reverse=True)

In [None]:
# # create array to hold binary model predictions
# predictions = []
# nope = []

# # convert hot encoded preditions back to binary values
# for pred in rf.predict(X):
#   if pred[0] == 1.:
#     predictions.append(0)
#   elif pred[1] == 1.:
#     predictions.append(1)
#   elif pred[2] == 1.:
#     predictions.append(2)
#   elif pred[3] == 1.:
#     predictions.append(3)
#   elif pred[4] == 1.:
#     predictions.append(4)
#   # elif pred[5] == 1.:
#   #   predictions.append(5)
#   else: nope.append(pred)


# nope

# # # create new dataframe to hold stats with predictions
# # champs = all_rel_stats
# # champs["Predicted"] = predictions

# # # show dataframe with all stats and championship predictions
# # champs

[array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0.

In [None]:
# # create lists to hold overachieving and disappointing teams
# overachievers = []
# disappointments = []

# # iterate through all seasons for all teams
# for i in range(0, len(champs["Season"])):

#   # if model said a team wouldnt win the chip but they do then add them to overachievers
#   if (champs["Chip?"].values[i] == 1) and (champs["Predicted"].values[i] == 0):
#     overachievers.append(f'{champs["Season"].values[i]} {champs["Team"].values[i]}')

#   # if model said a team would win the chip but the don't then add them to disappointments
#   elif (champs["Chip?"].values[i] == 0) and (champs["Predicted"].values[i] == 1):
#     disappointments.append(f'{champs["Season"].values[i]} {champs["Team"].values[i]}')

# # show both lists
# print(overachievers)
# print(disappointments)


['1975-76 New York Nets', '2007-08 Boston Celtics', '1975-76 Boston Celtics', '2015-16 Cleveland Cavaliers', '2010-11 Dallas Mavericks', '1988-89 Detroit Pistons', '2016-17 Golden State Warriors', '1987-88 Los Angeles Lakers', '1972-73 New York Knicks', '1978-79 Seattle SuperSonics', '2004-05 San Antonio Spurs', '1998-99 San Antonio Spurs', '2018-19 Toronto Raptors', '1977-78 Washington Bullets']
['1971-72 Milwaukee Bucks']


In [None]:
# # tune hyperparameters
# from sklearn.model_selection import GridSearchCV

# # create new model whose hyperparamaters are to be tuned
# forest = RandomForestClassifier(random_state=42)

# # store some tuning options in a dictionary
# hyper = {'n_estimators': [10, 25, 50, 100, 200], 
#          'max_depth': [3, 5, 8, 15],
#          'max_features': ['auto', 'sqrt', 'log2']}

# # create GridSearch model
# grid = GridSearchCV(forest, hyper, cv = 3, verbose = 1, n_jobs = -1)

# # Train the model with GridSearch
# fitted = grid.fit(X_train, y_train)

# # display best parameters and the score they get
# print(fitted.best_params_)
# print(fitted.best_score_)


{'max_depth': 8, 'max_features': 'auto', 'n_estimators': 10}
0.4412470023980815


In [None]:
# best_forest = RandomForestClassifier(max_depth=3, n_estimators=10, random_state=42)
# bf = best_forest.fit(X_train_scaled, y_train)
# # **(why is score different than the "fitted" score in the cell above)
# print(bf.score(X_test_scaled, y_test))


In [None]:
#import pandas as pd
#import numpy as np
#import matplotlib.pyplot as plt

#from matplotlib import style
#style.use("ggplot")
#from matplotlib import rcParams
#rcParams['figure.figsize'] = 10, 8

In [None]:
#df = pd.dataFrame(os.path.join("..", "Chip?", "Season", "Team"))
#df.head()

In [None]:
#target = df["Chip?"]
#target_names = ["Season", "team"]

In [None]:
#data = df.drop("Chip?", axis=1)
#feature_names = data.columns
#data.head()

In [None]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
#from sklearn.svm import SVC 
#model = SVC(kernel='linear')
#model.fit(X_train, y_train)

In [None]:
#print('Test Acc: %.3f' % model.score(X_test, y_test))

In [None]:
#from sklearn.metrics import classification_report
#predictions = model.predict(X_test)
#print(classification_report(y_test, predictions,
                            #target_names=target_names))

**ETL**

In [None]:
import os
# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-3.1.1'

os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (91.189.91.38)] [Co                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
                                                                               Hit:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
                                                                               Hit:4 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (91.189.91.38)] [Co0% [1 InRelease gpgv 15.9 kB] [Waiting for headers] [Connecting to security.ubu                                                                               Hit:5 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
0% [1 InRelease gpgv 15.9 kB] [Connecting to security.ubuntu.com (91.189.91.38

In [None]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2021-06-15 06:18:22--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar.1’


2021-06-15 06:18:24 (1.77 MB/s) - ‘postgresql-42.2.9.jar.1’ saved [914037/914037]



In [None]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NbaChamps").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

In [None]:
champs.dtypes
champs.head()

Unnamed: 0,Season,Team,Rel_Pace,Rel_ORtg,Rel_DRtg,Playoffs,Age,Ht.,Wt.,FGA,FG%,3PA,3P%,2PA,2P%,FTA,FT%,ORB,DRB,AST,STL,BLK,TOV,PF,PTS,Predicted,Error
0,2016-17,Atlanta Hawks,1.0,-3.9,-3.1,1,1.313,-0.044,-0.8,-85.833,-0.006,-77.067,-0.016,-8.767,-0.003,143.833,-0.044,10.767,56.033,82.667,40.333,8.033,150.0,-140.667,-199.433,1,0
1,2015-16,Atlanta Hawks,1.3,-1.3,-5.0,2,1.563,-0.044,-3.333,-11.967,0.006,351.3,-0.003,-363.267,0.02,-277.633,0.025,-175.133,37.967,272.267,103.567,79.567,46.533,-91.8,13.933,1,1
2,2014-15,Atlanta Hawks,0.0,3.3,-2.5,3,1.053,-0.044,0.367,-153.333,0.017,314.1,0.031,-467.433,0.02,-138.267,0.027,-177.7,-46.433,304.267,109.633,-13.233,-10.067,-200.6,207.833,1,2
3,2013-14,Atlanta Hawks,0.7,-0.8,-0.3,1,1.06,-0.039,2.567,-117.733,0.003,350.2,0.004,-467.933,0.013,-152.3,0.024,-181.867,-45.5,236.633,50.167,-60.467,49.867,-120.433,-0.733,1,0
4,2012-13,Atlanta Hawks,0.6,-1.1,-1.5,1,0.44,-0.033,-3.067,-76.3,0.011,265.433,0.013,-341.733,0.017,-198.767,-0.038,-157.2,55.7,193.167,24.733,-51.733,26.767,-152.833,-8.767,1,0


In [None]:
# Convert Object columns to numeric
all_stats_copy = all_rel_stats.copy()
all_stats_copy['Age'] = pd.to_numeric(all_stats_copy['Age'],errors = 'coerce')
all_stats_copy['Ht.'] = pd.to_numeric(all_stats_copy['Ht.'],errors = 'coerce')
all_stats_copy['Wt.'] = pd.to_numeric(all_stats_copy['Wt.'],errors = 'coerce')
all_stats_copy['FGA'] = pd.to_numeric(all_stats_copy['FGA'],errors = 'coerce')
all_stats_copy['FG%'] = pd.to_numeric(all_stats_copy['FG%'],errors = 'coerce')
all_stats_copy['3PA'] = pd.to_numeric(all_stats_copy['3PA'],errors = 'coerce')
all_stats_copy['3P%'] = pd.to_numeric(all_stats_copy['3P%'],errors = 'coerce')
all_stats_copy['2PA'] = pd.to_numeric(all_stats_copy['2PA'],errors = 'coerce')
all_stats_copy['2P%'] = pd.to_numeric(all_stats_copy['2P%'],errors = 'coerce')
all_stats_copy['FTA'] = pd.to_numeric(all_stats_copy['FTA'],errors = 'coerce')
all_stats_copy['FT%'] = pd.to_numeric(all_stats_copy['FT%'],errors = 'coerce')
all_stats_copy['ORB'] = pd.to_numeric(all_stats_copy['ORB'],errors = 'coerce')
all_stats_copy['DRB'] = pd.to_numeric(all_stats_copy['DRB'],errors = 'coerce')
all_stats_copy['AST'] = pd.to_numeric(all_stats_copy['AST'],errors = 'coerce')
all_stats_copy['STL'] = pd.to_numeric(all_stats_copy['STL'],errors = 'coerce')
all_stats_copy['BLK'] = pd.to_numeric(all_stats_copy['BLK'],errors = 'coerce')
all_stats_copy['TOV'] = pd.to_numeric(all_stats_copy['TOV'],errors = 'coerce')
all_stats_copy['PF'] = pd.to_numeric(all_stats_copy['PF'],errors = 'coerce')
all_stats_copy['PTS'] = pd.to_numeric(all_stats_copy['PTS'],errors = 'coerce')

all_stats_copy.dtypes

Team         object
Rel Pace    float64
Rel ORtg    float64
Rel DRtg    float64
Playoffs    float64
Age         float64
Ht.         float64
Wt.         float64
FGA         float64
FG%         float64
3PA         float64
3P%         float64
2PA         float64
2P%         float64
FTA         float64
FT%         float64
ORB         float64
DRB         float64
AST         float64
STL         float64
BLK         float64
TOV         float64
PF          float64
PTS         float64
dtype: object

In [None]:
all_stats_py.head()

NameError: ignored

In [None]:
# Rename Columns for postgres consumption
champs_copy = champs.rename(columns={"Ht.": "Ht", "Wt.": "Wt"})
champs_copy.dtypes
# Convert Pandas df to Pyspark df
champs_py = spark.createDataFrame(champs_copy)
# champs_py.dtypes
# # all_stats_py

Traceback (most recent call last):
  File "/content/spark-3.1.1-bin-hadoop2.7/python/pyspark/serializers.py", line 437, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
  File "/content/spark-3.1.1-bin-hadoop2.7/python/pyspark/cloudpickle/cloudpickle_fast.py", line 101, in dumps
    cp.dump(obj)
  File "/content/spark-3.1.1-bin-hadoop2.7/python/pyspark/cloudpickle/cloudpickle_fast.py", line 540, in dump
    return Pickler.dump(self, obj)
  File "/usr/lib/python3.7/pickle.py", line 437, in dump
    self.save(obj)
  File "/usr/lib/python3.7/pickle.py", line 504, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python3.7/pickle.py", line 789, in save_tuple
    save(element)
  File "/usr/lib/python3.7/pickle.py", line 504, in save
    f(self, obj) # Call unbound method with explicit self
  File "/content/spark-3.1.1-bin-hadoop2.7/python/pyspark/cloudpickle/cloudpickle_fast.py", line 722, in save_function
    *self._dynamic_function_reduce(obj), 

PicklingError: ignored

In [None]:
# Configure settings for RDS
mode = "append"
jdbc_url="jdbc:postgresql://nba-champs.c6ka6apltccn.us-east-2.rds.amazonaws.com:5432/nbaChamps"
config = {"user":"postgres",
          "password": "MildredChase84!",
          "driver":"org.postgresql.Driver"}

In [None]:
# Write DataFrame to Nba champs table in RDS

champs_py.write.jdbc(url=jdbc_url, table='champs', mode=mode, properties=config)